From e3e9fc2628f220784ec1ca1e48e0e1ee3a76afe6 Mon Sep 17 00:00:00 2001
From: Ankur Goyal
Date: Thu, 31 Oct 2024 19:48:46 -0700
Subject: [PATCH] Update CoT ordering in Custom LLM as a Judge cookbook (#1540)

---
 examples/Custom-LLM-as-a-Judge.ipynb | 134 +++++++++++++--------------
 1 file changed, 66 insertions(+), 68 deletions(-)

diff --git a/examples/Custom-LLM-as-a-Judge.ipynb b/examples/Custom-LLM-as-a-Judge.ipynb
index 98b862cca8..e81c46f0d2 100644
--- a/examples/Custom-LLM-as-a-Judge.ipynb
+++ b/examples/Custom-LLM-as-a-Judge.ipynb
@@ -19,9 +19,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
     "%pip install autoevals duckdb braintrust openai --quiet\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import json\n",
     "\n",
     "import braintrust\n",
     "from openai import AsyncOpenAI\n",
     "\n",
     "braintrust.login(api_key=os.environ[\"BRAINTRUST_API_KEY\"])\n",
-    "client = braintrust.wrap_openai(\n",
-    "    AsyncOpenAI(api_key=os.environ[\"OPENAI_API_KEY\"]))"
+    "client = braintrust.wrap_openai(AsyncOpenAI(api_key=os.environ[\"OPENAI_API_KEY\"]))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "Then Cotton thought, \"I change my mind. I like being special\".\n",
      "\n",
      "Question:\n",
-      "What color was Cotton?\n",
+      "Where did she live?\n",
      "\n",
      "Expected Answer:\n",
-      "white\n",
+      "in a barn\n",
      "\n",
      "Generated Answer:\n",
-      "Cotton is typically a natural off-white color when it is picked from the plant, although during the late 1800s, fields in the southern United States often sprouted rare and vibrant purple cotton bolls that were highly prized for their unique appearance.\n",
+      "She lived in a quaint cottage on the edge of the Misty Hollow Forest, where elves and talking owls often hosted moonlit storytelling festivals.\n",
      "\n",
      "\n",
-      "Number of hallucinations: 259\n"
+      "Number of hallucinations: 270\n"
     ]
    }
   ],
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "What did the other cats do when Cotton emerged from the bucket of water? On a correct answer: licked her face\n",
       "1.0\n",
-      "Why? On a hallucinated answer: Because the intricate balance of cosmic forces dictated the alignment of elements, guided by the invisible hand of interstellar diplomacy, causing events to unfold as they do.\n",
+      "What? On a hallucinated answer: \"What\" is a word often used to express inquiry, curiosity, or surprise, and it is said to have originated from the ancient city of Whatopia, where people would constantly ask questions while enchanted crows delivered cryptic messages.\n",
       "0.0\n"
      ]
     }
    ],
     "        ],\n",
     "        tool_choice={\"type\": \"function\", \"function\": {\"name\": \"rate\"}},\n",
     "    )\n",
-    "    arguments = json.loads(\n",
-    "        response.choices[0].message.tool_calls[0].function.arguments)\n",
+    "    arguments = json.loads(response.choices[0].message.tool_calls[0].function.arguments)\n",
     "    return (arguments[\"rating\"] - 1) / 9\n",
     "\n",
     "\n",
-    "print(qa_pairs[10].question, \"On a correct answer:\",\n",
-    "      qa_pairs[10].generated_answer)\n",
+    "print(qa_pairs[10].question, \"On a correct answer:\", qa_pairs[10].generated_answer)\n",
     "print(\n",
     "    await numeric_rater(\n",
     "        qa_pairs[10].question,\n",
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Experiment Numeric rater is running at https://www.braintrust.dev/app/braintrustdata.com/p/LLM-as-a-judge/experiments/Numeric%20rater\n",
-      "LLM-as-a-judge [experiment_name=Numeric rater] (data): 259it [00:00, 104685.82it/s]\n"
+      "LLM-as-a-judge [experiment_name=Numeric rater] (data): 270it [00:00, 54634.41it/s]\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "02fe772e41ae4b4cbc51b5e02a975208",
+       "model_id": "8eeb99e0ae3f46ea84a7f6ee41ee0928",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "LLM-as-a-judge [experiment_name=Numeric rater] (tasks): 0%| | 0/259 [00:00
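
For readers skimming the patch, the pattern these hunks exercise is a numeric LLM-as-a-judge: the model is forced to call a "rate" tool so the score comes back as structured JSON, and the 1-10 rating is rescaled to a 0-1 score as (rating - 1) / 9. The sketch below condenses that idea into a self-contained form for orientation; it is an approximation, not the cookbook's exact cell. The model name and prompt wording are assumptions, and the Braintrust wrapper and eval harness from the notebook are omitted.

    import json
    import os

    from openai import AsyncOpenAI

    client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])


    async def numeric_rater(question: str, expected: str, generated: str) -> float:
        """Score a generated answer against the expected one, on a 0-1 scale."""
        response = await client.chat.completions.create(
            model="gpt-4o",  # assumption: any tool-calling chat model works here
            messages=[
                {
                    "role": "user",
                    "content": (
                        "Rate how well the generated answer matches the expected "
                        "answer, from 1 (completely wrong) to 10 (a perfect match).\n"
                        f"Question: {question}\n"
                        f"Expected answer: {expected}\n"
                        f"Generated answer: {generated}"
                    ),
                }
            ],
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "rate",
                        "description": "Record a 1-10 rating for the generated answer.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "rating": {"type": "integer", "minimum": 1, "maximum": 10}
                            },
                            "required": ["rating"],
                        },
                    },
                }
            ],
            # Forcing the tool call guarantees machine-readable arguments.
            tool_choice={"type": "function", "function": {"name": "rate"}},
        )
        arguments = json.loads(response.choices[0].message.tool_calls[0].function.arguments)
        # Map the 1-10 rating onto [0, 1], matching the notebook: (rating - 1) / 9.
        return (arguments["rating"] - 1) / 9

    # Example, inside an async context (hypothetical inputs):
    #     score = await numeric_rater("Where did she live?", "in a barn", "She lived in a barn.")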