Merge pull request #1 from comhar/example-notebook

add example notebook
AnswerDotAI · Sep 5, 2024 · 427a859 · 427a859
2 parents 051c534 + 48a88e9
commit 427a859
Show file tree

Hide file tree

Showing 3 changed files with 278 additions and 0 deletions.
diff --git a/examples/attention.pdf b/examples/attention.pdf
diff --git a/examples/example.ipynb b/examples/example.ipynb
@@ -0,0 +1,278 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "90d2e6ca-4c5b-471f-830a-f469a303197b",
+   "metadata": {},
+   "source": [
+    "### Chat with your PDFs using byaldi + Claude 🚀\n",
+    "This notebook runs through the following examples:\n",
+    "- academic paper\n",
+    "- financial report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c0518e52-b011-4421-87ca-32e17e7b2761",
+   "metadata": {},
+   "source": [
+    "### Setup\n",
+    "- Follow the byaldi setup instructions [here](https://github.com/AnswerDotAI/byaldi/)\n",
+    "- pip install claudette"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "1ab91a5b-8a54-473e-a316-e15f02aaedfa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Verbosity is set to 1 (active). Pass verbose=0 to make quieter.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/byaldi/env/lib/python3.10/site-packages/transformers/models/paligemma/configuration_paligemma.py:137: FutureWarning: The `vocab_size` attribute is deprecated and will be removed in v4.44, Please use `text_config.vocab_size` instead.\n",
+      "  warnings.warn(\n",
+      "Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import base64\n",
+    "import os\n",
+    "from byaldi import RAGMultiModalModel\n",
+    "from claudette import *\n",
+    "\n",
+    "os.environ[\"HF_TOKEN\"] = \"YOUR_HF_TOKEN\" # to download the ColPali model\n",
+    "os.environ[\"ANTHROPIC_API_KEY\"] = \"ANTHROPIC_API_KEY\"\n",
+    "model = RAGMultiModalModel.from_pretrained(\"vidore/colpali\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dfe9ded4-d8d3-4eed-929e-e3aee3a9861d",
+   "metadata": {},
+   "source": [
+    "### Academic Paper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "014e932d-b9b2-4dd0-ae1d-1355db06eca5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Added page 1 of document 0 to index.\n",
+      "Added page 2 of document 0 to index.\n",
+      "Added page 3 of document 0 to index.\n",
+      "Added page 4 of document 0 to index.\n",
+      "Added page 5 of document 0 to index.\n",
+      "Added page 6 of document 0 to index.\n",
+      "Added page 7 of document 0 to index.\n",
+      "Added page 8 of document 0 to index.\n",
+      "Added page 9 of document 0 to index.\n",
+      "Added page 10 of document 0 to index.\n",
+      "Added page 11 of document 0 to index.\n",
+      "Added page 12 of document 0 to index.\n",
+      "Added page 13 of document 0 to index.\n",
+      "Added page 14 of document 0 to index.\n",
+      "Added page 15 of document 0 to index.\n",
+      "Index exported to .byaldi/attention\n",
+      "Index exported to .byaldi/attention\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Let's create the index\n",
+    "model.index(\n",
+    "    input_path=\"attention.pdf\",\n",
+    "    index_name=\"attention\",\n",
+    "    store_collection_with_index=True,\n",
+    "    overwrite=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "30d6488c-ba09-4d0c-ae70-409c08d0f866",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Doc ID: 0, Page: 8, Score: 23.375\n",
+      "Doc ID: 0, Page: 9, Score: 22.625\n",
+      "Doc ID: 0, Page: 2, Score: 20.875\n"
+     ]
+    },
+    {
+     "data": {
+      "text/markdown": [
+       "According to the table in the image, the BLEU score for the Transformer (base model) is:\n",
+       "\n",
+       "- 27.3 for EN-DE (English to German)\n",
+       "- 38.1 for EN-FR (English to French)\n",
+       "\n",
+       "<details>\n",
+       "\n",
+       "- id: `msg_01HmK5iN2JtLnnmTeMs587mw`\n",
+       "- content: `[{'text': 'According to the table in the image, the BLEU score for the Transformer (base model) is:\\n\\n- 27.3 for EN-DE (English to German)\\n- 38.1 for EN-FR (English to French)', 'type': 'text'}]`\n",
+       "- model: `claude-3-5-sonnet-20240620`\n",
+       "- role: `assistant`\n",
+       "- stop_reason: `end_turn`\n",
+       "- stop_sequence: `None`\n",
+       "- type: `message`\n",
+       "- usage: `{'input_tokens': 1522, 'output_tokens': 58, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}`\n",
+       "\n",
+       "</details>"
+      ],
+      "text/plain": [
+       "Message(id='msg_01HmK5iN2JtLnnmTeMs587mw', content=[TextBlock(text='According to the table in the image, the BLEU score for the Transformer (base model) is:\\n\\n- 27.3 for EN-DE (English to German)\\n- 38.1 for EN-FR (English to French)', type='text')], model='claude-3-5-sonnet-20240620', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=In: 1522; Out: 58; Total: 1580)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "query = \"What's the BLEU score for the transfomer base model?\"\n",
+    "results = model.search(query, k=3)\n",
+    "for result in results:\n",
+    "    print(f\"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}\")\n",
+    "    \n",
+    "chat = Chat(models[1]) # let's use sonnet\n",
+    "image_bytes = base64.b64decode(results[0].base64)\n",
+    "chat([image_bytes, query])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3f771e13-55d5-463d-aba1-f20b073978a6",
+   "metadata": {},
+   "source": [
+    "### Financial Report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "65c2a13c-ee4e-4b45-8eb9-f466b5b2b93d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Added page 1 of document 1 to index.\n",
+      "Added page 2 of document 1 to index.\n",
+      "Added page 3 of document 1 to index.\n",
+      "Added page 4 of document 1 to index.\n",
+      "Added page 5 of document 1 to index.\n",
+      "Added page 6 of document 1 to index.\n",
+      "Index exported to .byaldi/financial_report\n",
+      "Index exported to .byaldi/financial_report\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Let's create the index\n",
+    "model.index(\n",
+    "    input_path=\"financial_report.pdf\",\n",
+    "    index_name=\"financial_report\",\n",
+    "    store_collection_with_index=True,\n",
+    "    overwrite=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "a3d9fec7-b262-41a4-a3e4-82b7c7356983",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Doc ID: 1, Page: 4, Score: 20.375\n",
+      "Doc ID: 1, Page: 6, Score: 18.625\n",
+      "Doc ID: 1, Page: 5, Score: 18.5\n"
+     ]
+    },
+    {
+     "data": {
+      "text/markdown": [
+       "According to the bar graph showing monthly revenue for Product C, the month with the highest revenue was June. The bar for June is visibly the tallest, reaching above 2500 on the revenue scale, indicating it generated the most revenue compared to all other months shown.\n",
+       "\n",
+       "<details>\n",
+       "\n",
+       "- id: `msg_01FEU91Kwi8PfLGpqvrycM8n`\n",
+       "- content: `[{'text': 'According to the bar graph showing monthly revenue for Product C, the month with the highest revenue was June. The bar for June is visibly the tallest, reaching above 2500 on the revenue scale, indicating it generated the most revenue compared to all other months shown.', 'type': 'text'}]`\n",
+       "- model: `claude-3-5-sonnet-20240620`\n",
+       "- role: `assistant`\n",
+       "- stop_reason: `end_turn`\n",
+       "- stop_sequence: `None`\n",
+       "- type: `message`\n",
+       "- usage: `{'input_tokens': 1573, 'output_tokens': 59, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}`\n",
+       "\n",
+       "</details>"
+      ],
+      "text/plain": [
+       "Message(id='msg_01FEU91Kwi8PfLGpqvrycM8n', content=[TextBlock(text='According to the bar graph showing monthly revenue for Product C, the month with the highest revenue was June. The bar for June is visibly the tallest, reaching above 2500 on the revenue scale, indicating it generated the most revenue compared to all other months shown.', type='text')], model='claude-3-5-sonnet-20240620', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=In: 1573; Out: 59; Total: 1632)"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "query = \"In which month did Product C generate the most revenue?\"\n",
+    "results = model.search(query, k=3)\n",
+    "for result in results:\n",
+    "    print(f\"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}\")\n",
+    "    \n",
+    "chat = Chat(models[1]) # let's use sonnet\n",
+    "image_bytes = base64.b64decode(results[0].base64)\n",
+    "chat([image_bytes, query])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "byaldi",
+   "language": "python",
+   "name": "byaldi"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/financial_report.pdf b/examples/financial_report.pdf