-
Notifications
You must be signed in to change notification settings - Fork 67
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from comhar/example-notebook
add example notebook
- Loading branch information
Showing
3 changed files
with
278 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,278 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "90d2e6ca-4c5b-471f-830a-f469a303197b", | ||
"metadata": {}, | ||
"source": [ | ||
"### Chat with your PDFs using byaldi + Claude 🚀\n", | ||
"This notebook runs through the following examples:\n", | ||
"- academic paper\n", | ||
"- financial report" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c0518e52-b011-4421-87ca-32e17e7b2761", | ||
"metadata": {}, | ||
"source": [ | ||
"### Setup\n", | ||
"- Follow the byaldi setup instructions [here](https://github.com/AnswerDotAI/byaldi/)\n", | ||
"- pip install claudette" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"id": "1ab91a5b-8a54-473e-a316-e15f02aaedfa", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Verbosity is set to 1 (active). Pass verbose=0 to make quieter.\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/home/byaldi/env/lib/python3.10/site-packages/transformers/models/paligemma/configuration_paligemma.py:137: FutureWarning: The `vocab_size` attribute is deprecated and will be removed in v4.44, Please use `text_config.vocab_size` instead.\n", | ||
" warnings.warn(\n", | ||
"Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00, 1.05s/it]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import base64\n", | ||
"import os\n", | ||
"from byaldi import RAGMultiModalModel\n", | ||
"from claudette import *\n", | ||
"\n", | ||
"os.environ[\"HF_TOKEN\"] = \"YOUR_HF_TOKEN\" # to download the ColPali model\n", | ||
"os.environ[\"ANTHROPIC_API_KEY\"] = \"ANTHROPIC_API_KEY\"\n", | ||
"model = RAGMultiModalModel.from_pretrained(\"vidore/colpali\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "dfe9ded4-d8d3-4eed-929e-e3aee3a9861d", | ||
"metadata": {}, | ||
"source": [ | ||
"### Academic Paper" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "014e932d-b9b2-4dd0-ae1d-1355db06eca5", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Added page 1 of document 0 to index.\n", | ||
"Added page 2 of document 0 to index.\n", | ||
"Added page 3 of document 0 to index.\n", | ||
"Added page 4 of document 0 to index.\n", | ||
"Added page 5 of document 0 to index.\n", | ||
"Added page 6 of document 0 to index.\n", | ||
"Added page 7 of document 0 to index.\n", | ||
"Added page 8 of document 0 to index.\n", | ||
"Added page 9 of document 0 to index.\n", | ||
"Added page 10 of document 0 to index.\n", | ||
"Added page 11 of document 0 to index.\n", | ||
"Added page 12 of document 0 to index.\n", | ||
"Added page 13 of document 0 to index.\n", | ||
"Added page 14 of document 0 to index.\n", | ||
"Added page 15 of document 0 to index.\n", | ||
"Index exported to .byaldi/attention\n", | ||
"Index exported to .byaldi/attention\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Let's create the index\n", | ||
"model.index(\n", | ||
" input_path=\"attention.pdf\",\n", | ||
" index_name=\"attention\",\n", | ||
" store_collection_with_index=True,\n", | ||
" overwrite=True\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"id": "30d6488c-ba09-4d0c-ae70-409c08d0f866", | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Doc ID: 0, Page: 8, Score: 23.375\n", | ||
"Doc ID: 0, Page: 9, Score: 22.625\n", | ||
"Doc ID: 0, Page: 2, Score: 20.875\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/markdown": [ | ||
"According to the table in the image, the BLEU score for the Transformer (base model) is:\n", | ||
"\n", | ||
"- 27.3 for EN-DE (English to German)\n", | ||
"- 38.1 for EN-FR (English to French)\n", | ||
"\n", | ||
"<details>\n", | ||
"\n", | ||
"- id: `msg_01HmK5iN2JtLnnmTeMs587mw`\n", | ||
"- content: `[{'text': 'According to the table in the image, the BLEU score for the Transformer (base model) is:\\n\\n- 27.3 for EN-DE (English to German)\\n- 38.1 for EN-FR (English to French)', 'type': 'text'}]`\n", | ||
"- model: `claude-3-5-sonnet-20240620`\n", | ||
"- role: `assistant`\n", | ||
"- stop_reason: `end_turn`\n", | ||
"- stop_sequence: `None`\n", | ||
"- type: `message`\n", | ||
"- usage: `{'input_tokens': 1522, 'output_tokens': 58, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}`\n", | ||
"\n", | ||
"</details>" | ||
], | ||
"text/plain": [ | ||
"Message(id='msg_01HmK5iN2JtLnnmTeMs587mw', content=[TextBlock(text='According to the table in the image, the BLEU score for the Transformer (base model) is:\\n\\n- 27.3 for EN-DE (English to German)\\n- 38.1 for EN-FR (English to French)', type='text')], model='claude-3-5-sonnet-20240620', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=In: 1522; Out: 58; Total: 1580)" | ||
] | ||
}, | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"query = \"What's the BLEU score for the transfomer base model?\"\n", | ||
"results = model.search(query, k=3)\n", | ||
"for result in results:\n", | ||
" print(f\"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}\")\n", | ||
" \n", | ||
"chat = Chat(models[1]) # let's use sonnet\n", | ||
"image_bytes = base64.b64decode(results[0].base64)\n", | ||
"chat([image_bytes, query])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "3f771e13-55d5-463d-aba1-f20b073978a6", | ||
"metadata": {}, | ||
"source": [ | ||
"### Financial Report" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"id": "65c2a13c-ee4e-4b45-8eb9-f466b5b2b93d", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Added page 1 of document 1 to index.\n", | ||
"Added page 2 of document 1 to index.\n", | ||
"Added page 3 of document 1 to index.\n", | ||
"Added page 4 of document 1 to index.\n", | ||
"Added page 5 of document 1 to index.\n", | ||
"Added page 6 of document 1 to index.\n", | ||
"Index exported to .byaldi/financial_report\n", | ||
"Index exported to .byaldi/financial_report\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Let's create the index\n", | ||
"model.index(\n", | ||
" input_path=\"financial_report.pdf\",\n", | ||
" index_name=\"financial_report\",\n", | ||
" store_collection_with_index=True,\n", | ||
" overwrite=True\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"id": "a3d9fec7-b262-41a4-a3e4-82b7c7356983", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Doc ID: 1, Page: 4, Score: 20.375\n", | ||
"Doc ID: 1, Page: 6, Score: 18.625\n", | ||
"Doc ID: 1, Page: 5, Score: 18.5\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/markdown": [ | ||
"According to the bar graph showing monthly revenue for Product C, the month with the highest revenue was June. The bar for June is visibly the tallest, reaching above 2500 on the revenue scale, indicating it generated the most revenue compared to all other months shown.\n", | ||
"\n", | ||
"<details>\n", | ||
"\n", | ||
"- id: `msg_01FEU91Kwi8PfLGpqvrycM8n`\n", | ||
"- content: `[{'text': 'According to the bar graph showing monthly revenue for Product C, the month with the highest revenue was June. The bar for June is visibly the tallest, reaching above 2500 on the revenue scale, indicating it generated the most revenue compared to all other months shown.', 'type': 'text'}]`\n", | ||
"- model: `claude-3-5-sonnet-20240620`\n", | ||
"- role: `assistant`\n", | ||
"- stop_reason: `end_turn`\n", | ||
"- stop_sequence: `None`\n", | ||
"- type: `message`\n", | ||
"- usage: `{'input_tokens': 1573, 'output_tokens': 59, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}`\n", | ||
"\n", | ||
"</details>" | ||
], | ||
"text/plain": [ | ||
"Message(id='msg_01FEU91Kwi8PfLGpqvrycM8n', content=[TextBlock(text='According to the bar graph showing monthly revenue for Product C, the month with the highest revenue was June. The bar for June is visibly the tallest, reaching above 2500 on the revenue scale, indicating it generated the most revenue compared to all other months shown.', type='text')], model='claude-3-5-sonnet-20240620', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=In: 1573; Out: 59; Total: 1632)" | ||
] | ||
}, | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"query = \"In which month did Product C generate the most revenue?\"\n", | ||
"results = model.search(query, k=3)\n", | ||
"for result in results:\n", | ||
" print(f\"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}\")\n", | ||
" \n", | ||
"chat = Chat(models[1]) # let's use sonnet\n", | ||
"image_bytes = base64.b64decode(results[0].base64)\n", | ||
"chat([image_bytes, query])" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "byaldi", | ||
"language": "python", | ||
"name": "byaldi" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.14" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Binary file not shown.