diff --git a/docs/docs/api_reference/extractors/documentcontext.md b/docs/docs/api_reference/extractors/documentcontext.md new file mode 100644 index 0000000000000..58a468e2e14fc --- /dev/null +++ b/docs/docs/api_reference/extractors/documentcontext.md @@ -0,0 +1,4 @@ +::: llama_index.extractors + options: + members: + - DocumentContextExtractor diff --git a/docs/docs/examples/data/paul_graham/paul_graham_essay_ambiguated.txt b/docs/docs/examples/data/paul_graham/paul_graham_essay_ambiguated.txt new file mode 100644 index 0000000000000..c4b9ab3fabaa1 --- /dev/null +++ b/docs/docs/examples/data/paul_graham/paul_graham_essay_ambiguated.txt @@ -0,0 +1,42 @@ +Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. They were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep. The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The district's machine happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. The space was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights. The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the reader and press a button to load the code into memory and run it. The result would ordinarily be to print something on the spectacularly loud device. I was puzzled by the machine. I couldn't figure out what to do with it. And in retrospect there's not much I could have done with it. The only form of input to programs was data stored on cards, and I didn't have any information stored on them. The only other option was to do things that didn't rely on any input, like calculate approximations of pi, but I didn't know enough math to do anything interesting of that type. So I'm not surprised I can't remember any code I wrote, because it can't have done much. My clearest memory is of the moment I learned it was possible for programs not to terminate, when one of mine didn't. On a machine without time-sharing, this was a social as well as a technical error, as the manager's expression made clear. With microcomputers, everything changed. Now you could have one sitting right in front of you, on a desk, that could respond to your keystrokes as it was running instead of just churning through a stack of punched inputs and then stopping. + +I shifted to writing essays again, and created several new ones over the next few months. Some even ventured beyond startup topics. Then in March 2015 I began working on Lisp again. +Lisp's unique characteristic is that its core is a language defined by writing an interpreter in itself. It wasn't originally intended as a standard programming language. It was created as a formal model of computation, an alternative to the Turing machine. If you want to write an interpreter for a language in itself, what's the minimum set of predefined operators do you need? The Lisp that John McCarthy invented, or more accurately discovered, is an answer to that question. 
+McCarthy didn't realize the language could even be used to program computers until his grad student Steve Russell suggested it. Russell translated McCarthy's interpreter into IBM 704 machine language, and from then on Lisp also became a programming language in the conventional sense. But its origins as a model of computation gave it a power and elegance that other languages couldn't match. This quality was what attracted me in college, though I didn't understand why at the time. +McCarthy's 1960 version did nothing more than interpret Lisp expressions. It was missing many features you'd want in a programming language. So these had to be added, and when they were, they weren't defined using his original axiomatic approach. That wouldn't have been feasible at the time. McCarthy tested his interpreter by hand-simulating the execution of programs. But it was already getting close to the limit of interpreters you could test that way — indeed, there was a bug in it that he had overlooked. To test a more complicated system, you'd have had to run it, and computers then weren't powerful enough. + +Now they are powerful enough. Now you could continue using the axiomatic approach till you'd defined a complete programming language. And as long as every change you made to the original system was a discoveredness-preserving transformation, you could, in principle, end up with a complete language that had this quality. Harder to do than to talk about, of course, but if it was possible in principle, why not try? So I decided to take a shot at it. The work took 4 years, from March 26, 2015 to October 12, 2019. It was fortunate that I had a precisely defined goal, or it would have been hard to keep at it for so long. +I wrote this new Lisp, called Bel, in itself in Arc. That may sound like a contradiction, but it's an indication of the sort of trickery I had to engage in to make this work. By means of an egregious collection of hacks I managed to make something close enough to an interpreter written in itself that could actually run. Not fast, but fast enough to test. +I had to ban myself from writing essays during most of this time, or I'd never have finished. In late 2015 I spent 3 months writing essays, and when I went back to working on Bel I could barely understand the code. Not so much because it was badly written as because the problem is so convoluted. When you're working on an interpreter written in itself, it's hard to keep track of what's happening at what level, and errors can be practically encrypted by the time you get them. +So I said no more writing till the project was done. But I told few people about it while I was working on it. So for years it must have seemed that I was doing nothing, when in fact I was working harder than I'd ever worked on anything. Occasionally after wrestling for hours with some gruesome bug I'd check Twitter or HN and see someone asking "Does Paul Graham still code?" + +Working on the language was hard but satisfying. I worked on it so intensively that at any given time I had a decent chunk of the code in my head and could write more there. I remember taking the boys to the coast on a sunny day in 2015 and figuring out how to deal with some problem involving continuations while I watched them play in the tide pools. This experience felt like I was doing life right. I remember that moment because I was slightly dismayed at how novel it felt. The good news is that I had more moments like this over the next few years. +In the summer of 2016 we moved to England. 
We wanted our kids to see what it was like living in another country, and since I was a British citizen by birth, that country seemed the obvious choice. We only meant to stay for a year, but we liked it so much that we still live there. So most of the work was written in England. +In the fall of 2019, Bel was finally finished. Like McCarthy's original version, it was a spec rather than an implementation, although like McCarthy's work it's a spec expressed as code. +Now that I could write essays again, I wrote a bunch about topics I'd had stacked up. I kept writing through 2020, but I also started to think about other things I could work on. How should I choose what to do? Well, how had I chosen what to work on in the past? I wrote an essay for myself to answer that question, and I was surprised how long and messy the answer turned out to be. If this surprised me, who'd lived it, then I thought perhaps it would be interesting to other people, and encouraging to those with similarly messy lives. So I wrote a more detailed version for others to read, and this is the last sentence of it. + +[1] My experience skipped a step in the evolution of computers: time-sharing machines with interactive OSes. I went straight from batch processing to microcomputers, which made the latter seem all the more exciting. +[2] Italian words for abstract concepts can nearly always be predicted from their English cognates (except for occasional traps like polluzione). It's the everyday words that differ. So if you string together a lot of abstract concepts with a few simple verbs, you can make a little Italian go a long way. +[3] I lived at Piazza San Felice 4, so my walk to the Accademia went straight down the spine of old Florence: past the Pitti, across the bridge, past Orsanmichele, between the Duomo and the Baptistery, and then up Via Ricasoli to Piazza San Marco. I saw the city at street level in every possible condition, from empty dark winter evenings to sweltering summer days when the streets were packed with tourists. +[4] You can of course paint people like still lives if you want to, and they're willing. That sort of portrait is arguably the apex of still life painting, though the long sitting does tend to produce pained expressions in the sitters. +[5] Interleaf was one of many companies that had smart people and built impressive technology, and yet got crushed by Moore's Law. In the 1990s the exponential growth in the power of commodity (i.e. Intel) processors rolled up high-end, special-purpose hardware and software companies like a bulldozer. +[6] The signature style seekers at RISD weren't specifically mercenary. In the art world, money and coolness are tightly coupled. Anything expensive comes to be seen as fashionable, and anything seen as trendy will soon become equally costly. +[7] Technically the apartment wasn't rent-controlled but rent-stabilized, but this is a refinement only New Yorkers would know or care about. The point is that the place was really cheap, less than half market price. +[8] Most software you can launch as soon as it's done. But when the software is an online store builder and you're hosting the stores, if you don't have any users yet, that fact will be painfully obvious. So before we could launch publicly we had to launch privately, in the sense of recruiting an initial set of users and making sure they had decent-looking shops. +[9] We'd had a code editor in Viaweb for users to define their own page styles. 
They didn't know it, but they were editing Lisp expressions underneath. But this wasn't an app editor, because the code ran when the merchants' sites were generated, not when shoppers visited them. +[10] This was the first instance of what is now a familiar experience, and so was what happened next, when I read the comments and found they were full of angry people. How could I claim that Lisp was better than other languages? Weren't they all Turing complete? People who see the responses to essays I write sometimes tell me how sorry they feel for me, but I'm not exaggerating when I reply that things have always been like this, since the very beginning. It comes with the territory. An essay must tell readers things they don't already know, and some people dislike being told such information. +Continuing with the notes: +[11] People put plenty of stuff on the internet in the 90s of course, but putting something online is not the same as publishing it online. Publishing online means you treat the online version as the (or at least a) primary version. +[12] There is a general lesson here that our experience with Y Combinator also teaches: Customs continue to constrain you long after the restrictions that caused them have disappeared. Customary VC practice had once, like the customs about publishing essays, been based on real constraints. Startups had once been much more expensive to start, and proportionally rare. Now they could be cheap and common, but the VCs' customs still reflected the old world, just as customs about writing essays still reflected the constraints of the print era. +Which in turn implies that people who are independent-minded (i.e. less influenced by custom) will have an advantage in fields affected by rapid change (where customs are more likely to be obsolete). +Here's an interesting point, though: you can't always predict which fields will be affected by rapid change. Obviously software and venture capital will be, but who would have predicted that essay writing would be? +[13] Y Combinator was not the original name. At first we were called Cambridge Seed. But we didn't want a regional name, in case someone copied us in Silicon Valley, so we renamed ourselves after one of the coolest tricks in the lambda calculus, the Y combinator. +I picked orange as our color partly because it's the warmest, and partly because no VC used it. In 2005 all the VCs used staid colors like maroon, navy blue, and forest green, because they were trying to appeal to LPs, not founders. The YC logo itself is an inside joke: the Viaweb logo had been a white V on a red circle, so I made the new one a white Y on an orange square. +[14] YC did become a fund for a couple years starting in 2009, because it was getting so big I could no longer afford to fund it personally. But after Heroku got bought we had enough money to go back to being self-funded. +[15] I've never liked the term "deal flow," because it implies that the number of new startups at any given time is fixed. This assumption is not only false, but it's the purpose of YC to falsify it, by causing startups to be founded that would not otherwise have existed. +[16] She reports that the air conditioners were all different shapes and sizes, because there was a run on them and she had to get whatever she could, but that they were all heavier than she could carry now. +[17] Another problem with HN was a bizarre edge case that occurs when you both write essays and run a forum. 
When you run a forum, you're assumed to see if not every conversation, at least every conversation involving you. And when you write essays, people post highly imaginative misinterpretations of them on forums. Individually these two phenomena are tedious but bearable, but the combination is disastrous. You actually have to respond to the misinterpretations, because the assumption that you're present in the conversation means that not responding to any sufficiently upvoted criticism reads as a tacit admission that it's correct. But that response in turn encourages more; anyone who wants to pick a fight with you senses that now is their chance. +[18] The worst thing about leaving YC was not working with Jessica anymore. We'd been working on the company almost the whole time we'd known each other, and we'd neither tried nor wanted to separate it from our personal lives, so leaving was like pulling up a deeply rooted tree. +[19] One way to get more precise about the concept of invented vs discovered is to talk about space aliens. Any sufficiently advanced alien civilization would certainly know about the Pythagorean theorem, for example. I believe, though with less certainty, that they would also know about the Lisp in McCarthy's 1960 paper. +But if so there's no reason to suppose that this is the limit of the language that might be known to them. Presumably aliens need numbers and errors and I/O too. So it seems likely there exists at least one path out of McCarthy's Lisp along which discoveredness is preserved. +Thanks to Trevor Blackwell, John Collison, Patrick Collison, Daniel Gackle, Ralph Hazell, Jessica Livingston, Robert Morris, and Harj Taggar for reading drafts of this. diff --git a/docs/docs/examples/metadata_extraction/DocumentContextExtractor.ipynb b/docs/docs/examples/metadata_extraction/DocumentContextExtractor.ipynb new file mode 100644 index 0000000000000..be470bdb703ce --- /dev/null +++ b/docs/docs/examples/metadata_extraction/DocumentContextExtractor.ipynb @@ -0,0 +1,304 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Contextual Retrieval With Llama Index\n", + "\n", + "This notebook covers contextual retrieval with the llama_index DocumentContextExtractor.\n", + "\n", + "Based on an Anthropic [blog post](https://www.anthropic.com/news/contextual-retrieval), the concept is to:\n", + "1. Use an LLM to generate a 'context' for each chunk based on the entire document\n", + "2. Embed the chunk + context together\n", + "3. Reap the benefits of higher RAG accuracy\n", + "\n", + "While you can also do this manually, the DocumentContextExtractor offers a lot of convenience and error handling, plus you can integrate it into your llama_index pipelines! Let's get started.\n", + "\n", + "NOTE: This notebook costs about $0.02 every time you run it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Install Packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index\n", + "%pip install llama-index-readers-file\n", + "%pip install llama-index-embeddings-huggingface\n", + "%pip install llama-index-llms-openai" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set up an LLM\n", + "You can use the MockLLM or a real LLM of your choice here. Gemini 2.0 Flash and gpt-4o-mini work well."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.llms.openai import OpenAI\n", + "from llama_index.core import Settings\n", + "\n", + "OPENAI_API_KEY = \"sk-...\"\n", + "llm = OpenAI(model=\"gpt-4o-mini\", api_key=OPENAI_API_KEY)\n", + "Settings.llm = llm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set up a data pipeline\n", + "\n", + "We'll need an embedding model, a document store, a vector store, and a way to split tokens." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build Pipeline & Index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/loganmarkewich/Library/Caches/pypoetry/virtualenvs/llama-index-caVs7DDe-py3.10/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from llama_index.core import VectorStoreIndex, StorageContext\n", + "from llama_index.core.node_parser import TokenTextSplitter\n", + "from llama_index.core.storage.docstore.simple_docstore import (\n", + " SimpleDocumentStore,\n", + ")\n", + "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", + "\n", + "# Initialize document store and embedding model\n", + "docstore = SimpleDocumentStore()\n", + "embed_model = HuggingFaceEmbedding(model_name=\"baai/bge-small-en-v1.5\")\n", + "\n", + "# Create storage contexts\n", + "storage_context = StorageContext.from_defaults(docstore=docstore)\n", + "storage_context_no_extra_context = StorageContext.from_defaults()\n", + "text_splitter = TokenTextSplitter(\n", + " separator=\" \", chunk_size=256, chunk_overlap=10\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### DocumentContextExtractor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the new part!\n", + "\n", + "from llama_index.core.extractors import DocumentContextExtractor\n", + "\n", + "context_extractor = DocumentContextExtractor(\n", + " # these 2 are mandatory\n", + " docstore=docstore,\n", + " max_context_length=128000,\n", + " # below are optional\n", + " llm=llm, # defaults to Settings.llm\n", + " oversized_document_strategy=\"warn\",\n", + " max_output_tokens=100,\n", + " key=\"context\",\n", + " prompt=DocumentContextExtractor.SUCCINCT_CONTEXT_PROMPT,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget \"https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay_ambiguated.txt\" -O \"paul_graham_essay_ambiguated.txt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core import SimpleDirectoryReader\n", + "\n", + "reader = SimpleDirectoryReader(\n", + " input_files=[\"./paul_graham_essay_ambiguated.txt\"]\n", + ")\n", + "documents = reader.load_data()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the pipeline, then search" + ] + }, + { + "cell_type": "code",
"execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 15/15 [00:07<00:00, 2.10it/s]\n" + ] + } + ], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "# need to add documents directly for the DocumentContextExtractor to work\n", + "storage_context.docstore.add_documents(documents)\n", + "index = VectorStoreIndex.from_documents(\n", + " documents=documents,\n", + " storage_context=storage_context,\n", + " embed_model=embed_model,\n", + " transformations=[text_splitter, context_extractor],\n", + ")\n", + "\n", + "index_nocontext = VectorStoreIndex.from_documents(\n", + " documents=documents,\n", + " storage_context=storage_context_no_extra_context,\n", + " embed_model=embed_model,\n", + " transformations=[text_splitter],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_question = \"Which chunks of text discuss the IBM 704?\"\n", + "retriever = index.as_retriever(similarity_top_k=2)\n", + "nodes_fromcontext = retriever.retrieve(test_question)\n", + "\n", + "retriever_nocontext = index_nocontext.as_retriever(similarity_top_k=2)\n", + "nodes_nocontext = retriever_nocontext.retrieve(test_question)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==========\n", + "NO CONTEXT\n", + "\n", + "Chunk 1:\n", + "Score: 0.5710870309825231\n", + "Content: it. The result would ordinarily be to print something on the spectacularly loud device. I was puzzled by the machine. I couldn't figure out what to do with it. And in retrospect there's not much I could have done with it. The only form of input to programs was data stored on cards, and I didn't have any information stored on them. The only other option was to do things that didn't rely on any input, like calculate approximations of pi, but I didn't know enough math to do anything interesting of that type. So I'm not surprised I can't remember any code I wrote, because it can't have done much. My clearest memory is of the moment I learned it was possible for programs not to terminate, when one of mine didn't. On a machine without time-sharing, this was a social as well as a technical error, as the manager's expression made clear. With microcomputers, everything changed. Now you could have one sitting right in front of you, on a desk, that could respond to your keystrokes as it was running instead of just churning through a stack of punched inputs\n", + "\n", + "Chunk 2:\n", + "Score: 0.567587387219806\n", + "Content: McCarthy's 1960 paper.\n", + "But if so there's no reason to suppose that this is the limit of the language that might be known to them. Presumably aliens need numbers and errors and I/O too. So it seems likely there exists at least one path out of McCarthy's Lisp along which discoveredness is preserved.\n", + "Thanks to Trevor Blackwell, John Collison, Patrick Collison, Daniel Gackle, Ralph Hazell, Jessica Livingston, Robert Morris, and Harj Taggar for reading drafts of this.\n", + "==========\n", + "WITH CONTEXT\n", + "\n", + "Chunk 1:\n", + "Score: 0.6776241992281743\n", + "Content: it. The result would ordinarily be to print something on the spectacularly loud device. I was puzzled by the machine. I couldn't figure out what to do with it. And in retrospect there's not much I could have done with it. 
The only form of input to programs was data stored on cards, and I didn't have any information stored on them. The only other option was to do things that didn't rely on any input, like calculate approximations of pi, but I didn't know enough math to do anything interesting of that type. So I'm not surprised I can't remember any code I wrote, because it can't have done much. My clearest memory is of the moment I learned it was possible for programs not to terminate, when one of mine didn't. On a machine without time-sharing, this was a social as well as a technical error, as the manager's expression made clear. With microcomputers, everything changed. Now you could have one sitting right in front of you, on a desk, that could respond to your keystrokes as it was running instead of just churning through a stack of punched inputs\n", + "\n", + "Chunk 2:\n", + "Score: 0.6200645958839048\n", + "Content: Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. They were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep. The first programs I tried writing were on the IBM 1401 that our school district used for what was then called \"data processing.\" This was in 9th grade, so I was 13 or 14. The district's machine happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. The space was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights. The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the reader and press a button to load the code into memory and run it. 
The result would ordinarily be to print something\n" + ] + } + ], + "source": [ + "# Print each node's content\n", + "print(\"==========\")\n", + "print(\"NO CONTEXT\")\n", + "for i, node in enumerate(nodes_nocontext, 1):\n", + " print(f\"\\nChunk {i}:\")\n", + " print(f\"Score: {node.score}\") # Similarity score\n", + " print(f\"Content: {node.node.text}\") # The actual text content\n", + "\n", + "# Print each node's content\n", + "print(\"==========\")\n", + "print(\"WITH CONTEXT\")\n", + "for i, node in enumerate(nodes_fromcontext, 1):\n", + " print(f\"\\nChunk {i}:\")\n", + " print(f\"Score: {node.score}\") # Similarity score\n", + " print(f\"Content: {node.node.text}\") # The actual text content" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/llama-index-core/llama_index/core/extractors/__init__.py b/llama-index-core/llama_index/core/extractors/__init__.py index 7860586b1f931..baaaf118491fe 100644 --- a/llama-index-core/llama_index/core/extractors/__init__.py +++ b/llama-index-core/llama_index/core/extractors/__init__.py @@ -6,6 +6,7 @@ SummaryExtractor, TitleExtractor, ) +from llama_index.core.extractors.document_context import DocumentContextExtractor __all__ = [ "SummaryExtractor", @@ -14,4 +15,5 @@ "KeywordExtractor", "BaseExtractor", "PydanticProgramExtractor", + "DocumentContextExtractor", ] diff --git a/llama-index-core/llama_index/core/extractors/document_context.py b/llama-index-core/llama_index/core/extractors/document_context.py new file mode 100644 index 0000000000000..a611462518bc9 --- /dev/null +++ b/llama-index-core/llama_index/core/extractors/document_context.py @@ -0,0 +1,334 @@ +import asyncio +import logging +import random +from functools import lru_cache +from typing import ( + Any, + ClassVar, + Coroutine, + Dict, + List, + Literal, + Optional, + Sequence, + Set, + Union, +) +from typing_extensions import TypeGuard + +from llama_index.core import Settings +from llama_index.core.async_utils import DEFAULT_NUM_WORKERS, run_jobs +from llama_index.core.extractors import BaseExtractor +from llama_index.core.llms import ChatMessage, ChatResponse, ImageBlock, LLM, TextBlock +from llama_index.core.schema import BaseNode, Node, TextNode +from llama_index.core.storage.docstore.simple_docstore import DocumentStore + + +def is_text_node(node: BaseNode) -> TypeGuard[Union[Node, TextNode]]: + return isinstance(node, (Node, TextNode)) + + +OversizeStrategy = Literal["warn", "error", "ignore"] + + +# original context prompt from the Anthropic cookbook/blogpost, works well +ORIGINAL_CONTEXT_PROMPT: str = """ +Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. +Answer only with the succinct context and nothing else. +""" + +# miniaturized context prompt, generates better results, produces more keyword-laden results for better matches +SUCCINCT_CONTEXT_PROMPT: str = """ +Generate keywords and brief phrases describing the main topics, entities, and actions in this text. Replace pronouns with their specific referents. Disambiguate pronouns and ambiguous terms in the chunk. 
Format as comma-separated phrases. Exclude meta-commentary about the text itself. +""" + + +class DocumentContextExtractor(BaseExtractor): + """ + An LLM-based context extractor for enhancing RAG accuracy through document analysis. + + Nodes that already have the configured 'key' in node.metadata are skipped and will NOT be re-processed. + + This extractor processes documents and their nodes to generate contextual metadata, + implementing the approach described in the Anthropic "Contextual Retrieval" blog post. + It handles rate limits, document size constraints, and parallel processing of nodes. + + Attributes: + llm (LLM): Language model instance for generating context + docstore (DocumentStore): Storage for parent documents + key (str): Metadata key for storing extracted context + prompt (str): Prompt template for context generation + doc_ids (Set[str]): Set of processed document IDs + max_context_length (int): Maximum allowed document context length + max_output_tokens (int): Maximum tokens in generated context + oversized_document_strategy (OversizeStrategy): Strategy for handling large documents + + Example: + ```python + extractor = DocumentContextExtractor( + docstore=my_docstore, + llm=my_llm, + max_context_length=64000, + max_output_tokens=256 + ) + metadata_list = await extractor.aextract(nodes) + ``` + """ + + # Pydantic fields + llm: LLM + docstore: DocumentStore + key: str + prompt: str + doc_ids: Set[str] + max_context_length: int + max_output_tokens: int + oversized_document_strategy: OversizeStrategy + num_workers: int = DEFAULT_NUM_WORKERS + + ORIGINAL_CONTEXT_PROMPT: ClassVar[str] = ORIGINAL_CONTEXT_PROMPT + SUCCINCT_CONTEXT_PROMPT: ClassVar[str] = SUCCINCT_CONTEXT_PROMPT + + DEFAULT_KEY: str = "context" + + def __init__( + self, + docstore: DocumentStore, + llm: Optional[LLM] = None, + max_context_length: int = 1000, + key: str = DEFAULT_KEY, + prompt: str = ORIGINAL_CONTEXT_PROMPT, + num_workers: int = DEFAULT_NUM_WORKERS, + max_output_tokens: int = 512, + oversized_document_strategy: OversizeStrategy = "warn", + **kwargs: Any, + ) -> None: + """Init params.""" + llm = llm or Settings.llm + # not all LLMs implement achat, particularly the huggingfaceapi ones. + assert hasattr(llm, "achat") + + super().__init__( + llm=llm, + docstore=docstore, + key=key, + prompt=prompt, + doc_ids=set(), + max_context_length=max_context_length, + max_output_tokens=max_output_tokens, + oversized_document_strategy=oversized_document_strategy, + num_workers=num_workers, + **kwargs, + ) + + # This can take a surprisingly long time on longer docs, so we cache it. For oversized docs, we end up counting twice, the 2nd time without the cache. + # But if you're repeatedly running way-oversized docs, the time that takes won't be what matters anyway. + @staticmethod + @lru_cache(maxsize=1000) + def _count_tokens(text: str) -> int: + """ + This can take a surprisingly long time on longer docs, and it has to run for every doc regardless of size, so we cache it. + """ + encoder = Settings.tokenizer + tokens = encoder(text) + return len(tokens) + + async def _agenerate_node_context( + self, + node: Union[Node, TextNode], + metadata: Dict, + document: Union[Node, TextNode], + prompt: str, + key: str, + ) -> Dict: + """ + Generate context for a node using the LLM, with retry logic. + + Implements exponential backoff for rate-limit handling and uses prompt + caching when available; other errors are logged and the node is skipped.
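+
+ Example:
+ A sketch of the two-message request this method builds (document_text,
+ chunk_text, and prompt are illustrative placeholders here; the
+ cache_control hint only has an effect on providers that support
+ Anthropic-style prompt caching and is ignored by other LLMs):
+
+ ```python
+ messages = [
+     # full document first, marked as cacheable so it can be reused across chunks
+     ChatMessage(
+         role="user",
+         content=[TextBlock(text=document_text, type="text")],
+         additional_kwargs={"cache_control": {"type": "ephemeral"}},
+     ),
+     # then the chunk to situate, followed by the context-generation prompt
+     ChatMessage(
+         role="user",
+         content=[TextBlock(text=f"{chunk_text}\n{prompt}", type="text")],
+     ),
+ ]
+ ```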
+ + Args: + node: Node to generate context for + metadata: Metadata dictionary to update + document: Parent document containing the node + prompt: Prompt template for context generation + key: Metadata key for storing generated context + + Returns: + Updated metadata dictionary with generated context + + Note: + Uses exponential backoff starting at 60 seconds with up to 5 retries + for rate limit handling. + """ + cached_text = f"{document.get_content()}" + messages = [ + ChatMessage( + role="user", + content=[ + TextBlock( + text=cached_text, + type="text", + ) + ], + additional_kwargs={"cache_control": {"type": "ephemeral"}}, + ), + ChatMessage( + role="user", + content=[ + TextBlock( + text=f"Here is the chunk we want to situate within the whole document:\n{node.get_content()}\n{prompt}", + type="text", + ) + ], + ), + ] + + max_retries = 5 + base_delay = 60 + + for attempt in range(max_retries): + try: + # Extra headers typically dont cause issues + headers = {"anthropic-beta": "prompt-caching-2024-07-31"} + + response: ChatResponse = await self.llm.achat( + messages, max_tokens=self.max_output_tokens, extra_headers=headers + ) + + first_block: Union[TextBlock, ImageBlock] = response.message.blocks[0] + if isinstance(first_block, TextBlock): + metadata[key] = first_block.text + else: + logging.warning( + f"Received non-text block type: {type(first_block)}" + ) + return metadata + + except Exception as e: + is_rate_limit = any( + message in str(e).lower() + for message in ["rate limit", "too many requests", "429"] + ) + + if is_rate_limit and attempt < max_retries - 1: + delay = (base_delay * (2**attempt)) + (random.random() * 0.5) + logging.warning( + f"Rate limit hit, retrying in {delay:.1f} seconds " + f"(attempt {attempt + 1}/{max_retries})" + ) + await asyncio.sleep(delay) + continue + + if is_rate_limit: + logging.error( + f"Failed after {max_retries} retries due to rate limiting" + ) + else: + logging.warning( + f"Error generating context for node {node.node_id}: {e}", + exc_info=True, + ) + return metadata + + return metadata + + async def _get_document(self, doc_id: str) -> Optional[Union[Node, TextNode]]: + """Counting tokens can be slow, as can awaiting the docstore (potentially), so we keep a small lru_cache.""" + # first we need to get the document + try: + doc = await self.docstore.aget_document(doc_id) + except ValueError as e: + if "not found" in str(e): + logging.warning(f"Document {doc_id} not found in docstore") + return None + if not doc: + logging.warning(f"Document {doc_id} not found in docstore") + return None + if not is_text_node(doc): + logging.warning(f"Document {doc_id} is not an instance of (TextNode, Node)") + return None + + # then truncate if necessary. + if self.max_context_length is not None: + strategy = self.oversized_document_strategy + token_count = self._count_tokens(doc.get_content()) + if token_count > self.max_context_length: + message = ( + f"Document {doc.node_id} is too large ({token_count} tokens) " + f"to be processed. Doc metadata: {doc.metadata}" + ) + + if strategy == "warn": + logging.warning(message) + elif strategy == "error": + raise ValueError(message) + elif strategy == "ignore": + pass + else: + raise ValueError(f"Unknown oversized document strategy: {strategy}") + + return doc + + async def aextract(self, nodes: Sequence[BaseNode]) -> List[Dict]: + """ + Extract context for multiple nodes asynchronously, optimized for loosely ordered nodes. + Processes each node independently without guaranteeing sequential document handling. 
+ Nodes are *mostly* processed in document order, assuming they are passed in document order. + + Args: + nodes: List of nodes to process, ideally grouped by source document + + Returns: + List of metadata dictionaries with generated context + """ + metadata_list: List[Dict] = [] + for _ in nodes: + metadata_list.append({}) + metadata_map = { + node.node_id: metadata_dict + for metadata_dict, node in zip(metadata_list, nodes) + } + + # Sorting takes a tiny amount of time (~0.4s for 1_000_000 nodes), but 1_000_000 nodes can take hours to process. + # Since sorting CAN save users hundreds of dollars in API costs, we always sort and leave no option to do otherwise. + # The math always works out in the user's favor, and we can't guarantee things are sorted in the first place. + sorted_nodes = sorted( + nodes, key=lambda n: n.source_node.node_id if n.source_node else "" + ) + + # iterate over all the nodes and generate the jobs + node_tasks: List[Coroutine[Any, Any, Any]] = [] + for node in sorted_nodes: + if not (node.source_node and is_text_node(node)): + continue + + # Skip already processed nodes + if self.key in node.metadata: + continue + + doc: Optional[Union[Node, TextNode]] = await self._get_document( + node.source_node.node_id + ) + if not doc: + continue + + metadata = metadata_map[node.node_id] + # this modifies metadata in-place, adding a new key to the dictionary - we don't need to do anything with the return value + task = self._agenerate_node_context( + node, metadata, doc, self.prompt, self.key + ) + node_tasks.append(task) + + # then run the jobs - this does return the metadata list, but we already have it + await run_jobs( + node_tasks, + show_progress=self.show_progress, + workers=self.num_workers, + ) + + return metadata_list + + +if __name__ == "__main__": + print(DocumentContextExtractor.ORIGINAL_CONTEXT_PROMPT) diff --git a/llama-index-core/llama_index/core/schema.py b/llama-index-core/llama_index/core/schema.py index 9ef8b58fe6c59..1aaf8bc4906d2 100644 --- a/llama-index-core/llama_index/core/schema.py +++ b/llama-index-core/llama_index/core/schema.py @@ -1010,8 +1010,16 @@ def __init__(self, **data: Any) -> None: if "text" in data: text = data.pop("text") if "text_resource" in data: - msg = "'text' is deprecated and 'text_resource' will be used instead" - logging.warning(msg) + text_resource = ( + data["text_resource"] + if isinstance(data["text_resource"], MediaResource) + else MediaResource.model_validate(data["text_resource"]) + ) + if (text_resource.text or "").strip() != text.strip(): + msg = ( + "'text' is deprecated and 'text_resource' will be used instead" + ) + logging.warning(msg) + else: + data["text_resource"] = MediaResource(text=text) diff --git a/llama-index-core/tests/extractors/BUILD b/llama-index-core/tests/extractors/BUILD new file mode 100644 index 0000000000000..57341b1358b56 --- /dev/null +++ b/llama-index-core/tests/extractors/BUILD @@ -0,0 +1,3 @@ +python_tests( + name="tests", +) diff --git a/llama-index-core/tests/extractors/test_document_context_extractor.py b/llama-index-core/tests/extractors/test_document_context_extractor.py new file mode 100644 index 0000000000000..e7e63b1e1d506 --- /dev/null +++ b/llama-index-core/tests/extractors/test_document_context_extractor.py @@ -0,0 +1,171 @@ +import pytest + +from llama_index.core.extractors import DocumentContextExtractor +from llama_index.core.llms import ChatMessage, ChatResponse, MockLLM +from llama_index.core.schema import Document, NodeRelationship, TextNode +from 
llama_index.core.storage.docstore.simple_docstore import SimpleDocumentStore + + +@pytest.fixture() +def mock_llm(): + class CustomMockLLM(MockLLM): + def chat(self, messages, **kwargs): + return ChatResponse( + message=ChatMessage( + role="assistant", + blocks=[ + { + "text": f"Context for the provided chunk", + "block_type": "text", + } + ], + ) + ) + + return CustomMockLLM() + + +@pytest.fixture() +def sample_documents(): + return [ + Document( + text="This is chapter 1. It contains important information. This is a test document.", + metadata={"title": "Doc 1"}, + ), + Document( + text="Chapter 2 builds on previous concepts. It introduces new ideas. More test content here.", + metadata={"title": "Doc 2"}, + ), + ] + + +@pytest.fixture() +def create_text_nodes(): + def _create_nodes(document, texts): + doc_info = document.as_related_node_info() + return [ + TextNode( + text=text, + metadata={}, + relationships={NodeRelationship.SOURCE: doc_info}, + ) + for text in texts + ] + + return _create_nodes + + +@pytest.fixture() +def docstore(sample_documents): + docstore = SimpleDocumentStore() + for doc in sample_documents: + docstore.add_documents([doc]) + return docstore + + +@pytest.fixture() +def context_extractor(docstore, mock_llm): + return DocumentContextExtractor( + docstore=docstore, + llm=mock_llm, + max_context_length=1000, + max_output_tokens=100, + oversized_document_strategy="error", + ) + + +@pytest.mark.asyncio() +async def test_context_extraction_basic( + context_extractor, sample_documents, create_text_nodes +): + doc = sample_documents[0] + nodes = create_text_nodes( + doc, ["This is chapter 1.", "It contains important information."] + ) + + try: + metadata_list = await context_extractor.aextract(nodes) + print("METADATA LIST: ", metadata_list) + + if metadata_list is None: + raise ValueError("context_extractor.aextract() returned None") + + assert len(metadata_list) == len(nodes) + for metadata in metadata_list: + assert "context" in metadata + assert metadata["context"] == "Context for the provided chunk" + + except Exception as e: + print(f"Error during extraction: {e!s}") + raise + + +def test_invalid_oversized_strategy(): + with pytest.raises(ValueError): + DocumentContextExtractor( + docstore=SimpleDocumentStore(), + llm=MockLLM(), + max_context_length=1000, + max_output_tokens=100, + oversized_document_strategy="invalid_strategy", + ) + + +@pytest.mark.asyncio() +async def test_context_extraction_oversized_document(create_text_nodes): + large_doc = Document( + text="This is a very long document. 
" * 1000, metadata={"title": "Large Doc"} + ) + + docstore = SimpleDocumentStore() + docstore.add_documents([large_doc]) + + extractor = DocumentContextExtractor( + docstore=docstore, + llm=MockLLM(), + max_context_length=100, # Small limit to trigger error + max_output_tokens=50, + oversized_document_strategy="error", + ) + + nodes = create_text_nodes(large_doc, ["This is a test chunk."]) + + with pytest.raises(ValueError): + await extractor.aextract(nodes) + + +@pytest.mark.asyncio() +async def test_context_extraction_custom_prompt( + docstore, mock_llm, sample_documents, create_text_nodes +): + custom_prompt = "Generate a detailed context for this chunk:" + extractor = DocumentContextExtractor( + docstore=docstore, + llm=mock_llm, + prompt=DocumentContextExtractor.ORIGINAL_CONTEXT_PROMPT, + max_context_length=1000, + max_output_tokens=100, + ) + + nodes = create_text_nodes(sample_documents[0], ["Test chunk"]) + + metadata_list = await extractor.aextract(nodes) + assert len(metadata_list) == 1 + assert "context" in metadata_list[0] + + +@pytest.mark.asyncio() +async def test_multiple_documents_context( + context_extractor, sample_documents, create_text_nodes +): + # Create nodes from different documents + nodes = create_text_nodes( + sample_documents[0], ["This is chapter 1."] + ) + create_text_nodes( + sample_documents[1], ["Chapter 2 builds on previous concepts."] + ) + + metadata_list = await context_extractor.aextract(nodes) + assert len(metadata_list) == 2 + for metadata in metadata_list: + assert "context" in metadata diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py index 14e9722ad737f..2fa750589401e 100644 --- a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py @@ -267,7 +267,7 @@ def _get_query_embedding(self, query: str) -> List[float]: Returns: List[float]: numpy array of embeddings """ - return self._embed(query, prompt_name="query") + return self._embed([query], prompt_name="query")[0] async def _aget_query_embedding(self, query: str) -> List[float]: """ @@ -303,7 +303,7 @@ def _get_text_embedding(self, text: str) -> List[float]: Returns: List[float]: numpy array of embeddings """ - return self._embed(text, prompt_name="text") + return self._embed([text], prompt_name="text")[0] def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]: """ diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/pyproject.toml b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/pyproject.toml index 6d52a9b4e5dae..3142c011924b5 100644 --- a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/pyproject.toml +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/pyproject.toml @@ -28,7 +28,7 @@ exclude = ["**/BUILD"] license = "MIT" name = "llama-index-embeddings-huggingface" readme = "README.md" -version = "0.5.0" +version = "0.5.1" [tool.poetry.dependencies] python = ">=3.9,<4.0"