diff --git a/docs/sphinx/source/query.ipynb b/docs/sphinx/source/query.ipynb index 263747cd..b8f4ece5 100644 --- a/docs/sphinx/source/query.ipynb +++ b/docs/sphinx/source/query.ipynb @@ -37,11 +37,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "!pip3 install pyvespa" + "#!pip3 install pyvespa" ] }, { @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -115,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -149,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -164,7 +164,7 @@ " 'cord_uid': 'xej338lo'}}]" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -182,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -191,7 +191,7 @@ "['xej338lo']" ] }, - "execution_count": 11, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -210,16 +210,16 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'timing': {'querytime': 0.005, 'summaryfetchtime': 0.0, 'searchtime': 0.007},\n", + "{'timing': {'querytime': 0.005, 'summaryfetchtime': 0.0, 'searchtime': 0.006},\n", " 'root': {'id': 'toplevel',\n", " 'relevance': 1.0,\n", - " 'fields': {'totalCount': 2390},\n", + " 'fields': {'totalCount': 2399},\n", " 'coverage': {'coverage': 100,\n", " 'documents': 976355,\n", " 'full': True,\n", @@ -235,7 +235,7 @@ " 'cord_uid': 'xej338lo'}}]}}" ] }, - "execution_count": 12, + "execution_count": 7, "metadata": {}, "output_type": 
"execute_result" } @@ -263,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -303,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -378,18 +378,18 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'force_compression': 0.5579209327697754,\n", - " 'auto': 0.7328271865844727,\n", - " 'no_compression': 0.45219922065734863}" + "{'force_compression': 0.6606230735778809,\n", + " 'auto': 0.48363542556762695,\n", + " 'no_compression': 0.8949570655822754}" ] }, - "execution_count": 47, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -422,7 +422,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -434,15 +434,15 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Total time: 1.73 seconds\n", - "QPS: 57.77\n" + "Total time: 2.66 seconds\n", + "QPS: 37.57\n" ] } ], @@ -492,7 +492,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -501,33 +501,33 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'timing': {'querytime': 0.003, 'summaryfetchtime': 0.0, 'searchtime': 0.004},\n", + "{'timing': {'querytime': 0.004, 'summaryfetchtime': 0.0, 'searchtime': 0.005},\n", " 'root': {'id': 'toplevel',\n", " 'relevance': 1.0,\n", - " 'fields': {'totalCount': 2444},\n", + " 'fields': {'totalCount': 2384},\n", " 'coverage': {'coverage': 100,\n", " 'documents': 976355,\n", " 'full': True,\n", " 'nodes': 2,\n", " 'results': 1,\n", " 'resultsFull': 1},\n", - " 'children': [{'id': 
'id:covid-19:doc::779001',\n", - " 'relevance': 27.517448178754492,\n", + " 'children': [{'id': 'id:covid-19:doc::534720',\n", + " 'relevance': 26.6769101612402,\n", " 'source': 'content',\n", - " 'fields': {'title': 'Cost utility analysis of Remdesivir and Dexamethasone treatment for hospitalised COVID-19 patients - a hypothetical study',\n", - " 'abstract': ': Sars-Cov-2 is a novel corona virus associated with significant morbidity and mortality. Remdesivir and Dexamethasone are two treatments that have shown to be effective against the Sars-Cov-2 associated disease. However, a cost-effectiveness analysis of the two treatments is still lacking. OBJECTIVE: The cost-utility of Remdesivir, Dexamethasone and a simultaneous use of the two drugs with respect to standard of care for treatment Covid-19 hospitalized patients is evaluated, together with the effect',\n", - " 'documentid': 'id:covid-19:doc::779001',\n", - " 'cord_uid': 'ysml5abq'}}]},\n", - " 'time': 1.4157278537750244}" + " 'fields': {'title': 'A Review on Remdesivir: A Possible Promising Agent for the Treatment of COVID-19',\n", + " 'abstract': 'manufacturing of specific therapeutics and vaccines to treat COVID-19 are time-consuming processes. At this time, using available conventional therapeutics along with other treatment options may be useful to fight COVID-19. In different clinical trials, efficacy of remdesivir (GS-5734) against Ebola virus has been demonstrated. Moreover, remdesivir may be an effective therapy in vitro and in animal models infected by SARS and MERS coronaviruses. Hence, the drug may be theoretically effective against SARS-CoV-2. 
Remdesivir',\n", + " 'documentid': 'id:covid-19:doc::534720',\n", + " 'cord_uid': 'xej338lo'}}]},\n", + " 'time': 0.5464229583740234}" ] }, - "execution_count": 49, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -538,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -573,10 +573,10 @@ " \n", " 0\n", " 1\n", + " 0.005\n", " 0.004\n", - " 0.003\n", " 0.000\n", - " 1.415728\n", + " 0.546423\n", " \n", " \n", " 1\n", @@ -584,31 +584,31 @@ " 0.005\n", " 0.004\n", " 0.000\n", - " 1.067308\n", + " 0.631488\n", " \n", " \n", " 2\n", " 3\n", - " 0.009\n", + " 0.010\n", " 0.007\n", - " 0.001\n", - " 1.415624\n", + " 0.002\n", + " 2.335938\n", " \n", " \n", " 3\n", " 4\n", - " 0.011\n", - " 0.010\n", + " 0.007\n", + " 0.006\n", " 0.000\n", - " 1.069153\n", + " 2.335876\n", " \n", " \n", " 4\n", " 5\n", - " 0.010\n", - " 0.008\n", + " 0.007\n", + " 0.005\n", " 0.001\n", - " 1.505080\n", + " 2.167304\n", " \n", " \n", " ...\n", @@ -621,42 +621,42 @@ " \n", " 95\n", " 96\n", - " 0.033\n", - " 0.012\n", - " 0.020\n", - " 1.659568\n", + " 0.015\n", + " 0.005\n", + " 0.010\n", + " 2.599611\n", " \n", " \n", " 96\n", " 97\n", - " 0.043\n", - " 0.020\n", - " 0.021\n", - " 1.599375\n", + " 0.044\n", + " 0.011\n", + " 0.031\n", + " 2.598901\n", " \n", " \n", " 97\n", " 98\n", - " 0.017\n", - " 0.005\n", - " 0.011\n", - " 1.621481\n", + " 0.023\n", + " 0.007\n", + " 0.015\n", + " 2.602370\n", " \n", " \n", " 98\n", " 99\n", - " 0.023\n", - " 0.011\n", - " 0.011\n", - " 1.615766\n", + " 0.054\n", + " 0.026\n", + " 0.025\n", + " 2.603086\n", " \n", " \n", " 99\n", " 100\n", - " 0.050\n", - " 0.025\n", - " 0.025\n", - " 1.602700\n", + " 0.031\n", + " 0.015\n", + " 0.015\n", + " 2.603938\n", " \n", " \n", "\n", @@ -665,22 +665,22 @@ ], "text/plain": [ " hits search_time query_time summary_time total_time\n", - "0 1 0.004 0.003 0.000 1.415728\n", - "1 2 0.005 0.004 0.000 1.067308\n", - "2 3 
0.009 0.007 0.001 1.415624\n", - "3 4 0.011 0.010 0.000 1.069153\n", - "4 5 0.010 0.008 0.001 1.505080\n", + "0 1 0.005 0.004 0.000 0.546423\n", + "1 2 0.005 0.004 0.000 0.631488\n", + "2 3 0.010 0.007 0.002 2.335938\n", + "3 4 0.007 0.006 0.000 2.335876\n", + "4 5 0.007 0.005 0.001 2.167304\n", ".. ... ... ... ... ...\n", - "95 96 0.033 0.012 0.020 1.659568\n", - "96 97 0.043 0.020 0.021 1.599375\n", - "97 98 0.017 0.005 0.011 1.621481\n", - "98 99 0.023 0.011 0.011 1.615766\n", - "99 100 0.050 0.025 0.025 1.602700\n", + "95 96 0.015 0.005 0.010 2.599611\n", + "96 97 0.044 0.011 0.031 2.598901\n", + "97 98 0.023 0.007 0.015 2.602370\n", + "98 99 0.054 0.026 0.025 2.603086\n", + "99 100 0.031 0.015 0.015 2.603938\n", "\n", "[100 rows x 5 columns]" ] }, - "execution_count": 47, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -719,9 +719,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'code': 12, 'summary': 'Timed out', 'source': 'content', 'message': \"Error in execution of chain 'content': Chain timed out.\"}]\n" + ] + } + ], "source": [ "with app.syncio(connections=12) as session:\n", " try:\n", @@ -747,9 +755,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'code': 3, 'summary': 'Illegal query', 'source': 'content', 'message': 'No query'}]\n" + ] + } + ], "source": [ "with app.syncio(connections=12) as session:\n", " try:\n", @@ -760,30 +776,923 @@ " except VespaError as e:\n", " print(str(e))" ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.11.4 64-bit", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", 
- "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.19" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using the Querybuilder DSL API\n", + "\n", + "In pyvespa, we provide a Domain Specific Language (DSL) that allows you to build queries programmatically in the `vespa.querybuilder`-module. See [reference](https://pyvespa.readthedocs.io/en/latest/reference-api.html#vespa-querybuilder) for full details. There are also many examples in our tests:\n", + "\n", + "- https://github.com/vespa-engine/pyvespa/blob/master/tests/unit/test_grouping.py\n", + "- https://github.com/vespa-engine/pyvespa/blob/master/tests/unit/test_qb.py\n", + "- https://github.com/vespa-engine/pyvespa/blob/master/tests/integration/test_integration_grouping.py\n", + "- https://github.com/vespa-engine/pyvespa/blob/master/tests/integration/test_integration_queries.py\n", + "\n", + "This section demonstrates common query patterns using the querybuilder DSL. All features of the Vespa Query Language are supported by the querybuilder DSL.\n", + "\n", + "
\n", + " Using the Querybuilder DSL is completely optional, and you can always use the Vespa Query Language directly by passing the query as a string, which might be more convenient for simple queries.\n", + "
\n", + "\n", + "We will use our own [documentation search](https://search.vespa.ai/) app for the following examples.\n", + "For details of the app configuration, see the corresponding [github repository](https://github.com/vespa-cloud/vespa-documentation-search)." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "app = Vespa(url=\"https://api.search.vespa.ai\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 1 - matches, order by and limit\n", + "\n", + "We want to find the 10 documents with the most terms in the 'pyvespa'-namespace (the documentation search has a 'namespace'-field, which refers to the source of the documentation). Note that the documentation search operates on the 'paragraph'-schema, but for demo purposes, we will use the 'document'-schema." + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query: select title, path, term_count from doc where namespace matches \"pyvespa\" order by term_count desc limit 10\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pathtitleterm_count
0/examples/feed_performance.htmlFeeding performance74798
1/reference-api.htmlReference API18282
2/examples/simplified-retrieval-with-colpali-vl...Scaling ColPALI (VLM) Retrieval13943
3/examples/pdf-retrieval-with-ColQwen2-vlm_Vesp...PDF-Retrieval using ColQWen2 (ColPali) with Vespa12939
4/examples/colpali-document-retrieval-vision-la...Vespa 🤝 ColPali: Efficient Document Retrieval ...12666
5/examples/colpali-benchmark-vqa-vlm_Vespa-clou...ColPali Ranking Experiments on DocVQA11707
6/examples/multi-vector-indexing.htmlMulti-vector indexing with HNSW7907
7/examples/billion-scale-vector-search-with-coh...Billion-scale vector search with Cohere binary...6531
8/examples/visual_pdf_rag_with_vespa_colpali_cl...Visual PDF RAG with Vespa - ColPali demo appli...5666
9/examples/chat_with_your_pdfs_using_colbert_la...Chat with your pdfs with ColBERT, langchain, a...5628
\n", + "
" + ], + "text/plain": [ + " path \\\n", + "0 /examples/feed_performance.html \n", + "1 /reference-api.html \n", + "2 /examples/simplified-retrieval-with-colpali-vl... \n", + "3 /examples/pdf-retrieval-with-ColQwen2-vlm_Vesp... \n", + "4 /examples/colpali-document-retrieval-vision-la... \n", + "5 /examples/colpali-benchmark-vqa-vlm_Vespa-clou... \n", + "6 /examples/multi-vector-indexing.html \n", + "7 /examples/billion-scale-vector-search-with-coh... \n", + "8 /examples/visual_pdf_rag_with_vespa_colpali_cl... \n", + "9 /examples/chat_with_your_pdfs_using_colbert_la... \n", + "\n", + " title term_count \n", + "0 Feeding performance 74798 \n", + "1 Reference API 18282 \n", + "2 Scaling ColPALI (VLM) Retrieval 13943 \n", + "3 PDF-Retrieval using ColQWen2 (ColPali) with Vespa 12939 \n", + "4 Vespa 🤝 ColPali: Efficient Document Retrieval ... 12666 \n", + "5 ColPali Ranking Experiments on DocVQA 11707 \n", + "6 Multi-vector indexing with HNSW 7907 \n", + "7 Billion-scale vector search with Cohere binary... 6531 \n", + "8 Visual PDF RAG with Vespa - ColPali demo appli... 5666 \n", + "9 Chat with your pdfs with ColBERT, langchain, a... 
5628 " + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import vespa.querybuilder as qb\n", + "from vespa.querybuilder import QueryField\n", + "\n", + "namespace = QueryField(\"namespace\")\n", + "q = (\n", + " qb.select([\"title\", \"path\", \"term_count\"])\n", + " .from_(\"doc\")\n", + " .where(\n", + " namespace.matches(\"pyvespa\")\n", + " ) # matches is regex-match, see https://docs.vespa.ai/en/reference/query-language-reference.html#matches\n", + " .order_by(\"term_count\", ascending=False)\n", + " .set_limit(10)\n", + ")\n", + "print(f\"Query: {q}\")\n", + "resp = app.query(yql=q)\n", + "results = [hit[\"fields\"] for hit in resp.hits]\n", + "df = pd.DataFrame(results)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 2 - timestamp range, contains\n", + "\n", + "We want to find the documents where one of the indexed fields contains the query term `embedding`, that were last updated between Jan 1st 2024 and 1 hour ago, and have them ranked by the 'documentation' rank profile. See https://github.com/vespa-cloud/vespa-documentation-search/blob/main/src/main/application/schemas/doc.sd. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "From: 1704063600, To: 1736769106\n", + "Query: select title, last_updated, content from doc where namespace matches \"op.*\" and range(last_updated, 1704063600, 1736769106) and weakAnd(title contains \"embedding\", content contains \"embedding\", headers contains \"embedding\", path contains \"embedding\") limit 3\n" + ] + } + ], + "source": [ + "import vespa.querybuilder as qb\n", + "from vespa.querybuilder import QueryField\n", + "from datetime import datetime\n", + "\n", + "queryterm = \"embedding\"\n", + "\n", + "# We need to instantiate a QueryField for fields that we want to call methods on\n", + "last_updated = QueryField(\"last_updated\")\n", + "title = QueryField(\"title\")\n", + "headers = QueryField(\"headers\")\n", + "path = QueryField(\"path\")\n", + "namespace = QueryField(\"namespace\")\n", + "content = QueryField(\"content\")\n", + "\n", + "from_ts = int(datetime(2024, 1, 1).timestamp())\n", + "to_ts = int(datetime.now().timestamp()) - 60 * 60 # 1 hour ago\n", + "print(f\"From: {from_ts}, To: {to_ts}\")\n", + "q = (\n", + " qb.select(\n", + " [title, last_updated, content]\n", + " ) # Select takes either a list of QueryField or strings, (or '*' for all fields)\n", + " .from_(\"doc\")\n", + " .where(\n", + " namespace.matches(\"op.*\")\n", + " & last_updated.in_range(from_ts, to_ts) # could also use > and <\n", + " & qb.weakAnd(\n", + " title.contains(queryterm),\n", + " content.contains(queryterm),\n", + " headers.contains(queryterm),\n", + " path.contains(queryterm),\n", + " )\n", + " )\n", + " .set_limit(3)\n", + ")\n", + "print(f\"Query: {q}\")\n", + "resp = app.query(yql=q, ranking=\"documentation\")" + ] }, - "vscode": { - "interpreter": { - "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" - } + { + "cell_type": "code", + "execution_count": 84, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
content<sep />similar data by finding nearby points i...Reference configuration for <hi>embedders</hi>...<sep /> basic news search application - applic...
titleEmbeddingEmbedding ReferenceNews search and recommendation tutorial - embe...
last_updated173650542217365054221736505423
idindex:documentation/0/5d6e77ca20d4e8ee29716747index:documentation/1/a03c4aef22fcde916804d3d9index:documentation/1/ad44f35cbd7b8214f88963e3
relevance23.5390822.39227116.853568
sourcedocumentationdocumentationdocumentation
bm25(content)2.6335582.5975882.632806
bm25(headers)7.5725968.2073285.537104
bm25(keywords)0.00.00.0
bm25(path)3.9335643.2917653.043277
bm25(title)4.6996814.1477952.820191
fieldLength(content)3830.02031.03273.0
fieldLength(title)1.02.06.0
fieldMatch(content)0.9157660.8921130.915871
fieldMatch(content).matches1.01.01.0
fieldMatch(title)1.00.9338690.842758
query(contentWeight)1.01.01.0
query(headersWeight)1.01.01.0
query(pathWeight)1.01.01.0
query(titleWeight)2.02.02.0
\n", + "
" + ], + "text/plain": [ + " 0 \\\n", + "content similar data by finding nearby points i... \n", + "title Embedding \n", + "last_updated 1736505422 \n", + "id index:documentation/0/5d6e77ca20d4e8ee29716747 \n", + "relevance 23.53908 \n", + "source documentation \n", + "bm25(content) 2.633558 \n", + "bm25(headers) 7.572596 \n", + "bm25(keywords) 0.0 \n", + "bm25(path) 3.933564 \n", + "bm25(title) 4.699681 \n", + "fieldLength(content) 3830.0 \n", + "fieldLength(title) 1.0 \n", + "fieldMatch(content) 0.915766 \n", + "fieldMatch(content).matches 1.0 \n", + "fieldMatch(title) 1.0 \n", + "query(contentWeight) 1.0 \n", + "query(headersWeight) 1.0 \n", + "query(pathWeight) 1.0 \n", + "query(titleWeight) 2.0 \n", + "\n", + " 1 \\\n", + "content Reference configuration for embedders... \n", + "title Embedding Reference \n", + "last_updated 1736505422 \n", + "id index:documentation/1/a03c4aef22fcde916804d3d9 \n", + "relevance 22.392271 \n", + "source documentation \n", + "bm25(content) 2.597588 \n", + "bm25(headers) 8.207328 \n", + "bm25(keywords) 0.0 \n", + "bm25(path) 3.291765 \n", + "bm25(title) 4.147795 \n", + "fieldLength(content) 2031.0 \n", + "fieldLength(title) 2.0 \n", + "fieldMatch(content) 0.892113 \n", + "fieldMatch(content).matches 1.0 \n", + "fieldMatch(title) 0.933869 \n", + "query(contentWeight) 1.0 \n", + "query(headersWeight) 1.0 \n", + "query(pathWeight) 1.0 \n", + "query(titleWeight) 2.0 \n", + "\n", + " 2 \n", + "content basic news search application - applic... \n", + "title News search and recommendation tutorial - embe... 
\n", + "last_updated 1736505423 \n", + "id index:documentation/1/ad44f35cbd7b8214f88963e3 \n", + "relevance 16.853568 \n", + "source documentation \n", + "bm25(content) 2.632806 \n", + "bm25(headers) 5.537104 \n", + "bm25(keywords) 0.0 \n", + "bm25(path) 3.043277 \n", + "bm25(title) 2.820191 \n", + "fieldLength(content) 3273.0 \n", + "fieldLength(title) 6.0 \n", + "fieldMatch(content) 0.915871 \n", + "fieldMatch(content).matches 1.0 \n", + "fieldMatch(title) 0.842758 \n", + "query(contentWeight) 1.0 \n", + "query(headersWeight) 1.0 \n", + "query(pathWeight) 1.0 \n", + "query(titleWeight) 2.0 " + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame([hit[\"fields\"] | hit for hit in resp.hits])\n", + "df = pd.concat(\n", + " [\n", + " df.drop([\"matchfeatures\", \"fields\"], axis=1),\n", + " pd.json_normalize(df[\"matchfeatures\"]),\n", + " ],\n", + " axis=1,\n", + ")\n", + "df.T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 3 - Basic grouping\n", + "\n", + "Vespa supports grouping and aggregation of matches through the Vespa grouping language. For an introduction to grouping, see https://docs.vespa.ai/en/grouping.html. \n", + "\n", + "We will use the [purchase schema](https://github.com/vespa-cloud/vespa-documentation-search/blob/main/src/main/application/schemas/purchase.sd), which is also deployed in the documentation search app." + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query: select * from purchase where true limit 0 | all(group(customer) each(output(sum(price))))\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
valuesum(price)
0Brown20537
1Jones39816
2Smith19484
\n", + "
" + ], + "text/plain": [ + " value sum(price)\n", + "0 Brown 20537\n", + "1 Jones 39816\n", + "2 Smith 19484" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from vespa.querybuilder import Grouping as G\n", + "\n", + "grouping = G.all(\n", + " G.group(\"customer\"),\n", + " G.each(G.output(G.sum(\"price\"))),\n", + ")\n", + "q = qb.select(\"*\").from_(\"purchase\").where(True).set_limit(0).groupby(grouping)\n", + "print(f\"Query: {q}\")\n", + "resp = app.query(yql=q)\n", + "group = resp.hits[0][\"children\"][0][\"children\"]\n", + "# get value and sum(price) into a DataFrame\n", + "df = pd.DataFrame([hit[\"fields\"] | hit for hit in group])\n", + "df = df.loc[:, [\"value\", \"sum(price)\"]]\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 4 - Nested grouping\n", + "\n", + "Let's find out how much each customer has spent per day by grouping on customer, then date: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query: select * from purchase where true | all(group(customer) each(group(time.date(date)) each(output(sum(price)))))\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GroupIdDateSum(price)
0Brown2006-9-107540
1Brown2006-9-111597
2Brown2006-9-88000
3Brown2006-9-93400
4Jones2006-9-108900
5Jones2006-9-1120816
6Jones2006-9-88000
7Jones2006-9-92100
8Smith2006-9-106100
9Smith2006-9-112584
10Smith2006-9-61000
11Smith2006-9-73000
12Smith2006-9-96800
\n", + "
" + ], + "text/plain": [ + " GroupId Date Sum(price)\n", + "0 Brown 2006-9-10 7540\n", + "1 Brown 2006-9-11 1597\n", + "2 Brown 2006-9-8 8000\n", + "3 Brown 2006-9-9 3400\n", + "4 Jones 2006-9-10 8900\n", + "5 Jones 2006-9-11 20816\n", + "6 Jones 2006-9-8 8000\n", + "7 Jones 2006-9-9 2100\n", + "8 Smith 2006-9-10 6100\n", + "9 Smith 2006-9-11 2584\n", + "10 Smith 2006-9-6 1000\n", + "11 Smith 2006-9-7 3000\n", + "12 Smith 2006-9-9 6800" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from vespa.querybuilder import Grouping as G\n", + "\n", + "# First, we construct the grouping expression:\n", + "grouping = G.all(\n", + " G.group(\"customer\"),\n", + " G.each(\n", + " G.group(G.time_date(\"date\")),\n", + " G.each(\n", + " G.output(G.sum(\"price\")),\n", + " ),\n", + " ),\n", + ")\n", + "# Then, we construct the query:\n", + "q = qb.select(\"*\").from_(\"purchase\").where(True).groupby(grouping)\n", + "print(f\"Query: {q}\")\n", + "resp = app.query(yql=q)\n", + "group_data = resp.hits[0][\"children\"][0][\"children\"]\n", + "records = [\n", + " {\n", + " \"GroupId\": group[\"value\"],\n", + " \"Date\": date_entry[\"value\"],\n", + " \"Sum(price)\": date_entry[\"fields\"].get(\"sum(price)\", 0),\n", + " }\n", + " for group in group_data\n", + " for date_group in group.get(\"children\", [])\n", + " for date_entry in date_group.get(\"children\", [])\n", + "]\n", + "\n", + "# Create DataFrame\n", + "df = pd.DataFrame(records)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 5 - Grouping with expressions\n", + "\n", + " Instead of just grouping on some attribute value, the group clause may contain arbitrarily complex expressions - see [Grouping reference](https://pyvespa.readthedocs.io/en/latest/reference-api.html#vespa.querybuilder.Grouping) for exhaustive list. 
\n", + " \n", + " Examples:\n", + "\n", + "- Select the minimum or maximum of sub-expressions\n", + "- Addition, subtraction, multiplication, division, and even modulo of sub-expressions\n", + "- Bitwise operations on sub-expressions\n", + "- Concatenation of the results of sub-expressions\n", + "\n", + "Let's use some of these expressions to get the sum of the prices of purchases on a per-hour-of-day basis." + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query: select * from purchase where true | all(group(mod(div(date, mul(60, 60)),24)) order(-sum(price)) each(output(sum(price))))\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
valuesum(price)
01026181
1923524
2822367
3116765
471000
\n", + "
" + ], + "text/plain": [ + " value sum(price)\n", + "0 10 26181\n", + "1 9 23524\n", + "2 8 22367\n", + "3 11 6765\n", + "4 7 1000" + ] + }, + "execution_count": 139, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from vespa.querybuilder import Grouping as G\n", + "\n", + "grouping = G.all(\n", + " G.group(G.mod(G.div(\"date\", G.mul(60, 60)), 24)),\n", + " G.order(-G.sum(\"price\")),\n", + " G.each(G.output(G.sum(\"price\"))),\n", + ")\n", + "q = qb.select(\"*\").from_(\"purchase\").where(True).groupby(grouping)\n", + "print(f\"Query: {q}\")\n", + "resp = app.query(yql=q)\n", + "group_data = resp.hits[0][\"children\"][0][\"children\"]\n", + "df = pd.DataFrame([hit[\"fields\"] | hit for hit in group_data])\n", + "df = df.loc[:, [\"value\", \"sum(price)\"]]\n", + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" } }, "nbformat": 4,