
Commit

order-by-select
raphaelsty committed Oct 2, 2024
1 parent dc49ad0 commit a3883cc
Showing 5 changed files with 142 additions and 30 deletions.
69 changes: 43 additions & 26 deletions README.md
@@ -73,7 +73,7 @@ upload.documents(

## Search

`search.documents` returns a list of lists of documents ordered by relevance. We can control the number of documents to return using the `top_k` parameter. The following example demonstrates how to search for documents with the queries "punk" and "california" while filtering the results to include only documents dated 1970 or later with a popularity score greater than 8.
`search.documents` returns a list of lists of documents ordered by relevance. We can control the number of documents to return using the `top_k` parameter. The following example demonstrates how to search for documents with the queries "punk" and "california" while filtering the results to include only documents dated 1970 or later with a popularity score greater than 8. We will order the results by a weighted sum of the BM25 score and the popularity score provided in the document.

```python
from ducksearch import search
@@ -83,6 +83,7 @@
search.documents(
    queries=["punk", "california"],
    top_k=10,
    filters="YEAR(date) >= 1970 AND popularity > 8",
    order_by="0.8 * score + 0.2 * popularity DESC",
)
```
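
The weighted ordering above can be sketched in plain Python. This is an illustration with made-up documents and scores, not ducksearch itself:

```python
# Hypothetical documents carrying a BM25 score and a popularity value.
documents = [
    {"title": "a", "score": 1.2, "popularity": 9.1},
    {"title": "b", "score": 2.5, "popularity": 4.0},
    {"title": "c", "score": 0.8, "popularity": 9.8},
]

# Equivalent of ORDER BY 0.8 * score + 0.2 * popularity DESC.
ranked = sorted(
    documents,
    key=lambda d: 0.8 * d["score"] + 0.2 * d["popularity"],
    reverse=True,
)
# ranked: "b" (2.8), then "a" (2.78), then "c" (2.6)
```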

@@ -113,6 +114,8 @@ search.documents(

Filters are SQL expressions that are applied to the search results. We can use every filtering function DuckDB provides such as [date functions](https://duckdb.org/docs/sql/functions/date).
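
As a plain-Python illustration (hypothetical rows, not ducksearch), a filter such as `YEAR(date) >= 1970 AND popularity > 8` keeps only the rows for which the expression evaluates to true:

```python
from datetime import date

# Hypothetical documents.
documents = [
    {"title": "a", "date": date(1969, 5, 1), "popularity": 9},
    {"title": "b", "date": date(1977, 1, 20), "popularity": 10},
    {"title": "c", "date": date(1982, 3, 2), "popularity": 3},
]

# Python equivalent of the SQL filter "YEAR(date) >= 1970 AND popularity > 8".
kept = [d for d in documents if d["date"].year >= 1970 and d["popularity"] > 8]
# kept: only "b"
```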

Both `filters` and `order_by` parameters are optional. If not provided, the results are ordered by BM25 relevance and no filters are applied.

## Delete and update index

We can delete documents and update the BM25 weights accordingly using the `delete.documents` function.
@@ -132,62 +135,76 @@ To update the index, we should first delete the documents and then upload the updated documents.

### HuggingFace

The `upload.documents` function can also index HuggingFace datasets directly from their URL.
The following example demonstrates how to index the FineWeb dataset from HuggingFace:
The `upload.documents` function can also index HuggingFace datasets directly from their URL. The following example demonstrates how to index the FineWeb dataset from HuggingFace. We will use the fields "text" and "url" for search. We will also specify the data types for the "date", "token_count", and "language_score" fields to be able to filter the results.

```python
from ducksearch import upload

upload.documents(
    database="fineweb.duckdb",
    key="id",
    fields=["text", "url", "date", "language", "token_count", "language_score"],
    fields=["text", "url"],
    documents="https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/sample/10BT/000_00000.parquet",
    dtypes={
        "date": "DATE",
        "token_count": "INT",
        "language_score": "FLOAT",
    },
    limit=1000,  # demonstrate with a small dataset
    limit=3000,  # demonstrate with a small dataset
)
```

We can then search the FineWeb dataset with the `search.documents` function:
We can then search the FineWeb dataset with the `search.documents` function. We order the results by BM25 score and then date.

```python
from ducksearch import search

search.documents(
    database="fineweb.duckdb",
    queries="earth science",
    queries=["earth science"],
    top_k=2,
    order_by="score DESC, date DESC",
)
```

```python
[
    {
        "id": "<urn:uuid:1e6ae53b-e0d7-431b-8d46-290244e597e9>",
        "text": "Earth Science Tutors in Rowland ...",
        "date": Timestamp("2017-08-19 00:00:00"),
        "language": "en",
        "token_count": 313,
        "language_score": 0.8718525171279907,
        "score": 1.1588547229766846,
    },
    {
        "score": 1.6727683544158936,
        "id": "<urn:uuid:c732ce90-2fbf-41ad-8916-345f6c08e452>",
        "text": "The existing atmosphere surrounding the earth contains ...",
        "url": "http://www.accuracyingenesis.com/atmargon.html",
        "date": Timestamp("2015-04-02 00:00:00"),
        "language": "en",
        "token_count": 1348,
        "language_score": 0.9564403295516968,
    },
[
    [
        {
            "id": "<urn:uuid:1e6ae53b-e0d7-431b-8d46-290244e597e9>",
            "text": "Earth Science Tutors in Rowland...",
            "id_1": "<urn:uuid:1e6ae53b-e0d7-431b-8d46-290244e597e9>",
            "dump": "CC-MAIN-2017-34",
            "url": "http://rowland.universitytutor.com/rowland_earth-science-tutoring",
            "date": Timestamp("2017-08-19 00:00:00"),
            "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2017-34/segments/1502886105304.35/warc/CC-MAIN-20170819051034-20170819071034-00240.warc.gz",
            "language": "en",
            "language_score": 0.8718525171279907,
            "token_count": 313,
            "bm25id": 523,
            "score": 2.3761106729507446,
        },
        {
            "id": "<urn:uuid:cd94a04f-1632-4c8b-81d2-cb353163116e>",
            "text": "- Geomagnetic field....",
            "id_1": "<urn:uuid:cd94a04f-1632-4c8b-81d2-cb353163116e>",
            "dump": "CC-MAIN-2022-21",
            "url": "https://www.imperial.ac.uk/people/adrian.muxworthy/?respub-action=citation.html&id=1149861&noscript=noscript",
            "date": Timestamp("2022-05-20 00:00:00"),
            "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2022-21/segments/1652662530553.34/warc/CC-MAIN-20220519235259-20220520025259-00601.warc.gz",
            "language": "en",
            "language_score": 0.8225595951080322,
            "token_count": 517,
            "bm25id": 4783,
            "score": 2.3569871187210083,
        },
    ]
]

```
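
The two-key ordering used above, `score DESC, date DESC`, corresponds to sorting on a tuple of keys: documents are ranked by score, and ties are broken by the most recent date. A plain-Python sketch with hypothetical rows:

```python
from datetime import date

docs = [
    {"id": 1, "score": 2.0, "date": date(2017, 8, 19)},
    {"id": 2, "score": 2.0, "date": date(2022, 5, 20)},
    {"id": 3, "score": 3.0, "date": date(2015, 4, 2)},
]

# Equivalent of ORDER BY score DESC, date DESC: sort on the (score, date)
# tuple in reverse so higher scores come first and ties favor recent dates.
ranked = sorted(docs, key=lambda d: (d["score"], d["date"]), reverse=True)
# ranked ids: 3, 2, 1
```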

Note: by default, results are ordered by BM25 relevance.

## Tables

Ducksearch creates two distinct schemas: `bm25_tables` and `bm25_documents`.
23 changes: 22 additions & 1 deletion ducksearch/search/select.py
@@ -42,6 +42,15 @@ def _search_query():
    """Perform a search on the documents or queries table in DuckDB."""


@execute_with_duckdb(
    relative_path="search/select/search_order_by.sql",
    read_only=True,
    fetch_df=True,
)
def _search_query_order_by():
    """Perform a search on the documents or queries table in DuckDB."""


@execute_with_duckdb(
    relative_path="search/select/search_filters.sql",
    read_only=True,
@@ -60,6 +69,7 @@ def documents(
    n_jobs: int = -1,
    config: dict | None = None,
    filters: str | None = None,
    order_by: str | None = None,
    tqdm_bar: bool = True,
) -> list[list[dict]]:
    """Search for documents in the documents table using specified queries.
@@ -118,6 +128,7 @@
        top_k_token=top_k_token,
        n_jobs=n_jobs,
        filters=filters,
        order_by=order_by,
        tqdm_bar=tqdm_bar,
    )

@@ -198,6 +209,7 @@ def _search(
    random_hash: str,
    config: dict | None = None,
    filters: str | None = None,
    order_by: str | None = None,
) -> list:
    """Perform a search on the specified source table (documents or queries).
@@ -231,6 +243,11 @@
    """
    search_function = _search_query_filters if filters is not None else _search_query

    if filters is None and order_by is not None:
        search_function = _search_query_order_by

    order_by = f"ORDER BY {order_by}" if order_by is not None else "ORDER BY score DESC"

    matchs = search_function(
        database=database,
        schema=schema,
@@ -242,6 +259,7 @@
        group_id=group_id,
        filters=filters,
        config=config,
        order_by=order_by,
    )

    candidates = collections.defaultdict(list)
@@ -265,6 +283,7 @@ def search(
    n_jobs: int = -1,
    config: dict | None = None,
    filters: str | None = None,
    order_by: str | None = None,
    tqdm_bar: bool = True,
) -> list[list[dict]]:
    """Run the search for documents or queries in parallel.
@@ -397,6 +416,7 @@ def search(
                random_hash=random_hash,
                config=config,
                filters=filters,
                order_by=order_by,
            )
        )
    if tqdm_bar:
@@ -420,7 +440,8 @@
                group_id,
                random_hash,
                config,
                filters=filters,
                filters,
                order_by,
            )
            for group_id, batch_queries in batchs.items()
        ):
5 changes: 3 additions & 2 deletions ducksearch/search/select/search_filters.sql
@@ -63,11 +63,12 @@ _partition_scores AS (
        _query,
        _score AS score,
        * EXCLUDE (_score, _query),
        RANK() OVER (PARTITION BY _query ORDER BY _score DESC) AS _row_number
        RANK() OVER (PARTITION BY _query {order_by}, RANDOM() ASC) AS _row_number
    FROM _filtered_scores
    QUALIFY _row_number <= {top_k}
)

SELECT
    * EXCLUDE (_row_number)
FROM _partition_scores;
FROM _partition_scores
{order_by};
74 changes: 74 additions & 0 deletions ducksearch/search/select/search_order_by.sql
@@ -0,0 +1,74 @@
WITH group_queries AS (
    SELECT
        query
    FROM {schema}._queries_{random_hash}
    WHERE group_id = {group_id}
),

_input_queries AS (
    SELECT
        pf.query,
        ftsdict.term
    FROM group_queries pf
    JOIN fts_{schema}__queries_{random_hash}.docs docs
        ON pf.query = docs.name
    JOIN fts_{schema}__queries_{random_hash}.terms terms
        ON docs.docid = terms.docid
    JOIN fts_{schema}__queries_{random_hash}.dict ftsdict
        ON terms.termid = ftsdict.termid
),

_nested_matchs AS (
    SELECT
        iq.query,
        s.list_docids[0:{top_k_token}] AS list_docids,
        s.list_scores[0:{top_k_token}] AS list_scores
    FROM {schema}.scores s
    INNER JOIN _input_queries iq
        ON s.term = iq.term
),

_matchs AS (
    SELECT
        query,
        UNNEST(
            s.list_docids
        ) AS bm25id,
        UNNEST(
            s.list_scores
        ) AS score
    FROM _nested_matchs s
),

_matchs_scores AS (
    SELECT
        query,
        bm25id,
        SUM(score) AS score
    FROM _matchs
    GROUP BY 1, 2
),

_match_scores_documents AS (
    SELECT
        ms.query AS _query,
        ms.bm25id,
        ms.score,
        s.*
    FROM _matchs_scores ms
    INNER JOIN {source_schema}.{source} s
        ON ms.bm25id = s.bm25id
),

_partition_scores AS (
    SELECT
        *,
        RANK() OVER (PARTITION BY _query {order_by}, RANDOM() ASC) AS rank
    FROM _match_scores_documents
    QUALIFY rank <= {top_k}
)

SELECT
    *
FROM _partition_scores
{order_by};
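
The `_partition_scores` step ranks documents within each query partition and keeps the top `top_k`. A plain-Python sketch of that behavior, assuming the ordering is `score DESC` and using hypothetical rows:

```python
from collections import defaultdict

rows = [
    {"query": "q1", "title": "a", "score": 2.0},
    {"query": "q1", "title": "b", "score": 3.0},
    {"query": "q1", "title": "c", "score": 1.0},
    {"query": "q2", "title": "d", "score": 5.0},
]
top_k = 2

# PARTITION BY _query: group the rows per query.
by_query = defaultdict(list)
for row in rows:
    by_query[row["query"]].append(row)

# RANK() ... ORDER BY score DESC, then QUALIFY rank <= top_k.
results = {
    query: sorted(group, key=lambda r: r["score"], reverse=True)[:top_k]
    for query, group in by_query.items()
}
# results["q1"] keeps "b" then "a"; "c" is cut by top_k.
```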
1 change: 0 additions & 1 deletion ducksearch/upload/upload.py
@@ -154,7 +154,6 @@ def documents(
        config=config,
    )

    fields = [field for field in fields if field != "id"]
    update_index_documents(
        database=database,
        fields=fields,
