Skip to content

Commit

Permalink
update-index
Browse files Browse the repository at this point in the history
  • Loading branch information
raphaelsty committed Sep 9, 2024
1 parent 1ffea81 commit fc0dc6b
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 12 deletions.
86 changes: 75 additions & 11 deletions ducksearch/search/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ def _create_tables() -> None:
"""Create the necessary index tables in the DuckDB database."""


@execute_with_duckdb(
relative_path="search/drop/scores.sql",
)
def _drop_scores_to_recompute() -> None:
"""Delete the BM25 scores that have to be recomputed.

Bound to the ``search/drop/scores.sql`` query, which deletes every row of
the schema's ``scores`` table whose term is present in both the schema's
``dict`` table and the ``dict`` table of the matching
``fts_<schema>__documents`` schema.

NOTE(review): this function has no body of its own; the
``execute_with_duckdb`` decorator presumably runs the SQL file and accepts
the ``database``, ``schema`` and ``config`` keyword arguments that
``update_index`` passes in -- confirm against the decorator's definition.
"""


@execute_with_duckdb(
relative_path="search/select/settings_exists.sql",
fetch_df=True,
Expand Down Expand Up @@ -253,7 +260,12 @@ def update_index(
if os.path.exists("_stopwords.parquet"):
os.remove("_stopwords.parquet")

_create_settings(database=database, schema=bm25_schema, config=config)
_create_settings(
database=database,
schema=bm25_schema,
config=config,
)

_insert_settings(
database=database,
schema=bm25_schema,
Expand All @@ -267,7 +279,11 @@ def update_index(
config=config,
)

settings = _select_settings(database=database, schema=bm25_schema, config=config)[0]
settings = _select_settings(
database=database,
schema=bm25_schema,
config=config,
)[0]

if (
settings["k1"] != k1
Expand All @@ -286,15 +302,49 @@ def update_index(
_create_index(database=database, schema=bm25_schema, **settings, config=config)

logging.info("Updating index metadata.")
_update_dict(database=database, schema=bm25_schema, config=config)
_update_docs(database=database, schema=bm25_schema, config=config)
_update_stats(database=database, schema=bm25_schema, config=config)
_update_terms(database=database, schema=bm25_schema, config=config)
_update_dict(
database=database,
schema=bm25_schema,
config=config,
)

_update_docs(
database=database,
schema=bm25_schema,
config=config,
)

_update_stats(
database=database,
schema=bm25_schema,
config=config,
)

_update_terms(
database=database,
schema=bm25_schema,
config=config,
)

termids_to_score = _termids_to_score(
database=database, schema=bm25_schema, config=config, max_df=100_000
database=database,
schema=bm25_schema,
config=config,
max_df=100_000,
)

_drop_scores_to_recompute(
database=database,
schema=bm25_schema,
config=config,
)
stats = _stats(database=database, schema=bm25_schema, config=config)[0]

stats = _stats(
database=database,
schema=bm25_schema,
config=config,
)[0]

num_docs = stats["num_docs"]
avgdl = stats["avgdl"]

Expand All @@ -313,8 +363,18 @@ def update_index(
config=config,
)

_drop_schema(database=database, schema=bm25_schema, config=config)
_drop_documents(database=database, schema=bm25_schema, config=config)
_drop_schema(
database=database,
schema=bm25_schema,
config=config,
)

_drop_documents(
database=database,
schema=bm25_schema,
config=config,
)

_update_bm25id(
database=database,
schema=bm25_schema,
Expand Down Expand Up @@ -382,7 +442,11 @@ def update_index_documents(
"""
fields = ", ".join(
select_documents_columns(database=database, schema="bm25_tables", config=config)
select_documents_columns(
database=database,
schema="bm25_tables",
config=config,
)
)

update_index(
Expand Down
11 changes: 11 additions & 0 deletions ducksearch/search/drop/scores.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
-- Delete the stored scores of every term that appears in both the FTS
-- documents dictionary and the schema dictionary, so that those scores
-- can be recomputed afterwards.
DELETE FROM {schema}.scores s
WHERE s.term IN (
    SELECT d.term
    FROM {schema}.dict d
    INNER JOIN fts_{schema}__documents.dict fts
        ON fts.term = d.term
);
1 change: 0 additions & 1 deletion ducksearch/search/update/dict.sql
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,3 @@ UPDATE {schema}.dict d
SET df = d.df + nt.df
FROM new_terms nt
WHERE d.termid = nt.existing_id;

0 comments on commit fc0dc6b

Please sign in to comment.