-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8d8cef4
commit af848fc
Showing
18 changed files
with
360 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
-- Removes the document/query associations of every document whose id is
-- listed in the parquet file.
DELETE FROM {schema}.documents_queries AS _dq
USING parquet_scan('{parquet_file}') AS _df_documents
WHERE _dq.document_id = _df_documents.id;
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
-- Deletes from bm25_documents.scores every term that would be left without any
-- (docid, score) entry once the documents listed in the parquet file are removed.
WITH _doomed_docs AS (
    -- docids of indexed documents whose name matches an id in the parquet file.
    SELECT DISTINCT
        _d.docid
    FROM bm25_documents.docs AS _d
    INNER JOIN parquet_scan('{parquet_file}') AS _src
        ON _d.name = _src.id
),

_affected_terms AS (
    -- Terms occurring in at least one of the documents to delete.
    SELECT DISTINCT
        term
    FROM bm25_documents.terms AS _t
    INNER JOIN _doomed_docs AS _dd
        ON _t.docid = _dd.docid
    INNER JOIN bm25_documents.dict AS _dict
        ON _t.termid = _dict.termid
),

_postings AS (
    -- One row per list entry stored for the affected terms. Both lists are
    -- unnested side by side so the number of rows produced per term is the
    -- same as when each entry carries its score.
    SELECT
        _s.term,
        UNNEST(_s.list_scores) AS score,
        UNNEST(_s.list_docids) AS docid
    FROM bm25_documents.scores AS _s
    INNER JOIN _affected_terms AS _a
        ON _s.term = _a.term
),

_surviving_postings AS (
    -- Entries that do not belong to a document being deleted.
    SELECT
        _p.term,
        _p.docid,
        _p.score
    FROM _postings AS _p
    WHERE NOT EXISTS (
        SELECT 1
        FROM _doomed_docs AS _dd
        WHERE _dd.docid = _p.docid
    )
),

_emptied_terms AS (
    -- Affected terms with no surviving entry at all.
    SELECT DISTINCT
        _a.term
    FROM _affected_terms AS _a
    WHERE NOT EXISTS (
        SELECT 1
        FROM _surviving_postings AS _sp
        WHERE _sp.term = _a.term
    )
)

DELETE FROM bm25_documents.scores AS _scores
USING _emptied_terms AS _gone
WHERE _scores.term = _gone.term;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
-- Removes from bm25_documents.docs every document whose name matches an id
-- listed in the parquet file.
DELETE FROM bm25_documents.docs AS _docs
WHERE EXISTS (
    SELECT 1
    FROM parquet_scan('{parquet_file}') AS _df_documents
    WHERE _df_documents.id = _docs.name
);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
-- Rebuilds the posting lists (list_docids / list_scores) of every term that
-- occurs in a document listed in the parquet file, dropping the entries that
-- belong to those documents. A term left with no entry yields no row in
-- _list_scores, so this UPDATE does not touch its row in bm25_documents.scores.
WITH _docs_to_delete AS (
    -- docids of indexed documents whose name matches an id in the parquet file.
    SELECT DISTINCT
        bm25.docid
    FROM parquet_scan('{parquet_file}') AS p
    INNER JOIN bm25_documents.docs AS bm25
        ON p.id = bm25.name
),

_terms_to_recompute AS (
    -- Terms occurring in at least one of the documents to delete.
    SELECT DISTINCT
        term
    FROM bm25_documents.terms
    INNER JOIN _docs_to_delete
        ON bm25_documents.terms.docid = _docs_to_delete.docid
    INNER JOIN bm25_documents.dict
        ON bm25_documents.terms.termid = bm25_documents.dict.termid
),

_scores_to_update AS (
    -- Current posting lists of the terms to recompute.
    SELECT
        _bm25.term,
        _bm25.list_scores,
        _bm25.list_docids
    FROM bm25_documents.scores AS _bm25
    INNER JOIN _terms_to_recompute AS _terms
        ON _bm25.term = _terms.term
),

_unnested_scores AS (
    -- Flatten the two parallel lists into one (term, score, docid) row per entry.
    SELECT
        term,
        UNNEST(list_scores) AS score,
        UNNEST(list_docids) AS docid
    FROM _scores_to_update
),

_unnested_filtered_scores AS (
    -- Keep only the entries whose docid is not being deleted.
    SELECT
        _scores.term,
        _scores.docid,
        _scores.score
    FROM _unnested_scores AS _scores
    WHERE NOT EXISTS (
        SELECT 1
        FROM _docs_to_delete AS _docs
        WHERE _docs.docid = _scores.docid
    )
),

_list_scores AS (
    -- Re-aggregate per term; both lists share the same ordering so that
    -- position i of list_docids still pairs with position i of list_scores.
    SELECT
        term,
        LIST(docid ORDER BY score DESC, docid ASC) AS list_docids,
        LIST(score ORDER BY score DESC, docid ASC) AS list_scores
    FROM _unnested_filtered_scores
    GROUP BY term
)

UPDATE bm25_documents.scores AS s
SET
    list_docids = u.list_docids,
    list_scores = u.list_scores
FROM _list_scores AS u
WHERE s.term = u.term;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
-- Refreshes the corpus statistics from the current content of
-- bm25_documents.docs: the number of documents and the average of docs.len.
-- There is no WHERE clause: every row of bm25_documents.stats receives the
-- same recomputed values.
UPDATE bm25_documents.stats
SET
    num_docs = _stats.num_docs,
    avgdl = _stats.avgdl
FROM (
    SELECT
        COUNT(*) AS num_docs,
        AVG(len) AS avgdl
    FROM bm25_documents.docs
) AS _stats;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
-- Removes from bm25_documents.terms every row attached to a document whose
-- name matches an id listed in the parquet file.
DELETE FROM bm25_documents.terms AS _terms
WHERE _terms.docid IN (
    -- docids of the documents to delete; columns are qualified so the
    -- subquery cannot silently bind to a column of the outer table.
    SELECT
        bm25.docid
    FROM parquet_scan('{parquet_file}') AS p
    INNER JOIN bm25_documents.docs AS bm25
        ON p.id = bm25.name
);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.