Skip to content

Commit

Permalink
temporal data summation final script changes
Browse files Browse the repository at this point in the history
  • Loading branch information
larrybabb committed Jan 17, 2025
1 parent 7d72d1b commit 8c9a271
Show file tree
Hide file tree
Showing 12 changed files with 520 additions and 1,250 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
CREATE OR REPLACE PROCEDURE `clinvar_ingest.clinvar_var_scv_change`()

-- this procedure creates a table that contains a record for each variation with a start-end range
-- that represents every time the scvs associated with the variation change in some way.
-- "change" is defined by the uniqueness of records in the clinvar_scvs temporal collection table.

-- scvs can change for obvious reasons (e.g. new submissions, updated submissions, deleted submissions),
-- but also for less obvious reasons (e.g. internal trait-set-id change or reassigned to a new rcv_accession_id...).
CREATE OR REPLACE PROCEDURE `clinvar_ingest.clinvar_sum_variation_scv_change`()
BEGIN

CREATE TEMP TABLE _SESSION.release_start_vsc
Expand All @@ -18,15 +25,12 @@ BEGIN
FROM `clinvar_ingest.clinvar_scvs` vs
UNION DISTINCT
SELECT
MIN(r.release_date) as start_release_date,
r.next_release_date as start_release_date,
vs.variation_id
FROM `clinvar_ingest.clinvar_scvs` vs
JOIN `clinvar_ingest.clinvar_releases` r
JOIN `clinvar_ingest.all_schemas`() r
ON
r.release_date > vs.end_release_date
GROUP BY
vs.end_release_date,
vs.variation_id
r.release_date = vs.end_release_date
) st;

CREATE TEMP TABLE _SESSION.release_end_vsc
Expand All @@ -46,18 +50,15 @@ BEGIN
FROM `clinvar_ingest.clinvar_scvs` vs
UNION DISTINCT
SELECT
MAX(r.release_date) as end_release_date,
r.prev_release_date as end_release_date,
vs.variation_id
FROM `clinvar_ingest.clinvar_scvs` vs
JOIN `clinvar_ingest.clinvar_releases` r
JOIN `clinvar_ingest.all_schemas`() r
ON
r.release_date < vs.start_release_date
GROUP BY
vs.start_release_date,
vs.variation_id
r.release_date = vs.start_release_date
) en;

CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_var_scv_change`
CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_sum_variation_scv_change`
AS
SELECT
e.variation_id,
Expand Down
103 changes: 103 additions & 0 deletions scripts/temporal-data-summation/02-clinvar-sum-vsp-rank-group-proc.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
CREATE OR REPLACE PROCEDURE `clinvar_ingest.clinvar_sum_vsp_rank_group`()
BEGIN

EXECUTE IMMEDIATE FORMAT("""
-- create a grouping of scvs based on the var/rank/prop-type/stmt-type to produce the
-- array of counts & percentages of the 3 major significance categories of scvs within that group
-- (do not introduce clinical_impact_clinical_significance here)
CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_sum_vsp_rank_group`
AS
WITH x AS
(
-- this gets us the count of scvs for a given rank and clinical significance
-- within a statement_type and gks_proposition_type. It does this for
-- each variation_id change for a given scv
SELECT
vs.variation_id,
vsc.start_release_date,
vsc.end_release_date,
vs.statement_type,
vs.gks_proposition_type,
vs.rank,
vs.clinsig_type,
vs.classif_type,
(vs.classif_type||'('||COUNT(DISTINCT vs.id)||')') AS classif_type_w_count
FROM `clinvar_ingest.clinvar_scvs` vs
JOIN `clinvar_ingest.clinvar_sum_variation_scv_change` vsc
ON
vs.variation_id = vsc.variation_id
AND
vs.start_release_date <= vsc.end_release_date
AND
vs.end_release_date >= vsc.start_release_date
GROUP BY
vs.variation_id,
vsc.start_release_date,
vsc.end_release_date,
vs.statement_type,
vs.gks_proposition_type,
vs.rank,
vs.classif_type,
vs.clinsig_type
)
SELECT
x.start_release_date,
x.end_release_date,
x.variation_id,
x.statement_type,
x.gks_proposition_type,
x.rank,
COUNT(DISTINCT vs.clinsig_type) as unique_clinsig_type_count,
SUM(DISTINCT IF(vs.clinsig_type=2,4,IF(vs.clinsig_type=1,2,1))) as agg_sig_type,
`clinvar_ingest.createSigType`(
COUNT(DISTINCT IF(vs.clinsig_type = 0, vs.submitter_id, NULL)),
COUNT(DISTINCT IF(vs.clinsig_type = 1, vs.submitter_id, NULL)),
COUNT(DISTINCT IF(vs.clinsig_type = 2, vs.submitter_id, NULL))
) as sig_type,
MAX(vs.last_evaluated) as max_last_evaluated,
MAX(vs.submission_date) as max_submission_date,
COUNT(DISTINCT vs.id) as submission_count,
COUNT(DISTINCT vs.submitter_id) as submitter_count,
STRING_AGG(DISTINCT x.classif_type, '/' ORDER BY x.classif_type) AS agg_classif,
STRING_AGG(DISTINCT x.classif_type_w_count, '/' ORDER BY x.classif_type_w_count) AS agg_classif_w_count
FROM x
JOIN `clinvar_ingest.clinvar_scvs` vs
ON
vs.variation_id = x.variation_id
AND
vs.statement_type IS NOT DISTINCT FROM x.statement_type
AND
vs.gks_proposition_type IS NOT DISTINCT FROM x.gks_proposition_type
AND
vs.rank IS NOT DISTINCT FROM x.rank
AND
vs.start_release_date <= x.end_release_date
AND
vs.end_release_date >= x.start_release_date
GROUP BY
x.variation_id,
x.start_release_date,
x.end_release_date,
x.statement_type,
x.gks_proposition_type,
x.rank
""");

END;


-- find intersection between voi and voi_scv windows for the same variant to create the voi_group records
-- date window intersection is found by using the condition ((start_window1 <= end_window2) AND (end_window1 >= start_window2))
-- the start and end dates are always inclusive, meaning the start date is the date that the record is first available and
-- the end date is the date that the record is last available.
-- https://stackoverflow.com/questions/325933/determine-whether-two-date-ranges-overlap
-- (s1 <= eX) AND (e1 >= sX)

-- A s1--------------e1
-- |----|----|----|----|----|----|----|
-- B s2------e2 s1 <= e2 AND e1 >= s2. TRUE
-- C s3------------------------e3 s1 <= e3 AND e1 >= s3. TRUE
-- D s4------e4 s1 <= e4 AND e1 >= s4. TRUE
-- E s5----------e5 s1 <= e5 AND e1 >= s5. TRUE
-- F s6--e6 s1 <= e6 AND e1 >= s6. FALSE
-- G. s7--e7 s1 <= e7 AND e1 >= s7. FALSE
179 changes: 0 additions & 179 deletions scripts/temporal-data-summation/02-voi-vcv-scv-proc.sql

This file was deleted.

Loading

0 comments on commit 8c9a271

Please sign in to comment.