From 9cae79b55277b7507151185426ac6528cbf5228d Mon Sep 17 00:00:00 2001 From: Lawrence Babb Date: Tue, 17 Dec 2024 11:09:40 -0500 Subject: [PATCH] updates to the variation_tracker procs --- scripts/tracker-procs/03-gc-report-proc.sql | 345 ----------------- .../01-report-variation-proc.sql | 2 +- .../02-tracker-reports-rebuild-proc.sql} | 111 ++++-- .../03-gc-tracker-report-proc.sql | 356 ++++++++++++++++++ .../initialize-tracker-tables.sql | 0 .../tracker-report-update-proc.sql | 18 + 6 files changed, 445 insertions(+), 387 deletions(-) delete mode 100644 scripts/tracker-procs/03-gc-report-proc.sql rename scripts/{tracker-procs => tracker-report-update}/01-report-variation-proc.sql (95%) rename scripts/{tracker-procs/02-variation-track-proc.sql => tracker-report-update/02-tracker-reports-rebuild-proc.sql} (88%) create mode 100644 scripts/tracker-report-update/03-gc-tracker-report-proc.sql rename scripts/{tracker-procs => tracker-report-update}/initialize-tracker-tables.sql (100%) create mode 100644 scripts/tracker-report-update/tracker-report-update-proc.sql diff --git a/scripts/tracker-procs/03-gc-report-proc.sql b/scripts/tracker-procs/03-gc-report-proc.sql deleted file mode 100644 index ee66733..0000000 --- a/scripts/tracker-procs/03-gc-report-proc.sql +++ /dev/null @@ -1,345 +0,0 @@ -CREATE OR REPLACE PROCEDURE `variation_tracker.gc_report_proc`(start_with DATE) -BEGIN - - FOR rec IN (select s.schema_name, s.release_date, s.prev_release_date, s.next_release_date FROM clinvar_ingest.schemas_on_or_after(start_with) as s) - DO - - -- vceps for current release - EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TEMP TABLE vcep - AS - select - scv.variation_id, - scv.submitter_id, - FORMAT("%%s.%%i", scv.id, scv.version) as scv_acxn, - rs.clinvar_name, - scv.classif_type, - scv.submitted_classification, - scv.last_evaluated - from `variation_tracker.report_submitter` rs - join `%s.scv_summary` scv - on - scv.submitter_id = rs.submitter_id - where - rs.type = "VCEP" - and - rs.submitter_id is not null - """, rec.schema_name); - - -- gc scv info for current release - EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TEMP TABLE gc_scv - AS - select - scv.submitter_id, - scv.variation_id, - gscv.id, - FORMAT("%%s.%%i", gscv.id, gscv.version) as scv_acxn, - IF(scv.local_key IS NULL, NULL, SPLIT(scv.local_key, "|")[0]) as local_key, - scv.local_key as local_key_orig, - scv.date_created as first_in_clinvar, - scv.classification_comment, - COUNT(IFNULL(gscv.lab_id,gscv.lab_name)) as case_count - from `%s.gc_scv` gscv - join `%s.scv_summary` scv - on - scv.id = gscv.id - where - -- these are the dupe gc submissions that are older - gscv.id not in ( - "SCV000607136","SCV000986740", - "SCV000986708","SCV000986786", - "SCV000986705","SCV000986788", - "SCV000986813","SCV000607109" - ) - group by - scv.submitter_id, - scv.variation_id, - gscv.id, - gscv.version, - scv.local_key, - scv.date_created, - scv.classification_comment - """, rec.schema_name, rec.schema_name); - - -- gc scv w/ agg info for current release - EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TEMP TABLE gc - AS - select - gc_scv.submitter_id, - gc_scv.variation_id, - g.hgnc_id, - g.symbol, - v.name, - cvcv.agg_classification, - cvcv.rank, - gc_scv.scv_acxn, - gc_scv.local_key, - gc_scv.case_count, - gc_scv.first_in_clinvar, - gc_scv.classification_comment - from gc_scv - join `%s.variation` v - on - v.id = gc_scv.variation_id - left join `%s.single_gene_variation` sgv - on - sgv.variation_id = gc_scv.variation_id - left join `%s.gene` g - on - sgv.gene_id = g.id - join `clinvar_ingest.clinvar_vcvs` cvcv - on - cvcv.variation_id = gc_scv.variation_id - and - %T between cvcv.start_release_date and cvcv.end_release_date - """, rec.schema_name, rec.schema_name, rec.schema_name, rec.release_date); - - -- gc case info for current release - EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TEMP TABLE gc_case - AS - select - gscv.variation_id, - gc_scv.scv_acxn, - gc_scv.local_key, - gc_scv.local_key_orig, - gscv.lab_name, - gscv.lab_id, - gscv.lab_classification, - gscv.lab_classif_type, - gscv.lab_date_reported, - gscv.sample_id, - IF(gscv.sample_id IS NULL, gc_scv.local_key, CONCAT(gc_scv.local_key, "|", gscv.sample_id)) as case_report_key - from gc_scv - join `%s.gc_scv` gscv - on - gc_scv.id = gscv.id - """, rec.schema_name); - - -- gc case related lab info fo current release - EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TEMP TABLE lab_case - AS - select - gc_case.variation_id, - gc_case.lab_id as submitter_id, - gc_case.case_report_key, - STRING_AGG(DISTINCT FORMAT("%%s.%%i", lab_scv.id, lab_scv.version)) as acxn, - STRING_AGG(DISTINCT lab_scv.classif_type ORDER BY lab_scv.classif_type) as classif_type, - STRING_AGG(DISTINCT lab_scv.submitted_classification ORDER BY lab_scv.submitted_classification) as classification, - MIN(lab_scv.last_evaluated) as last_evaluated, - MIN(lab_scv.date_created) as first_in_clinvar, - COUNT(DISTINCT lab_scv.id) as scv_count - from gc_case - left join `%s.scv_summary` lab_scv - on - lab_scv.submitter_id = gc_case.lab_id and - lab_scv.variation_id = gc_case.variation_id - group by - gc_case.lab_id, - gc_case.variation_id, - gc_case.case_report_key - """, rec.schema_name); - - -- gc var report - EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TEMP TABLE var - AS - WITH v AS - ( - select - gc_scv.variation_id, - COUNT(gc_scv.id) as gc_scv_count, - MIN(vcv.date_created) as first_in_clinvar - from gc_scv - join `%s.variation_archive` vcv - on - vcv.variation_id = gc_scv.variation_id - group by - gc_scv.variation_id - ) - -- variation data related to single GC submitter's submissions - select - v.variation_id, - v.first_in_clinvar, - COUNT(distinct sgrp.id) as scv_count, - v.gc_scv_count, - STRING_AGG(split( sgrp.scv_label, "%%")[0]||"%%", "\\n" ORDER BY sgrp.rank desc, sgrp.scv_group_type, sgrp.scv_label) as all_scvs - from v - join `clinvar_ingest.voi_scv_group` sgrp - on - sgrp.variation_id = v.variation_id and - %T between sgrp.start_release_date and sgrp.end_release_date - group by - v.variation_id, - v.first_in_clinvar, - v.gc_scv_count - """, rec.schema_name, rec.release_date); - - -- gc variation report (1 of 2) - first remove all gc_variation records for the release_date being processed - EXECUTE IMMEDIATE FORMAT(""" - DELETE FROM `variation_tracker.gc_variation` - WHERE report_date = %T - """, rec.release_date); - - -- gc variation report (2 of 2)- now insert the newly processed records for the current release_date - EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `variation_tracker.gc_variation` - ( - report_date, submitter_id, variation_id, hgnc_id, symbol, - name, agg_classification, rank, clinvar_name, submitted_classification, - classif_type, last_evaluated, scv_acxn, gc_scv_first_in_clinvar, local_key, - gc_case_count,all_scvs, variant_first_in_clinvar, novel_at_first_gc_submission, - novel_as_of_report_run_date, only_other_gc_submitters - ) - -- variant-centric output for single GC submitter - select - %T as report_date, - gc.submitter_id, - gc.variation_id, - gc.hgnc_id, - gc.symbol, - gc.name, - gc.agg_classification, - gc.rank, - vcep.clinvar_name, - vcep.submitted_classification, - vcep.classif_type, - vcep.last_evaluated, - gc.scv_acxn, - gc.first_in_clinvar as gc_scv_first_in_clinvar, - gc.local_key, - gc.case_count as gc_case_count, - var.all_scvs, - var.first_in_clinvar as variant_first_in_clinvar, - IF((var.first_in_clinvar = gc.first_in_clinvar), "Yes", "No") as novel_at_first_gc_submission, - IF((var.scv_count = 1), "Yes", "No") as novel_as_of_report_run_date, - IF((var.scv_count > 1 AND var.scv_count = var.gc_scv_count), "Yes", "No") as only_other_gc_submitters - from gc - left join vcep - on - vcep.variation_id = gc.variation_id - left join var - on - var.variation_id = gc.variation_id - -- ORDER BY 1, CAST(gc.variation_id as INT) - """, rec.release_date); - - -- gc case report (1 of 2) - first remove all gc_case records for the release_date being processed - EXECUTE IMMEDIATE FORMAT(""" - DELETE FROM `variation_tracker.gc_case` - WHERE report_date = %T - """, rec.release_date); - - -- gc case report (2 of 2)- now insert the newly processed records for the current release_date - EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `variation_tracker.gc_case` - ( - report_date,submitter_id,variation_id,gene_id,gene_symbol,variant_name, - ep_name,ep_classification, ep_classif_type,ep_last_evaluated_date, - case_report_lab_name,case_report_lab_id,case_report_lab_classification, - case_report_lab_classif_type,case_report_lab_date_reported, - gc_scv_acxn,gc_scv_first_in_clinvar,gc_scv_local_key,case_report_sample_id, - lab_scv_classification,lab_scv_classif_type,lab_scv_last_evaluated, - lab_scv_first_in_clinvar,lab_scv_before_gc_scv,lab_scv_in_clinvar_as_of_release, - ep_diff_alert,lab_diff_alert,classification_comment - ) - select - %T as report_date, - gc.submitter_id, - gc.variation_id, - gc.hgnc_id as gene_id, - gc.symbol as gene_symbol, - gc.name as variant_name, - vcep.clinvar_name as ep_name, - vcep.submitted_classification as ep_classification, - vcep.classif_type as ep_classif_type, - vcep.last_evaluated as ep_last_evaluated_date, - gc_case.lab_name as case_report_lab_name, - gc_case.lab_id as case_report_lab_id, - gc_case.lab_classification as case_report_lab_classification, - gc_case.lab_classif_type as case_report_lab_classif_type, - gc_case.lab_date_reported as case_report_lab_date_reported, - gc.scv_acxn as gc_scv_acxn, - gc.first_in_clinvar as gc_scv_first_in_clinvar, - gc.local_key as gc_scv_local_key, - gc_case.sample_id as case_report_sample_id, - -- classification - lab_case.classification as lab_scv_classification, - -- classification type - lab_case.classif_type as lab_scv_classif_type, - -- last eval'd - lab_case.last_evaluated as lab_scv_last_evaluated, - -- do not show lab_scv_first_in_clinvar unless the lab_scv_count is 1 - IF(lab_case.scv_count=1,lab_case.first_in_clinvar, null) as lab_scv_first_in_clinvar, - -- show error if more than 1 scv exists on variant for case report submitter - CASE lab_case.scv_count - WHEN 0 THEN - null - WHEN 1 THEN - IF(gc.first_in_clinvar <= lab_case.first_in_clinvar, "No", "Yes") - ELSE - "Error: multiple lab scvs." - END as lab_scv_before_gc_scv, - -- is lab_case.scv_count = 1 then the lab scv is submitted at time of clinvar release, error if more than one scv from lab in release - CASE lab_case.scv_count - WHEN 0 THEN - null - WHEN 1 THEN - "Yes" - ELSE - "Error: multiple lab scvs." - END as lab_scv_in_clinvar_as_of_release, - -- alert for VCEP diff, show null if no vcep scv or if VCEP classification exactly matches GC CASE report classification - CASE - WHEN vcep.classif_type IS NULL THEN - null - WHEN (IFNULL(gc_case.lab_classif_type,"n/a") <> vcep.classif_type) THEN - FORMAT("%%s vs %%s (%%s)", - UPPER(IFNULL(gc_case.lab_classif_type,"n/a")), - UPPER(vcep.classif_type), - IF(IFNULL(gc_case.lab_date_reported,vcep.last_evaluated) is NULL, "?",IF(gc_case.lab_date_reported > vcep.last_evaluated, "<",">")) - ) - ELSE - null - END as ep_diff_alert, - -- alert for LAB diff, show null if no vcep scv or if LAB classification exactly matches GC CASE report classification - -- show error if more than 1 scv exists on variant for case report submitter - CASE - WHEN lab_case.scv_count=1 AND (IFNULL(gc_case.lab_classif_type,"n/a") <> lab_case.classif_type) THEN - FORMAT("%%s vs %%s (%%s)", - UPPER(IFNULL(gc_case.lab_classif_type,"n/a")), - UPPER(lab_case.classif_type), - IF(IFNULL(gc_case.lab_date_reported,vcep.last_evaluated) is NULL, "?",IF(gc_case.lab_date_reported > lab_case.last_evaluated, "<",">")) - ) - WHEN lab_case.scv_count > 1 THEN - -- error - "Error: multiple lab scvs." - ELSE - -- lab_case count = 0 OR gc_case and lab_case classifications match so do nothing - null - END as lab_diff_alert, - gc.classification_comment - from gc - left join vcep - on - vcep.variation_id = gc.variation_id - left join gc_case - on - gc.scv_acxn = gc_case.scv_acxn - left join lab_case - on - lab_case.variation_id = gc_case.variation_id and - lab_case.case_report_key = gc_case.case_report_key - """, rec.release_date); - --- -- gc alerts? (TODO) - - - END FOR; - -END; - --- CALL `clinvar_ingest.gc_report_proc`(DATE'2023-01-01'); diff --git a/scripts/tracker-procs/01-report-variation-proc.sql b/scripts/tracker-report-update/01-report-variation-proc.sql similarity index 95% rename from scripts/tracker-procs/01-report-variation-proc.sql rename to scripts/tracker-report-update/01-report-variation-proc.sql index f2a857a..0f751e6 100644 --- a/scripts/tracker-procs/01-report-variation-proc.sql +++ b/scripts/tracker-report-update/01-report-variation-proc.sql @@ -1,5 +1,5 @@ -- report_variations (run when you want to update the variants of inteterest driven by the reporting tables for vceps, etc...) -CREATE OR REPLACE PROCEDURE `variation_tracker.report_variation_proc`() +CREATE OR REPLACE PROCEDURE `variation_tracker.report_variation`() BEGIN CREATE OR REPLACE TABLE `variation_tracker.report_variation` diff --git a/scripts/tracker-procs/02-variation-track-proc.sql b/scripts/tracker-report-update/02-tracker-reports-rebuild-proc.sql similarity index 88% rename from scripts/tracker-procs/02-variation-track-proc.sql rename to scripts/tracker-report-update/02-tracker-reports-rebuild-proc.sql index 018b200..5335b4d 100644 --- a/scripts/tracker-procs/02-variation-track-proc.sql +++ b/scripts/tracker-report-update/02-tracker-reports-rebuild-proc.sql @@ -1,10 +1,13 @@ -CREATE OR REPLACE PROCEDURE `variation_tracker.variation_track_proc`() +CREATE OR REPLACE PROCEDURE `variation_tracker.tracker_reports_rebuild`() BEGIN DECLARE disable_out_of_date_alerts BOOLEAN DEFAULT FALSE; FOR rec IN ( select - r.id, r.name, r.abbrev, lower(format("%s_%s", r.id, r.abbrev)) as tname, + r.id, + r.name, + r.abbrev, + lower(format("%s_%s", r.id, r.abbrev)) as tname, ARRAY_AGG( STRUCT(ro.name, ro.value) ) as opts from `variation_tracker.report` r join `variation_tracker.report_submitter` rs @@ -14,31 +17,49 @@ BEGIN on ro.report_id = r.id group by - r.id, r.name, r.abbrev + r.id, + r.name, + r.abbrev ) DO - SET disable_out_of_date_alerts = (SELECT CAST(IFNULL((SELECT opt.value FROM UNNEST(rec.opts) as opt WHERE opt.name = "DISABLE_OUT_OF_DATE_ALERTS"), "FALSE") AS BOOL)); + SET disable_out_of_date_alerts = + ( + SELECT + CAST( + IFNULL( + ( + SELECT + opt.value + FROM UNNEST(rec.opts) as opt + WHERE opt.name = "DISABLE_OUT_OF_DATE_ALERTS" + ), + "FALSE" + ) AS BOOL + ) + ); EXECUTE IMMEDIATE FORMAT(""" CREATE OR REPLACE TABLE `variation_tracker.%s_variation` AS - select + SELECT rv.report_id, cv.release_date as report_release_date, rv.variation_id, vg.rpt_stmt_type, vg.rank, FALSE as report_submitter_variation - - from `variation_tracker.report_variation` rv - join `clinvar_ingest.voi_group` vg on vg.variation_id = rv.variation_id - join `clinvar_ingest.all_schemas`() cv on + FROM `variation_tracker.report_variation` rv + JOIN `clinvar_ingest.voi_group` vg + ON + vg.variation_id = rv.variation_id + JOIN `clinvar_ingest.all_schemas`() cv + ON cv.release_date between vg.start_release_date and vg.end_release_date - where rv.report_id = "%s" + WHERE rv.report_id = "%s" """, rec.tname, rec.id); EXECUTE IMMEDIATE FORMAT(""" CREATE OR REPLACE TABLE `variation_tracker.%s_scv` AS - select + SELECT rv.report_id, rv.report_release_date, rv.variation_id, @@ -50,20 +71,23 @@ BEGIN DATE_DIFF(rv.report_release_date, vs.start_release_date, DAY) as released_age, DATE_DIFF(rv.report_release_date, vs.submission_date, DAY) as submission_age, (rs.submitter_id is not NULL) as report_submitter_submission, - from `variation_tracker.%s_variation` rv - join `clinvar_ingest.voi_scv_group` vsg on + FROM `variation_tracker.%s_variation` rv + JOIN `clinvar_ingest.voi_scv_group` vsg + ON vsg.variation_id = rv.variation_id AND vsg.rpt_stmt_type = rv.rpt_stmt_type AND vsg.rank = rv.rank AND rv.report_release_date between vsg.start_release_date AND vsg.end_release_date - join `clinvar_ingest.voi_scv` vs on + JOIN `clinvar_ingest.voi_scv` vs + ON vs.variation_id = vsg.variation_id AND vs.id = vsg.id AND vs.version = vsg.version AND vs.rpt_stmt_type = vsg.rpt_stmt_type AND vs.rank = vsg.rank AND rv.report_release_date between vs.start_release_date and vs.end_release_date - left join `variation_tracker.report_submitter` rs on + LEFT JOIN `variation_tracker.report_submitter` rs + ON rs.report_id = rv.report_id AND vs.submitter_id = rs.submitter_id """, rec.tname, rec.tname); @@ -110,18 +134,21 @@ BEGIN vs.submitter_abbrev, scv.report_submitter_submission FROM `variation_tracker.%s_scv` scv - JOIN `variation_tracker.%s_variation` var on + JOIN `variation_tracker.%s_variation` var + ON scv.variation_id = var.variation_id and scv.report_release_date = var.report_release_date and scv.rpt_stmt_type = var.rpt_stmt_type and scv.rank = var.rank - JOIN `clinvar_ingest.clinvar_status` revstat on + JOIN `clinvar_ingest.clinvar_status` revstat + ON revstat.rank = scv.rank and revstat.scv - JOIN `clinvar_ingest.voi` v on + JOIN `clinvar_ingest.voi` v + ON v.variation_id = scv.variation_id AND scv.report_release_date between v.start_release_date and v.end_release_date - left join `clinvar_ingest.voi_vcv` vv - on + LEFT JOIN `clinvar_ingest.voi_vcv` vv + ON scv.variation_id =vv.variation_id and scv.report_release_date between vv.start_release_date and vv.end_release_date JOIN `clinvar_ingest.voi_scv` vs on @@ -129,7 +156,7 @@ BEGIN vs.id = scv.id AND vs.version = scv.version AND scv.report_release_date between vs.start_release_date and vs.end_release_date - where var.report_submitter_variation + WHERE var.report_submitter_variation ) SELECT vcep.gene_symbol, @@ -186,7 +213,8 @@ BEGIN (vcep.last_eval_age - other.last_eval_age) as newer_last_eval_age, (vcep.released_age - other.released_age) as newer_released_age FROM x as vcep - JOIN x as other on + JOIN x as other + ON other.variation_id = vcep.variation_id AND other.rpt_stmt_type = vcep.rpt_stmt_type AND NOT other.report_submitter_submission AND @@ -252,7 +280,7 @@ BEGIN CREATE OR REPLACE TABLE `variation_tracker.%s_var_priorities` AS WITH x AS ( - select + SELECT v.variation_id, v.rpt_stmt_type, v.report_release_date, @@ -278,8 +306,9 @@ BEGIN 0 END as agg_sig_type, max(vg.rank) as max_rank - from `variation_tracker.%s_variation` v - join `clinvar_ingest.voi_group` vg on + FROM `variation_tracker.%s_variation` v + JOIN `clinvar_ingest.voi_group` vg + ON v.variation_id = vg.variation_id and v.rpt_stmt_type = vg.rpt_stmt_type and v.rank = vg.rank and @@ -291,7 +320,7 @@ BEGIN v.rpt_stmt_type, v.report_release_date ) - select + SELECT x.variation_id, x.rpt_stmt_type, x.report_release_date, @@ -303,8 +332,8 @@ BEGIN select IF(x.agg_sig_type IN ( 3, 7 ), 'VUS vs LBB', NULL) UNION ALL select IF(x.agg_sig_type > 4, 'PLP vs VUSLBB', NULL) UNION ALL select IF(x.max_rank = 0 and x.agg_sig_type >= 4, 'No criteria PLP', NULL)),','),',') as priority_type - from x - where ( + FROM x + WHERE ( (x.agg_sig_type = 2 AND x.unc_sig_cnt > 2) OR (x.agg_sig_type IN ( 3, 7 )) OR (x.agg_sig_type > 4) OR @@ -313,7 +342,7 @@ BEGIN EXECUTE IMMEDIATE FORMAT(""" CREATE OR REPLACE TABLE `variation_tracker.%s_scv_priorities` AS - select + SELECT vp.report_release_date, vp.variation_id, vp.rpt_stmt_type, @@ -333,35 +362,35 @@ BEGIN vv.rank as vcv_rank, vv.agg_classification as vcv_classification, rel.next_release_date - from `variation_tracker.%s_var_priorities` vp - cross join unnest(vp.priority_type) as p_type - join `variation_tracker.%s_scv` scv - on + FROM `variation_tracker.%s_var_priorities` vp + CROSS JOIN UNNEST(vp.priority_type) as p_type + JOIN `variation_tracker.%s_scv` scv + ON vp.variation_id = scv.variation_id and vp.report_release_date = scv.report_release_date - join `clinvar_ingest.voi_scv_group` sgrp - on + JOIN `clinvar_ingest.voi_scv_group` sgrp + ON scv.id = sgrp.id and scv.version = sgrp.version and scv.rpt_stmt_type = sgrp.rpt_stmt_type and scv.rank = sgrp.rank and scv.report_release_date between sgrp.start_release_date and sgrp.end_release_date - join `clinvar_ingest.voi` v - on + JOIN `clinvar_ingest.voi` v + ON vp.variation_id = v.variation_id and vp.report_release_date between v.start_release_date and v.end_release_date - left join `clinvar_ingest.voi_vcv` vv - on + LEFT JOIN `clinvar_ingest.voi_vcv` vv + ON vp.variation_id =vv.variation_id and vp.report_release_date between vv.start_release_date and vv.end_release_date - join + JOIN ( select release_date, IF(next_release_date = DATE'9999-12-31', CURRENT_DATE(), next_release_date) next_release_date FROM `clinvar_ingest.schemas_on_or_after`(clinvar_ingest.cvc_project_start_date()) ) rel - on + ON vp.report_release_date = rel.release_date """, rec.tname, rec.tname, rec.tname); diff --git a/scripts/tracker-report-update/03-gc-tracker-report-proc.sql b/scripts/tracker-report-update/03-gc-tracker-report-proc.sql new file mode 100644 index 0000000..7d45ec5 --- /dev/null +++ b/scripts/tracker-report-update/03-gc-tracker-report-proc.sql @@ -0,0 +1,356 @@ +CREATE OR REPLACE PROCEDURE `variation_tracker.gc_tracker_report_rebuild`( + schema_name STRING, + release_date DATE +) +BEGIN + + -- vceps for current release + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TEMP TABLE vcep + AS + select + scv.variation_id, + scv.submitter_id, + FORMAT("%%s.%%i", scv.id, scv.version) as scv_acxn, + rs.clinvar_name, + scv.classif_type, + scv.submitted_classification, + scv.last_evaluated + from `variation_tracker.report_submitter` rs + join `%s.scv_summary` scv + on + scv.submitter_id = rs.submitter_id + where + rs.type = "VCEP" + and + rs.submitter_id is not null + """, schema_name); + + -- gc scv info for current release + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TEMP TABLE gc_scv + AS + select + scv.submitter_id, + scv.variation_id, + gscv.id, + FORMAT("%%s.%%i", gscv.id, gscv.version) as scv_acxn, + IF(scv.local_key IS NULL, NULL, SPLIT(scv.local_key, "|")[0]) as local_key, + scv.local_key as local_key_orig, + scv.date_created as first_in_clinvar, + scv.classification_comment, + COUNT(IFNULL(gscv.lab_id,gscv.lab_name)) as case_count + from `%s.gc_scv` gscv + join `%s.scv_summary` scv + on + scv.id = gscv.id + where + -- these are the dupe gc submissions that are older + gscv.id not in ( + "SCV000607136","SCV000986740", + "SCV000986708","SCV000986786", + "SCV000986705","SCV000986788", + "SCV000986813","SCV000607109" + ) + group by + scv.submitter_id, + scv.variation_id, + gscv.id, + gscv.version, + scv.local_key, + scv.date_created, + scv.classification_comment + """, schema_name, schema_name); + + -- gc scv w/ agg info for current release + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TEMP TABLE gc + AS + select + gc_scv.submitter_id, + gc_scv.variation_id, + g.hgnc_id, + g.symbol, + v.name, + cvcv.agg_classification, + cvcv.rank, + gc_scv.scv_acxn, + gc_scv.local_key, + gc_scv.case_count, + gc_scv.first_in_clinvar, + gc_scv.classification_comment + from gc_scv + join `%s.variation` v + on + v.id = gc_scv.variation_id + left join `%s.single_gene_variation` sgv + on + sgv.variation_id = gc_scv.variation_id + left join `%s.gene` g + on + sgv.gene_id = g.id + join `clinvar_ingest.clinvar_vcvs` cvcv + on + cvcv.variation_id = gc_scv.variation_id + and + %T between cvcv.start_release_date and cvcv.end_release_date + """, schema_name, schema_name, schema_name, release_date); + + -- gc case info for current release + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TEMP TABLE gc_case + AS + select + gscv.variation_id, + gc_scv.scv_acxn, + gc_scv.local_key, + gc_scv.local_key_orig, + gscv.lab_name, + gscv.lab_id, + gscv.lab_classification, + gscv.lab_classif_type, + gscv.lab_date_reported, + gscv.sample_id, + IF(gscv.sample_id IS NULL, gc_scv.local_key, CONCAT(gc_scv.local_key, "|", gscv.sample_id)) as case_report_key + from gc_scv + join `%s.gc_scv` gscv + on + gc_scv.id = gscv.id + """, schema_name); + + -- gc case related lab info fo current release + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TEMP TABLE lab_case + AS + select + gc_case.variation_id, + gc_case.lab_id as submitter_id, + gc_case.case_report_key, + STRING_AGG(DISTINCT FORMAT("%%s.%%i", lab_scv.id, lab_scv.version)) as acxn, + STRING_AGG(DISTINCT lab_scv.classif_type ORDER BY lab_scv.classif_type) as classif_type, + STRING_AGG(DISTINCT lab_scv.submitted_classification ORDER BY lab_scv.submitted_classification) as classification, + MIN(lab_scv.last_evaluated) as last_evaluated, + MIN(lab_scv.date_created) as first_in_clinvar, + COUNT(DISTINCT lab_scv.id) as scv_count + from gc_case + left join `%s.scv_summary` lab_scv + on + lab_scv.submitter_id = gc_case.lab_id and + lab_scv.variation_id = gc_case.variation_id + group by + gc_case.lab_id, + gc_case.variation_id, + gc_case.case_report_key + """, schema_name); + + -- gc var report + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TEMP TABLE var + AS + WITH v AS + ( + select + gc_scv.variation_id, + COUNT(gc_scv.id) as gc_scv_count, + MIN(vcv.date_created) as first_in_clinvar + from gc_scv + join `%s.variation_archive` vcv + on + vcv.variation_id = gc_scv.variation_id + group by + gc_scv.variation_id + ) + -- variation data related to single GC submitter's submissions + select + v.variation_id, + v.first_in_clinvar, + COUNT(distinct sgrp.id) as scv_count, + v.gc_scv_count, + STRING_AGG(split( sgrp.scv_label, "%%")[0]||"%%", "\\n" ORDER BY sgrp.rank desc, sgrp.scv_group_type, sgrp.scv_label) as all_scvs + from v + join `clinvar_ingest.voi_scv_group` sgrp + on + sgrp.variation_id = v.variation_id and + %T between sgrp.start_release_date and sgrp.end_release_date + group by + v.variation_id, + v.first_in_clinvar, + v.gc_scv_count + """, schema_name, release_date); + + -- gc variation report (1 of 2) - first remove all gc_variation records for the release_date being processed + EXECUTE IMMEDIATE FORMAT(""" + DELETE FROM `variation_tracker.gc_variation` + WHERE report_date = %T + """, release_date); + + -- gc variation report (2 of 2)- now insert the newly processed records for the current release_date + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO `variation_tracker.gc_variation` + ( + report_date, + submitter_id, + variation_id, + hgnc_id, + symbol, + name, + agg_classification, + rank, + clinvar_name, + submitted_classification, + classif_type, + last_evaluated, + scv_acxn, + gc_scv_first_in_clinvar, + local_key, + gc_case_count, + all_scvs, + variant_first_in_clinvar, + novel_at_first_gc_submission, + novel_as_of_report_run_date, + only_other_gc_submitters + ) + -- variant-centric output for single GC submitter + select + %T as report_date, + gc.submitter_id, + gc.variation_id, + gc.hgnc_id, + gc.symbol, + gc.name, + gc.agg_classification, + gc.rank, + vcep.clinvar_name, + vcep.submitted_classification, + vcep.classif_type, + vcep.last_evaluated, + gc.scv_acxn, + gc.first_in_clinvar as gc_scv_first_in_clinvar, + gc.local_key, + gc.case_count as gc_case_count, + var.all_scvs, + var.first_in_clinvar as variant_first_in_clinvar, + IF((var.first_in_clinvar = gc.first_in_clinvar), "Yes", "No") as novel_at_first_gc_submission, + IF((var.scv_count = 1), "Yes", "No") as novel_as_of_report_run_date, + IF((var.scv_count > 1 AND var.scv_count = var.gc_scv_count), "Yes", "No") as only_other_gc_submitters + from gc + left join vcep + on + vcep.variation_id = gc.variation_id + left join var + on + var.variation_id = gc.variation_id + -- ORDER BY 1, CAST(gc.variation_id as INT) + """, release_date); + + -- gc case report (1 of 2) - first remove all gc_case records for the release_date being processed + EXECUTE IMMEDIATE FORMAT(""" + DELETE FROM `variation_tracker.gc_case` + WHERE report_date = %T + """, release_date); + + -- gc case report (2 of 2)- now insert the newly processed records for the current release_date + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO `variation_tracker.gc_case` + ( + report_date,submitter_id,variation_id,gene_id,gene_symbol,variant_name, + ep_name,ep_classification, ep_classif_type,ep_last_evaluated_date, + case_report_lab_name,case_report_lab_id,case_report_lab_classification, + case_report_lab_classif_type,case_report_lab_date_reported, + gc_scv_acxn,gc_scv_first_in_clinvar,gc_scv_local_key,case_report_sample_id, + lab_scv_classification,lab_scv_classif_type,lab_scv_last_evaluated, + lab_scv_first_in_clinvar,lab_scv_before_gc_scv,lab_scv_in_clinvar_as_of_release, + ep_diff_alert,lab_diff_alert,classification_comment + ) + select + %T as report_date, + gc.submitter_id, + gc.variation_id, + gc.hgnc_id as gene_id, + gc.symbol as gene_symbol, + gc.name as variant_name, + vcep.clinvar_name as ep_name, + vcep.submitted_classification as ep_classification, + vcep.classif_type as ep_classif_type, + vcep.last_evaluated as ep_last_evaluated_date, + gc_case.lab_name as case_report_lab_name, + gc_case.lab_id as case_report_lab_id, + gc_case.lab_classification as case_report_lab_classification, + gc_case.lab_classif_type as case_report_lab_classif_type, + gc_case.lab_date_reported as case_report_lab_date_reported, + gc.scv_acxn as gc_scv_acxn, + gc.first_in_clinvar as gc_scv_first_in_clinvar, + gc.local_key as gc_scv_local_key, + gc_case.sample_id as case_report_sample_id, + -- classification + lab_case.classification as lab_scv_classification, + -- classification type + lab_case.classif_type as lab_scv_classif_type, + -- last eval'd + lab_case.last_evaluated as lab_scv_last_evaluated, + -- do not show lab_scv_first_in_clinvar unless the lab_scv_count is 1 + IF(lab_case.scv_count=1,lab_case.first_in_clinvar, null) as lab_scv_first_in_clinvar, + -- show error if more than 1 scv exists on variant for case report submitter + CASE lab_case.scv_count + WHEN 0 THEN + null + WHEN 1 THEN + IF(gc.first_in_clinvar <= lab_case.first_in_clinvar, "No", "Yes") + ELSE + "Error: multiple lab scvs." + END as lab_scv_before_gc_scv, + -- is lab_case.scv_count = 1 then the lab scv is submitted at time of clinvar release, error if more than one scv from lab in release + CASE lab_case.scv_count + WHEN 0 THEN + null + WHEN 1 THEN + "Yes" + ELSE + "Error: multiple lab scvs." + END as lab_scv_in_clinvar_as_of_release, + -- alert for VCEP diff, show null if no vcep scv or if VCEP classification exactly matches GC CASE report classification + CASE + WHEN vcep.classif_type IS NULL THEN + null + WHEN (IFNULL(gc_case.lab_classif_type,"n/a") <> vcep.classif_type) THEN + FORMAT("%%s vs %%s (%%s)", + UPPER(IFNULL(gc_case.lab_classif_type,"n/a")), + UPPER(vcep.classif_type), + IF(IFNULL(gc_case.lab_date_reported,vcep.last_evaluated) is NULL, "?",IF(gc_case.lab_date_reported > vcep.last_evaluated, "<",">")) + ) + ELSE + null + END as ep_diff_alert, + -- alert for LAB diff, show null if no vcep scv or if LAB classification exactly matches GC CASE report classification + -- show error if more than 1 scv exists on variant for case report submitter + CASE + WHEN lab_case.scv_count=1 AND (IFNULL(gc_case.lab_classif_type,"n/a") <> lab_case.classif_type) THEN + FORMAT("%%s vs %%s (%%s)", + UPPER(IFNULL(gc_case.lab_classif_type,"n/a")), + UPPER(lab_case.classif_type), + IF(IFNULL(gc_case.lab_date_reported,vcep.last_evaluated) is NULL, "?",IF(gc_case.lab_date_reported > lab_case.last_evaluated, "<",">")) + ) + WHEN lab_case.scv_count > 1 THEN + -- error + "Error: multiple lab scvs." + ELSE + -- lab_case count = 0 OR gc_case and lab_case classifications match so do nothing + null + END as lab_diff_alert, + gc.classification_comment + from gc + left join vcep + on + vcep.variation_id = gc.variation_id + left join gc_case + on + gc.scv_acxn = gc_case.scv_acxn + left join lab_case + on + lab_case.variation_id = gc_case.variation_id and + lab_case.case_report_key = gc_case.case_report_key + """, release_date); + +-- -- gc alerts? (TODO) + +END; diff --git a/scripts/tracker-procs/initialize-tracker-tables.sql b/scripts/tracker-report-update/initialize-tracker-tables.sql similarity index 100% rename from scripts/tracker-procs/initialize-tracker-tables.sql rename to scripts/tracker-report-update/initialize-tracker-tables.sql diff --git a/scripts/tracker-report-update/tracker-report-update-proc.sql b/scripts/tracker-report-update/tracker-report-update-proc.sql new file mode 100644 index 0000000..6130f4a --- /dev/null +++ b/scripts/tracker-report-update/tracker-report-update-proc.sql @@ -0,0 +1,18 @@ +CREATE OR REPLACE PROCEDURE `clinvar_ingest.tracker_report_update`( + on_date DATE +) +BEGIN + FOR rec IN ( + select + s.schema_name, + s.release_date, + s.prev_release_date, + s.next_release_date + FROM clinvar_ingest.schema_on(on_date) as s + ) + DO + CALL `variation_tracker.report_variation_proc`(); + CALL `variation_tracker.tracker_reports_rebuild`(); + CALL `variation_tracker.gc_tracker_report_rebuild`(rec.scheman_name, rec.release_date); + END FOR; +END; \ No newline at end of file