diff --git a/scripts/dataset-preparation/00-setup-translation-tables.sql b/scripts/dataset-preparation/00-setup-translation-tables.sql index 520d522..05a1622 100644 --- a/scripts/dataset-preparation/00-setup-translation-tables.sql +++ b/scripts/dataset-preparation/00-setup-translation-tables.sql @@ -81,6 +81,39 @@ VALUES ('SomaticClinicalImpact', 't3', 'Tier III - Unknown', 1, 'somatic', 20, 20, 'somatic', 20, 20, 'none', 'cg000103', 'inconclusive', 'cg000025', null), ('SomaticClinicalImpact', 't4', 'Tier IV - Benign/Likely benign', 0, 'somatic', 32, 32, 'somatic', 32, 32, 'refutes', 'cg000102', 'likely', 'cg000026', null); +-- drop the non-GERMLINE rows from the clinsig_types table (on stage only) +BEGIN + DECLARE project_id STRING; + + SET project_id = (SELECT + catalog_name as paroject_id + FROM `INFORMATION_SCHEMA.SCHEMATA` + WHERE schema_name = 'clinvar_ingest'); + + IF (project_id = 'clingen_stage') THEN + CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_clinsig_types` + AS + SELECT + code, + label, + significance, + original_proposition_type, + original_code_order, + original_description_order, + gks_proposition_type, + gks_code_order, + gks_description_order, + direction, + strength_code, + strength_label, + classification_code, + penetrance_level + FROM `clinvar_ingest.clinvar_clinsig_types` + WHERE statement_type = 'GermlineClassification'; + + END IF; + +END; CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_proposition_types` ( code STRING, diff --git a/scripts/dataset-preparation/02-normalize-ds-v2-proc.sql b/scripts/dataset-preparation/02-normalize-ds-v2-proc.sql index 6da57dd..1141e6f 100644 --- a/scripts/dataset-preparation/02-normalize-ds-v2-proc.sql +++ b/scripts/dataset-preparation/02-normalize-ds-v2-proc.sql @@ -14,14 +14,18 @@ BEGIN IF NOT column_exists THEN -- backup the original clinical_assertion table EXECUTE IMMEDIATE FORMAT(""" - CREATE TABLE `%s.backup_clinical_assertion` AS - SELECT * FROM `%s.clinical_assertion` + CREATE TABLE 
`%s.backup_clinical_assertion` + AS + SELECT + * + FROM `%s.clinical_assertion` """, schema_name, schema_name); -- create or replace the clinical_assertion table from the backup EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `%s.clinical_assertion` AS + CREATE OR REPLACE TABLE `%s.clinical_assertion` + AS SELECT *, 'GermlineClassification' as statement_type, @@ -39,13 +43,17 @@ BEGIN IF NOT table_exists THEN -- backup the original rcv_accession table EXECUTE IMMEDIATE FORMAT(""" - CREATE TABLE `%s.backup_rcv_accession` AS - SELECT * FROM `%s.rcv_accession` + CREATE TABLE `%s.backup_rcv_accession` + AS + SELECT + * + FROM `%s.rcv_accession` """, schema_name, schema_name); -- create the rcv_accession_classification table from the backup EXECUTE IMMEDIATE FORMAT(""" - CREATE TABLE %s.rcv_accession_classification AS + CREATE TABLE %s.rcv_accession_classification + AS SELECT release_date, id as rcv_id, @@ -65,7 +73,8 @@ BEGIN -- create or replace the rcv_accession_classification table from the backup EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `%s.rcv_accession` AS + CREATE OR REPLACE TABLE `%s.rcv_accession` + AS SELECT release_date, id, @@ -87,13 +96,17 @@ BEGIN IF NOT column_exists THEN -- backup the original rcv_accession_classification table EXECUTE IMMEDIATE FORMAT(""" - CREATE TABLE `%s.backup_rcv_accession_classification` AS - SELECT * FROM `%s.rcv_accession_classification` + CREATE TABLE `%s.backup_rcv_accession_classification` + AS + SELECT + * + FROM `%s.rcv_accession_classification` """, schema_name, schema_name); -- create or replace the rcv_accession_classification table from the backup EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `%s.rcv_accession_classification` AS + CREATE OR REPLACE TABLE `%s.rcv_accession_classification` + AS SELECT release_date, rcv_id, @@ -106,25 +119,27 @@ BEGIN REGEXP_REPLACE(content, r'"Description"\\s*\\:\\s*"[^"]+"\\s*,*\\s*', "") ) as content FROM `%s.rcv_accession_classification` - WHERE content 
is not null + WHERE + content is not null UNION ALL SELECT - release_date, - rcv_id, - statement_type, - review_status, - [ - STRUCT( - clinical_impact_assertion_type, - clinical_impact_clinical_significance, - date_last_evaluated, - num_submissions, - interp_description - ) - ] as agg_classification, - content + release_date, + rcv_id, + statement_type, + review_status, + [ + STRUCT( + clinical_impact_assertion_type, + clinical_impact_clinical_significance, + date_last_evaluated, + num_submissions, + interp_description + ) + ] as agg_classification, + content FROM `%s.rcv_accession_classification` - WHERE content is null + WHERE + content is null """, schema_name, schema_name, schema_name); END IF; END IF; @@ -137,13 +152,17 @@ BEGIN IF NOT table_exists THEN -- backup the original variation_archive table EXECUTE IMMEDIATE FORMAT(""" - CREATE TABLE `%s.backup_variation_archive` AS - SELECT * FROM `%s.variation_archive` + CREATE TABLE `%s.backup_variation_archive` + AS + SELECT + * + FROM `%s.variation_archive` """, schema_name, schema_name); -- create the variation_archive_classification table from the backup EXECUTE IMMEDIATE FORMAT(""" - CREATE TABLE %s.variation_archive_classification AS + CREATE TABLE %s.variation_archive_classification + AS SELECT id as vcv_id, 'GermlineClassification' AS statement_type, @@ -163,7 +182,8 @@ BEGIN -- create or replace the variation_archive table from the backup EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `%s.variation_archive` AS + CREATE OR REPLACE TABLE `%s.variation_archive` + AS SELECT date_created, record_status, diff --git a/scripts/dataset-preparation/03-scv-summary-proc.sql b/scripts/dataset-preparation/03-scv-summary-proc.sql index 985fe40..d460dea 100644 --- a/scripts/dataset-preparation/03-scv-summary-proc.sql +++ b/scripts/dataset-preparation/03-scv-summary-proc.sql @@ -3,7 +3,8 @@ CREATE OR REPLACE PROCEDURE `clinvar_ingest.scv_summary`( ) BEGIN EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE 
`%s.scv_summary` AS + CREATE OR REPLACE TABLE `%s.scv_summary` + AS WITH obs_sample AS ( SELECT REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, @@ -48,8 +49,12 @@ BEGIN STRING_AGG(DISTINCT om.method_type, ", " ORDER BY om.method_type) as method_type FROM `%s.clinical_assertion` ca - LEFT JOIN obs_sample os ON os.id = ca.id - LEFT JOIN obs_method om ON om.id = ca.id + LEFT JOIN obs_sample os + ON + os.id = ca.id + LEFT JOIN obs_method om + ON + om.id = ca.id GROUP BY ca.id ), diff --git a/scripts/dataset-preparation/03-scv-summary-v2-proc.sql b/scripts/dataset-preparation/03-scv-summary-v2-proc.sql index 6e52934..89b878c 100644 --- a/scripts/dataset-preparation/03-scv-summary-v2-proc.sql +++ b/scripts/dataset-preparation/03-scv-summary-v2-proc.sql @@ -3,7 +3,8 @@ CREATE OR REPLACE PROCEDURE `clinvar_ingest.scv_summary_v2`( ) BEGIN EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `%s.scv_summary` AS + CREATE OR REPLACE TABLE `%s.scv_summary` + AS WITH obs_sample AS ( SELECT REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, @@ -47,8 +48,12 @@ BEGIN STRING_AGG(DISTINCT om.method_type, ", " ORDER BY om.method_type) as method_type FROM `%s.clinical_assertion` ca - LEFT JOIN obs_sample os ON os.id = ca.id - LEFT JOIN obs_method om ON om.id = ca.id + LEFT JOIN obs_sample os + ON + os.id = ca.id + LEFT JOIN obs_method om + ON + om.id = ca.id GROUP BY ca.id ), @@ -59,7 +64,8 @@ BEGIN FROM `%s.clinical_assertion` ca LEFT JOIN UNNEST(ca.interpretation_comments) as c - WHERE ARRAY_LENGTH(ca.interpretation_comments) > 0 + WHERE + ARRAY_LENGTH(ca.interpretation_comments) > 0 GROUP BY id ) diff --git a/scripts/dataset-preparation/04-single-gene-variation-proc.sql b/scripts/dataset-preparation/04-single-gene-variation-proc.sql index c162827..260bb6b 100644 --- a/scripts/dataset-preparation/04-single-gene-variation-proc.sql +++ b/scripts/dataset-preparation/04-single-gene-variation-proc.sql @@ -8,14 +8,15 @@ BEGIN --- step 1. 
create a table with columns variation_id, gene_id, somatic_flag --- where the variation_id is the pk. EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `%s.single_gene_variation` - ( release_date DATE NOT NULL, - variation_id STRING NOT NULL, - gene_id STRING NOT NULL, - relationship_type STRING, - source STRING, - mane_select BOOL DEFAULT FALSE, - somatic BOOL DEFAULT FALSE) + CREATE OR REPLACE TABLE `%s.single_gene_variation`( + release_date DATE NOT NULL, + variation_id STRING NOT NULL, + gene_id STRING NOT NULL, + relationship_type STRING, + source STRING, + mane_select BOOL DEFAULT FALSE, + somatic BOOL DEFAULT FALSE + ) """, schema_name); -- create a temp table that is the list of remaining variations so as to reduce the query cost of analyzing against the variation table. @@ -60,7 +61,7 @@ BEGIN LEFT JOIN `%s.gene_association` ga ON ga.gene_id = g.gene_id - and + AND ga.variation_id = v.id """, schema_name, release_date, schema_name); @@ -98,20 +99,28 @@ BEGIN -- clinvar perferred label hgvs-style format NM_0000.0(GENE):c.234... 
(not mane select but still in name) EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source) - WITH x AS - ( + INSERT INTO `%s.single_gene_variation`( + release_date, + variation_id, + gene_id, + relationship_type, + source + ) + WITH x AS ( SELECT v.id, v.name, REGEXP_EXTRACT(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') as symbol FROM _SESSION.temp_variation v - WHERE REGEXP_CONTAINS(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') AND + WHERE + REGEXP_CONTAINS(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') + AND NOT EXISTS ( - SELECT sgv.variation_id + SELECT + sgv.variation_id FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id + WHERE + sgv.variation_id = v.id ) ) SELECT @@ -121,27 +130,42 @@ BEGIN IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, IFNULL(ga.source,'cvc calculated') as source FROM x - JOIN `clinvar_ingest.entrez_gene` g on UPPER(g.symbol_from_authority) = UPPER(x.symbol) AND NOT REGEXP_CONTAINS(x.symbol, r'\\-AS\\d$') - LEFT JOIN _SESSION.temp_gene_assoc ga on ga.variation_id = x.id and ga.gene_id = g.gene_id + JOIN `clinvar_ingest.entrez_gene` g + ON + UPPER(g.symbol_from_authority) = UPPER(x.symbol) + AND + NOT REGEXP_CONTAINS(x.symbol, r'\\-AS\\d$') + LEFT JOIN _SESSION.temp_gene_assoc ga + ON + ga.variation_id = x.id + AND + ga.gene_id = g.gene_id """, schema_name, schema_name, release_date); - -- star allele format, CYP2C19*10. 
EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source) - WITH x AS - ( + INSERT INTO `%s.single_gene_variation`( + release_date, + variation_id, + gene_id, + relationship_type, + source + ) + WITH x AS ( SELECT v.id, v.name, REGEXP_EXTRACT(v.name, r'^([A-Za-z0-9\\-]+)[\\*\\,]') as symbol FROM _SESSION.temp_variation v - WHERE REGEXP_CONTAINS(v.name, r'^([A-Za-z0-9\\-]+)[\\*\\,]') AND + WHERE + REGEXP_CONTAINS(v.name, r'^([A-Za-z0-9\\-]+)[\\*\\,]') + AND NOT EXISTS ( - SELECT sgv.variation_id + SELECT + sgv.variation_id FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id + WHERE + sgv.variation_id = v.id ) ) SELECT @@ -151,25 +175,37 @@ BEGIN IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, IFNULL(ga.source,'cvc calculated') as source FROM x - JOIN `clinvar_ingest.entrez_gene` g ON UPPER(x.symbol) = UPPER(g.symbol_from_authority) - LEFT JOIN _SESSION.temp_gene_assoc ga ON x.id = ga.variation_id AND g.gene_id = ga.gene_id + JOIN `clinvar_ingest.entrez_gene` g + ON + UPPER(x.symbol) = UPPER(g.symbol_from_authority) + LEFT JOIN _SESSION.temp_gene_assoc ga + ON + x.id = ga.variation_id + AND + g.gene_id = ga.gene_id """, schema_name, schema_name, release_date); --- step 3. for any variations remaining... 
load all variations with any --- genes that are mapped one-to-one from the gene association table EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source) - WITH x AS - ( + INSERT INTO `%s.single_gene_variation`( + release_date, + variation_id, + gene_id, + relationship_type, + source + ) + WITH x AS ( SELECT v.id FROM _SESSION.temp_variation v WHERE NOT EXISTS ( - SELECT sgv.variation_id + SELECT + sgv.variation_id FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id + WHERE + sgv.variation_id = v.id ) ) SELECT @@ -179,26 +215,36 @@ BEGIN STRING_AGG(ga.relationship_type), STRING_AGG(ga.source) FROM x - JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id - group by ga.variation_id - having count(distinct ga.gene_id) = 1 + JOIN _SESSION.temp_gene_assoc ga + ON + x.id = ga.variation_id + GROUP BY + ga.variation_id + HAVING + count(distinct ga.gene_id) = 1 """, schema_name, schema_name, release_date); --- step 4. for any variations remaining... 
load any variant with one submitted gene that --- is not either "genes overlapped by variant" or "asserted, but not computed" EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source ) - WITH x AS - ( + INSERT INTO `%s.single_gene_variation` ( + release_date, + variation_id, + gene_id, + relationship_type, + source + ) + WITH x AS ( SELECT v.id FROM _SESSION.temp_variation v WHERE NOT EXISTS ( - SELECT sgv.variation_id + SELECT + sgv.variation_id FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id + WHERE + sgv.variation_id = v.id ) ) SELECT @@ -208,29 +254,40 @@ BEGIN STRING_AGG(ga.relationship_type), STRING_AGG(ga.source) FROM x - JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id + JOIN _SESSION.temp_gene_assoc ga + ON + x.id = ga.variation_id WHERE - ga.relationship_type not in ('genes overlapped by variant' , 'asserted, but not computed') AND + ga.relationship_type not in ('genes overlapped by variant' , 'asserted, but not computed') + AND ga.source = 'submitted' - GROUP BY ga.variation_id - HAVING count(distinct ga.gene_id) = 1 + GROUP BY + ga.variation_id + HAVING + count(distinct ga.gene_id) = 1 """, schema_name, schema_name, release_date); --- step 5. for any variations remaining... 
load any variations with a "within single gene" --- as long as it is associated to only one gene for that variant EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source) - WITH x AS - ( + INSERT INTO `%s.single_gene_variation` ( + release_date, + variation_id, + gene_id, + relationship_type, + source + ) + WITH x AS ( SELECT v.id FROM _SESSION.temp_variation v WHERE NOT EXISTS ( - SELECT sgv.variation_id + SELECT + sgv.variation_id FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id + WHERE + sgv.variation_id = v.id ) ) SELECT @@ -240,26 +297,37 @@ BEGIN STRING_AGG(ga.relationship_type), STRING_AGG(ga.source) FROM x - JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id - WHERE (ga.relationship_type = 'within single gene') - GROUP BY ga.variation_id - having count(ga.gene_id) = 1 + JOIN _SESSION.temp_gene_assoc ga + ON + x.id = ga.variation_id + WHERE + ga.relationship_type = 'within single gene' + GROUP BY + ga.variation_id + HAVING + count(ga.gene_id) = 1 """, schema_name, schema_name, release_date); --- last step. for any variations remaining... 
load any haplotype or genotype variations only if all the children have the same gene_id EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source ) - WITH x AS - ( + INSERT INTO `%s.single_gene_variation` ( + release_date, + variation_id, + gene_id, + relationship_type, + source + ) + WITH x AS ( SELECT v.id FROM _SESSION.temp_variation v WHERE NOT EXISTS ( - SELECT sgv.variation_id + SELECT + sgv.variation_id FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id + WHERE + sgv.variation_id = v.id ) ) SELECT @@ -269,15 +337,24 @@ BEGIN 'association not provided by clinvar' as relationship_type, 'cvc calculated' as source FROM x - JOIN _SESSION.temp_variation v ON x.id = v.id + JOIN _SESSION.temp_variation v + ON + x.id = v.id CROSS JOIN UNNEST(v.descendant_ids) AS descendant_id - JOIN _SESSION.temp_variation d ON - d.id = descendant_id AND - d.subclass_type='SimpleAllele' - LEFT JOIN `%s.gene_association` ga on ga.variation_id = d.id - WHERE ARRAY_LENGTH(v.descendant_ids) > 0 - GROUP BY v.id - HAVING COUNT(ga.gene_id) = 1 + JOIN _SESSION.temp_variation d + ON + d.id = descendant_id + AND + d.subclass_type='SimpleAllele' + LEFT JOIN `%s.gene_association` ga + ON + ga.variation_id = d.id + WHERE + ARRAY_LENGTH(v.descendant_ids) > 0 + GROUP BY + v.id + HAVING + COUNT(ga.gene_id) = 1 """, schema_name, schema_name, release_date, schema_name); --- Finally, update somatic flags based on current onco-gene list (should this be a @@ -285,11 +362,17 @@ BEGIN EXECUTE IMMEDIATE FORMAT(""" UPDATE `%s.single_gene_variation` sgv SET sgv.somatic = TRUE - WHERE EXISTS ( - SELECT cg.hgnc_id from `clinvar_ingest.cancer_genes` cg - join `clinvar_ingest.entrez_gene` g on g.hgnc_id = cg.hgnc_id - WHERE g.gene_id = sgv.gene_id - ) + WHERE + EXISTS ( + SELECT + cg.hgnc_id + FROM `clinvar_ingest.cancer_genes` cg + JOIN `clinvar_ingest.entrez_gene` g + ON + g.hgnc_id = cg.hgnc_id + WHERE + g.gene_id = 
sgv.gene_id + ) """, schema_name); DROP TABLE _SESSION.temp_variation; diff --git a/scripts/dataset-preparation/05-gc-scv-proc.sql b/scripts/dataset-preparation/05-gc-scv-proc.sql index 31cb38e..8e19725 100644 --- a/scripts/dataset-preparation/05-gc-scv-proc.sql +++ b/scripts/dataset-preparation/05-gc-scv-proc.sql @@ -43,7 +43,7 @@ BEGIN JOIN `%s.scv_summary` scv ON rs.submitter_id = scv.submitter_id - and + AND rs.type = 'GC' CROSS JOIN UNNEST(scv.clinical_assertion_observation_ids) as cao_id JOIN `%s.clinical_assertion_observation` cao diff --git a/scripts/temporal-data-collection/00-setup-temporal-tables.sql b/scripts/temporal-data-collection/00-setup-temporal-tables.sql index f0b9eb4..2698076 100644 --- a/scripts/temporal-data-collection/00-setup-temporal-tables.sql +++ b/scripts/temporal-data-collection/00-setup-temporal-tables.sql @@ -40,6 +40,159 @@ CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_submitters` deleted_count INT DEFAULT 0 ); +-- ***************** clinvar_variations ***************** +CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_variations` +( + id STRING NOT NULL, + name STRING, + start_release_date DATE, + end_release_date DATE, + deleted_release_date DATE, + deleted_count INT DEFAULT 0 +); + +-- drop the non-GERMLINE rows from the clinsig_types table (on stage only) +BEGIN + DECLARE project_id STRING; + + SET project_id = + ( + SELECT + catalog_name as paroject_id + FROM `INFORMATION_SCHEMA.SCHEMATA` + WHERE + schema_name = 'clinvar_ingest' + ); + + IF (project_id = 'clingen_stage') THEN + -- original tables before the new clinvar XML was introduced + + -- ***************** clinvar_vcvs ***************** + CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_vcvs` + ( + variation_id STRING NOT NULL, + id STRING NOT NULL, + version INT NOT NULL, + rank INT NOT NULL, + last_evaluated DATE, + agg_classification STRING, + start_release_date DATE, + end_release_date DATE, + deleted_release_date DATE, + deleted_count INT DEFAULT 0 + ); + + -- 
***************** clinvar_scvs ***************** + CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_scvs` + ( + variation_id STRING NOT NULL, + id STRING NOT NULL, + version INT NOT NULL, + rpt_stmt_type STRING NOT NULL, + rank INT NOT NULL, + last_evaluated DATE, + local_key STRING, + classif_type STRING, + clinsig_type INT, + submitted_classification STRING, + submitter_id STRING, + submission_date DATE, + origin STRING, + affected_status STRING, + method_type STRING, + start_release_date DATE, + end_release_date DATE, + deleted_release_date DATE, + deleted_count INT DEFAULT 0 + ); + + ELSE + + -- ***************** clinvar_vcvs ***************** + CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_vcvs` + ( + variation_id STRING NOT NULL, + id STRING NOT NULL, + version INT64 NOT NULL, + start_release_date DATE, + end_release_date DATE, + deleted_release_date DATE, + deleted_count INT64 DEFAULT 0 + ); + + -- ***************** clinvar_vcv_classifications ***************** + CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_vcv_classifications` + ( + vcv_id STRING NOT NULL, + statement_type STRING NOT NULL, + rank INT64 NOT NULL, + last_evaluated DATE, + agg_classification_description STRING, + num_submitters INT64, + num_submissions INT64, + most_recent_submission DATE, + clinical_impact_assertion_type STRING, + clinical_impact_clinical_significance STRING, + start_release_date DATE, + end_release_date DATE, + deleted_release_date DATE, + deleted_count INT64 DEFAULT 0 + ); + + -- ***************** clinvar_scvs ***************** + CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_scvs` + ( + variation_id STRING NOT NULL, + id STRING NOT NULL, + version INT NOT NULL, + statement_type STRING NOT NULL, + original_proposition_type STRING, + gks_proposition_type STRING, + clinical_impact_assertion_type STRING, + clinical_impact_clinical_significance STRING, + rank INT NOT NULL, + last_evaluated DATE, + local_key STRING, + classif_type STRING, + clinsig_type INT, + 
submitted_classification STRING, + submitter_id STRING, + submission_date DATE, + origin STRING, + affected_status STRING, + method_type STRING, + start_release_date DATE, + end_release_date DATE, + deleted_release_date DATE, + deleted_count INT DEFAULT 0 + ); + + END IF; + +END; + +-- ***************** clinvar_gc_scvs ***************** +CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_gc_scvs` +( + variation_id STRING NOT NULL, + id STRING NOT NULL, + version INT NOT NULL, + submitter_id STRING, + method_desc STRING, + method_type STRING, + lab_name STRING, + lab_date_reported DATE, + lab_id STRING, + lab_classification STRING, + lab_classif_type STRING, + lab_type STRING, + sample_id STRING, + start_release_date DATE, + end_release_date DATE, + deleted_release_date DATE, + deleted_count INT DEFAULT 0 +); + -- -- initialize submitter info by release based on clinical_assertion release info, -- -- 36 very old submitter ids existed before 2019-07-01 which need to be manually @@ -268,16 +421,7 @@ CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_submitters` -- VALUES ("505239", "ISCA site 13", "other") --- ***************** clinvar_variations ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_variations` -( - id STRING NOT NULL, - name STRING, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT DEFAULT 0 -); + -- housekeeping issue! 
-- -- remove duplicate variation records by replacing variation view with a table from 2022_07_24 dataset @@ -322,50 +466,6 @@ CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_variations` -- WHERE v2.release_date = v.release_date AND v2.id = v.id AND v2.first_name <> v.name -- ); --- ***************** clinvar_vcvs ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_vcvs` -( - variation_id STRING NOT NULL, - id STRING NOT NULL, - version INT NOT NULL, - rank INT NOT NULL, - last_evaluated DATE, - agg_classification STRING, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT DEFAULT 0 -); - --- ***************** clinvar_scvs ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_scvs` -( - variation_id STRING NOT NULL, - id STRING NOT NULL, - version INT NOT NULL, - statement_type STRING NOT NULL, - original_proposition_type STRING, - gks_proposition_type STRING, - clinical_impact_assertion_type STRING, - clinical_impact_clinical_significance STRING, - rank INT NOT NULL, - last_evaluated DATE, - local_key STRING, - classif_type STRING, - clinsig_type INT, - submitted_classification STRING, - submitter_id STRING, - submission_date DATE, - origin STRING, - affected_status STRING, - method_type STRING, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT DEFAULT 0 -); - - -- one variant processing of SCVs per release -- -- housekeeping, remove any duplicate rows in scv.summary for the snapshot dbs clinvar_2019_06_01_v0, @@ -397,93 +497,4 @@ CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_scvs` -- FROM `clinvar_2022_07_24_v1_6_46.scv_summary` -- ) -- WHERE row_number = 1 --- ; - --- ***************** clinvar_gc_scvs ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_gc_scvs` -( - variation_id STRING NOT NULL, - id STRING NOT NULL, - version INT NOT NULL, - submitter_id STRING, - method_desc STRING, - method_type STRING, - lab_name 
STRING, - lab_date_reported DATE, - lab_id STRING, - lab_classification STRING, - lab_classif_type STRING, - lab_type STRING, - sample_id STRING, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT DEFAULT 0 -); - - --- ***************** clinvar_var_scv_change ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_var_scv_change` -( - variation_id STRING NOT NULL, - start_release_date DATE, - end_release_date DATE -); - - - - -CREATE OR REPLACE TABLE `clinvar_ingest.voi_group` -( - start_release_date DATE, - end_release_date DATE, - variation_id STRING, - rpt_stmt_type STRING, - rank INT64, - unique_clinsig_type_count INT64, - agg_sig_type INT64, - sig_type ARRAY>, - max_last_evaluated DATE, - max_submission_date DATE, - submission_count INT64, - submitter_count INT64, - agg_classif STRING, - agg_classif_w_count STRING -); - - -CREATE OR REPLACE TABLE `clinvar_ingest.voi_scv` -( - variation_id STRING, - id STRING, - version INT64, - full_scv_id STRING, - rpt_stmt_type STRING, - rank INT64, - last_evaluated DATE, - classif_type STRING, - submitted_classification STRING, - clinsig_type INT64, - classification_label STRING, - classification_abbrev STRING, - submitter_id STRING, - submitter_name STRING, - submitter_abbrev STRING, - submission_date DATE, - origin STRING, - affected_status STRING, - method_type STRING, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT64 -); - -CREATE OR REPLACE TABLE `clinvar_ingest.voi_top_group_change` -( - variation_id STRING, - rpt_stmt_type STRING, - top_rank INT64, - start_release_date DATE, - end_release_date DATE -); \ No newline at end of file +-- ; \ No newline at end of file diff --git a/scripts/temporal-data-collection/00-setup-temporal-v2-tables.sql b/scripts/temporal-data-collection/00-setup-temporal-v2-tables.sql deleted file mode 100644 index b27950c..0000000 --- 
a/scripts/temporal-data-collection/00-setup-temporal-v2-tables.sql +++ /dev/null @@ -1,504 +0,0 @@ --- ***************** clinvar_genes & clinvar_single_gene_variations ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_single_gene_variations` -( - variation_id STRING NOT NULL, - gene_id STRING NOT NULL, - relationship_type STRING, - source STRING, - mane_select BOOLEAN DEFAULT FALSE, - somatic BOOLEAN DEFAULT FALSE, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT64 DEFAULT 0 -); - -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_genes` -( - id STRING NOT NULL, - symbol STRING, - hgnc_id STRING, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT64 DEFAULT 0 -); - --- ***************** clinvar_submitters ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_submitters` -( - id STRING, - current_name STRING, - current_abbrev STRING, - cvc_abbrev STRING, - org_category STRING, - all_names ARRAY, - all_abbrevs ARRAY, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT64 DEFAULT 0 -); - - --- -- initialize submitter info by release based on clinical_assertion release info, --- -- 36 very old submitter ids existed before 2019-07-01 which need to be manually --- -- loaded and provided to create the full submitter table pre-2019-07-01 - --- CREATE or REPLACE TABLE `clinvar_2019_06_01_v0.pre_2019_07_01_submitter` --- ( --- id STRING, --- current_name STRING, --- current_abbrev STRING, --- cvc_abbrev STRING, --- org_category STRING, --- all_names ARRAY, --- all_abbrevs ARRAY --- ); - --- INSERT `clinvar_2019_06_01_v0.pre_2019_07_01_submitter` (id, current_name, current_abbrev, org_category) --- VALUES("1237", "Cincinnati Children's Hospital Medical Center", null, "other"), --- ("279559", "Centogene AG - the Rare Disease Company", null, "other"), --- ("500168", "Samuels Laboratory (NHGRI/NIH)", 
null, "other"), --- ("500266", "Precancer Genomics (Leeds Institute of Molecular Medicine)", null, "other"), --- ("504846", "", null, "other"), --- ("504870", "", null, "other"), --- ("504875", "Human and Clinical Genetics", null, "other"), --- ("505190", "", null, "other"), --- ("505191", "", null, "other"), --- ("505192", "", null, "other"), --- ("505193", "", null, "other"), --- ("505194", "", null, "other"), --- ("505204", "", null, "other"), --- ("505225", "", null, "other"), --- ("505229", "", null, "other"), --- ("505239", "ISCA Site 13", null, "other"), --- ("505261", "", null, "other"), --- ("505326", "Yardena Samuels Lab (NHGRI)", null, "other"), --- ("505327", "", null, "other"), --- ("505355", "", null, "other"), --- ("505406", "", null, "other"), --- ("505428", "", null, "other"), --- ("505449", "", null, "other"), --- ("505508", "", null, "other"), --- ("505521", "", null, "other"), --- ("505557", "", null, "other"), --- ("505607", "", null, "other"), --- ("505649", "Department of Genetics (University Medical Center Groningen)", null, "other"), --- ("505655", "", null, "other"), --- ("505689", "", null, "other"), --- ("505694", "", null, "other"), --- ("506099", "Genome.One, G1", "G1", "other"), --- ("506309", "", null, "other"), --- ("506354", "", null, "other"), --- ("506387", "", null, "other"), --- ("507238", "", null, "other"), --- ("9999990", "", null, "other"), --- ("9999991", "", null, "other") --- ; - --- create or replace table `clinvar_2019_06_01_v0.submitter` --- as --- with ca as ( --- select --- ca.release_date, --- ca.submitter_id --- from `clinvar_2019_06_01_v0.clinical_assertion` ca --- group by ca.release_date, ca.submitter_id --- ) --- select --- ca.release_date, --- ca.submitter_id, --- s.current_name, --- s.current_abbrev, --- s.org_category, --- s.all_names, --- s.all_abbrevs --- from ca --- join `clinvar_2019_06_01_v0.pre_2019_07_01_submitter` s --- on --- s.id = ca.submitter_id --- union all --- select --- ca.release_date, --- 
ca.submitter_id, --- s.current_name, --- s.current_abbrev, --- s.org_category, --- s.all_names, --- s.all_abbrevs --- from ca --- join `clinvar_2019_07_01_v1_1_0_m2.submitter` s --- on --- s.id = ca.submitter_id --- ; - - --- ********************** additional working data concerns below ************* - --- select distinct ss.submitter_id, ss.id, ss.variation_id, ss.release_date --- from `clinvar_2019_06_01_v0.scv_summary` ss --- where not exists ( --- select id --- from `clinvar_ingest.clinvar_submitters` cs --- where cs.id = ss.submitter_id --- ) --- and ss.release_date = DATE '2015-03-06' --- -- group by ss.submitter_id --- order by 1 - --- -- repair bad submitter ids pre-201907 --- UPDATE `clinvar_2019_06_01_v0.scv_summary` --- SET scv.submitter_id = vals.good_id --- FROM ( --- SELECT '1' bad_id, '500139' good_id UNION ALL --- SELECT '500006' bad_id, '506018' good_id UNION ALL --- SELECT '500007' bad_id, '506018' good_id UNION ALL --- SELECT '500008' bad_id, '506018' good_id UNION ALL --- SELECT '500009' bad_id, '506018' good_id UNION ALL --- SELECT '500010' bad_id, '506018' good_id UNION ALL --- SELECT '500011' bad_id, '506018' good_id UNION ALL --- SELECT '500064' bad_id, '1160' good_id UNION ALL --- SELECT '500166' bad_id, '500133' good_id UNION ALL --- SELECT '505708' bad_id, '506834' good_id UNION ALL --- SELECT '505333' bad_id, '1006' good_id UNION ALL --- SELECT '505345' bad_id, '25969' good_id UNION ALL --- SELECT '505346' bad_id, '505572' good_id UNION ALL --- SELECT '505363' bad_id, '320418' good_id UNION ALL --- SELECT '500121' bad_id, '506047' good_id UNION ALL --- SELECT '500129' bad_id, '505260' good_id UNION ALL --- SELECT '500145' bad_id, '25969' good_id UNION ALL --- SELECT '505751' bad_id, '505642' good_id UNION ALL --- SELECT '505978' bad_id, '506617' good_id UNION ALL --- SELECT '506000' bad_id, '506627' good_id UNION ALL --- SELECT '504961' bad_id, '1006' good_id UNION ALL --- SELECT '504815' bad_id, '506543' good_id UNION ALL --- SELECT 
'500265' bad_id, '500126' good_id UNION ALL --- SELECT '500293' bad_id, '507238' good_id UNION ALL --- SELECT '500313' bad_id, '1238' good_id UNION ALL --- SELECT '504819' bad_id, '504864' good_id UNION ALL --- SELECT 'Sharing Clinical Report Project' bad_id, '500037' good_id UNION ALL --- SELECT 'Sharing Clinical Report Project (SCRP)' bad_id, '500037' good_id UNION ALL --- SELECT 'ISCA Consortium' bad_id, '505237' good_id UNION ALL --- SELECT 'ARUP' bad_id, '506018' good_id UNION ALL --- SELECT 'LabCorp' bad_id, '500026' good_id UNION ALL --- SELECT '505239' bad_id, '319864' good_id UNION ALL --- SELECT 'Emory Genetics Laboratory' bad_id, '500060' good_id UNION ALL --- SELECT 'Ambry Genetics,Ambry Genetics Corp' bad_id, '61756' good_id UNION ALL --- SELECT '505689' bad_id, '505820' good_id UNION ALL --- SELECT '504846' bad_id, '505291' good_id UNION ALL --- SELECT '505229' bad_id, '505721' good_id UNION ALL --- SELECT '505508' bad_id, '505641' good_id --- ) vals --- WHERE vals.bad_id = submitter_id --- ; - --- additional scripts to clean up bogus or modified submitter_ids lost over time --- SELECT id, submitter_ids --- FROM ( --- SELECT id, array_agg(distinct submitter_id) as submitter_ids --- FROM `clinvar_ingest.clinvar_scvs` --- group by id --- HAVING COUNT(distinct submitter_id) > 1 --- ) --- ; - --- update `clinvar_2019_06_01_v0.scv_summary` ss --- set submitter_id = "26957" --- where id = "SCV000079669" --- ; - --- update `clinvar_ingest.clinvar_scvs` cs --- set submitter_id = "26957" --- where id = "SCV000079669" --- ; - --- update `clinvar_2019_06_01_v0.scv_summary` ss --- SET ss.submitter_id = scv.submitter_id --- FROM ( --- SELECT scv1.id, scv1.submitter_id --- FROM `clinvar_ingest.clinvar_scvs` scv1 --- where scv1.submitter_id not in ("500029", "500062") --- and exists --- ( --- select scv2.id from `clinvar_ingest.clinvar_scvs` scv2 --- where scv2.id = scv1.id and scv2.submitter_id in ("500029", "500062") --- ) --- group by scv1.id, scv1.submitter_id 
--- ) scv --- WHERE ss.submitter_id in ("500029", "500062") and scv.id = ss.id --- ; - --- update `clinvar_ingest.clinvar_scvs` cs --- SET cs.submitter_id = scv.submitter_id --- FROM ( --- SELECT scv1.id, scv1.submitter_id --- FROM `clinvar_ingest.clinvar_scvs` scv1 --- where scv1.submitter_id not in ("500029", "500062") --- and exists --- ( --- select scv2.id from `clinvar_ingest.clinvar_scvs` scv2 --- where scv2.id = scv1.id and scv2.submitter_id in ("500029", "500062") --- ) --- group by scv1.id, scv1.submitter_id --- ) scv --- WHERE cs.submitter_id in ("500029", "500062") and scv.id = cs.id --- ; - --- CREATE OR REPLACE TABLE `clinvar_2019_06_01_v0.submitter` --- ( --- id STRING, --- current_name STRING, --- current_abbrev STRING, --- org_category STRING, --- all_names ARRAY, --- all_abbrevs ARRAY --- ); - --- INSERT INTO `clinvar_2019_06_01_v0.submitter` --- (id, current_name, org_category) --- VALUES ("500168", "Samuels NHGRI/NIH", "other") --- ; --- INSERT INTO `clinvar_2019_06_01_v0.submitter` --- (id, current_name, org_category) --- VALUES ("500266", "Leeds Institute of Molecular Medicine (LIMM)", "other") --- ; --- INSERT INTO `clinvar_2019_06_01_v0.submitter` --- (id, current_name, org_category) --- VALUES ("505239", "ISCA site 13", "other") - - --- ***************** clinvar_variations ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_variations` -( - id STRING NOT NULL, - name STRING, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT64 DEFAULT 0 -); - --- housekeeping issue! --- -- remove duplicate variation records by replacing variation view with a table from 2022_07_24 dataset --- -- which contains 107 duplicate variation records! 
--- DROP VIEW `clinvar_2022_07_24_v1_6_46.variation`; --- CREATE OR REPLACE TABLE `clinvar_2022_07_24_v1_6_46.variation` --- AS --- SELECT --- datarepo_row_id, --- name, --- variation_type, --- allele_id, --- release_date, --- subclass_type, --- protein_change, --- content, --- id, --- descendant_ids, --- num_chromosomes, --- num_copies, --- child_ids --- FROM ( --- SELECT --- *, --- ROW_NUMBER() OVER (PARTITION BY release_date, id) row_number --- FROM `datarepo-550c0177.clinvar_2022_07_24_v1_6_46.variation` --- ) --- WHERE row_number = 1 --- ; - --- -- housekeeping... the clinvar_2019_06_01_v0.variation table can end up with multiple names for the same variation id --- -- to correct this we will simply pick the first one and delete the others before running the script below --- DELETE FROM `clinvar_2019_06_01_v0.variation` v --- WHERE EXISTS ( --- SELECT v2.release_date, v2.id, v2.first_name --- FROM ( --- SELECT release_date, id, ARRAY_AGG(name)[OFFSET(0)] as first_name --- FROM `clinvar_2019_06_01_v0.variation` --- GROUP BY release_date, id --- HAVING count(name) > 1 --- ) v2 --- WHERE v2.release_date = v.release_date AND v2.id = v.id AND v2.first_name <> v.name --- ); - --- ***************** clinvar_vcvs ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_vcvs` -( - variation_id STRING NOT NULL, - id STRING NOT NULL, - version INT64 NOT NULL, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT64 DEFAULT 0 -); - --- ***************** clinvar_vcv_classifications ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_vcv_classifications` -( - vcv_id STRING NOT NULL, - statement_type STRING NOT NULL, - rank INT64 NOT NULL, - last_evaluated DATE, - agg_classification_description STRING, - num_submitters INT64, - num_submissions INT64, - most_recent_submission DATE, - clinical_impact_assertion_type STRING, - clinical_impact_clinical_significance STRING, - start_release_date DATE, - 
end_release_date DATE, - deleted_release_date DATE, - deleted_count INT64 DEFAULT 0 -); - - --- ***************** clinvar_scvs ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_scvs` -( - variation_id STRING NOT NULL, - id STRING NOT NULL, - version INT64 NOT NULL, - statement_type STRING NOT NULL, - rpt_stmt_type STRING NOT NULL, - rank INT64 NOT NULL, - last_evaluated DATE, - local_key STRING, - classif_type STRING, - clinsig_type INT, - submitted_classification STRING, - submitter_id STRING, - submission_date DATE, - origin STRING, - affected_status STRING, - method_type STRING, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT64 DEFAULT 0 -); - - --- one variant processing of SCVs per release - --- -- housekeeping, remove any duplicate rows in scv.summary for the snapshot dbs clinvar_2019_06_01_v0, --- -- clinvar_2021_03_02_v1_2_9(SCV001164315), clinvar_2022_07_24_v1_6_46(1,337 duplicates?!) --- CREATE OR REPLACE TABLE `clinvar_2022_07_24_v1_6_46.scv_summary` --- AS --- SELECT --- release_date, --- id, --- version, --- variation_id, --- last_evaluated, --- rank, --- review_status, --- clinvar_stmt_type, --- cvc_stmt_type, --- submitted_classification, --- classif_type, --- significance, --- submitter_id, --- submission_date, --- origin, --- affected_status, --- method_type, --- last_processed_curation_action, --- pending_curation_action --- FROM ( --- SELECT *, ROW_NUMBER() OVER (PARTITION BY release_date, variation_id, id, version) row_number --- FROM `clinvar_2022_07_24_v1_6_46.scv_summary` --- ) --- WHERE row_number = 1 --- ; - --- ***************** clinvar_gc_scvs ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_gc_scvs` -( - variation_id STRING NOT NULL, - id STRING NOT NULL, - version INT64 NOT NULL, - submitter_id STRING, - method_desc STRING, - method_type STRING, - lab_name STRING, - lab_date_reported DATE, - lab_id STRING, - lab_classification STRING, - 
lab_classif_type STRING, - lab_type STRING, - sample_id STRING, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT64 DEFAULT 0 -); - - --- ***************** clinvar_var_scv_change ***************** -CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_var_scv_change` -( - variation_id STRING NOT NULL, - start_release_date DATE, - end_release_date DATE -); - - -CREATE OR REPLACE TABLE `clinvar_ingest.voi_group` -( - start_release_date DATE, - end_release_date DATE, - variation_id STRING, - statement_type STRING, - rpt_stmt_type STRING, - rank INT64, - unique_clinsig_type_count INT64, - agg_sig_type INT64, - sig_type ARRAY>, - max_last_evaluated DATE, - max_submission_date DATE, - submission_count INT64, - submitter_count INT64, - agg_classif STRING, - agg_classif_w_count STRING -); - - -CREATE OR REPLACE TABLE `clinvar_ingest.voi_scv` -( - variation_id STRING, - id STRING, - version INT64, - full_scv_id STRING, - statement_type STRING, - rpt_stmt_type STRING, - rank INT64, - last_evaluated DATE, - classif_type STRING, - submitted_classification STRING, - clinsig_type INT64, - classification_label STRING, - classification_abbrev STRING, - submitter_id STRING, - submitter_name STRING, - submitter_abbrev STRING, - submission_date DATE, - origin STRING, - affected_status STRING, - method_type STRING, - start_release_date DATE, - end_release_date DATE, - deleted_release_date DATE, - deleted_count INT64 -); - -CREATE OR REPLACE TABLE `clinvar_ingest.voi_top_group_change` -( - variation_id STRING, - statement_type STRING, - rpt_stmt_type STRING, - top_rank INT64, - start_release_date DATE, - end_release_date DATE -); \ No newline at end of file diff --git a/scripts/temporal-data-collection/03-clinvar-variations-proc.sql b/scripts/temporal-data-collection/03-clinvar-variations-proc.sql index 9efdca6..67da2b8 100644 --- a/scripts/temporal-data-collection/03-clinvar-variations-proc.sql +++ 
b/scripts/temporal-data-collection/03-clinvar-variations-proc.sql @@ -8,39 +8,51 @@ BEGIN -- deleted variations (where it exists in clinvar_variations (for deleted_release_date is null), but doesn't exist in current data set ) EXECUTE IMMEDIATE FORMAT(""" UPDATE `clinvar_ingest.clinvar_variations` cv - SET deleted_release_date = %T, + SET + deleted_release_date = %T, deleted_count = deleted_count + 1 WHERE cv.deleted_release_date is NULL AND NOT EXISTS ( SELECT v.id FROM `%s.variation` v - WHERE v.id = cv.id + WHERE + v.id = cv.id ) """, release_date, schema_name); -- updated variations EXECUTE IMMEDIATE FORMAT(""" UPDATE `clinvar_ingest.clinvar_variations` cv - SET name = v.name, + SET + name = v.name, end_release_date = v.release_date, deleted_release_date = NULL FROM `%s.variation` v - WHERE v.id = cv.id + WHERE + v.id = cv.id """, schema_name); -- new variations EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `clinvar_ingest.clinvar_variations` - (id, name, start_release_date, end_release_date) - SELECT v.id, v.name, + INSERT INTO `clinvar_ingest.clinvar_variations` ( + id, + name, + start_release_date, + end_release_date + ) + SELECT + v.id, + v.name, v.release_date as start_release_date, v.release_date as end_release_date FROM `%s.variation` v WHERE NOT EXISTS ( - SELECT cv.id + SELECT + cv.id FROM `clinvar_ingest.clinvar_variations` cv - WHERE cv.id = v.id + WHERE + cv.id = v.id ) """, schema_name); diff --git a/scripts/temporal-data-collection/10-clinvar-var-scv-change-proc.sql b/scripts/temporal-data-collection/10-clinvar-var-scv-change-proc.sql deleted file mode 100644 index 6b154f5..0000000 --- a/scripts/temporal-data-collection/10-clinvar-var-scv-change-proc.sql +++ /dev/null @@ -1,63 +0,0 @@ -CREATE OR REPLACE PROCEDURE `clinvar_ingest.clinvar_var_scv_change`() -BEGIN - - CREATE TEMP TABLE _SESSION.release_start_vsc AS - select - st.start_release_date, - st.variation_id, - row_number () over (order by st.variation_id, st.start_release_date asc nulls first) 
as rownum - from ( - select - start_release_date as start_release_date, - variation_id - from `clinvar_ingest.clinvar_scvs` vs - union distinct - select - MIN(r.release_date) as start_release_date, - vs.variation_id - from `clinvar_ingest.clinvar_scvs` vs - join `clinvar_ingest.clinvar_releases` r - on - r.release_date > vs.end_release_date - group by - vs.end_release_date, - vs.variation_id - ) st; - - CREATE TEMP TABLE _SESSION.release_end_vsc AS - select - en.end_release_date, - en.variation_id, - row_number () over (order by en.variation_id, en.end_release_date asc nulls last) as rownum - from ( - select - end_release_date as end_release_date, - variation_id - from `clinvar_ingest.clinvar_scvs` vs - union distinct - select - MAX(r.release_date) as end_release_date, - vs.variation_id - from `clinvar_ingest.clinvar_scvs` vs - join `clinvar_ingest.clinvar_releases` r - on - r.release_date < vs.start_release_date - group by - vs.start_release_date, - vs.variation_id - ) en; - - CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_var_scv_change` AS - select - e.variation_id, - s.start_release_date, - e.end_release_date - from _SESSION.release_start_vsc s - join _SESSION.release_end_vsc e on e.rownum = s.rownum+1 - where e.variation_id = s.variation_id - ; - - DROP TABLE _SESSION.release_start_vsc; - DROP TABLE _SESSION.release_end_vsc; - -END; diff --git a/scripts/temporal-data-collection/11-voi-vcv-scv-v2-proc.sql b/scripts/temporal-data-collection/11-voi-vcv-scv-v2-proc.sql deleted file mode 100644 index dfe70f1..0000000 --- a/scripts/temporal-data-collection/11-voi-vcv-scv-v2-proc.sql +++ /dev/null @@ -1,83 +0,0 @@ -CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_vcv_scv_v2`() -BEGIN - --- TODO deal with new cols and tables - -- CREATE OR REPLACE TABLE `clinvar_ingest.voi` - -- AS - -- SELECT - -- cv.id AS variation_id, - -- cv.name, - -- csgv.mane_select, - -- csgv.gene_id, - -- cg.symbol, - -- cv.start_release_date, - -- cv.end_release_date, - -- 
cv.deleted_release_date, - -- cv.deleted_count - -- FROM `clinvar_ingest.clinvar_variations` cv - -- LEFT JOIN `clinvar_ingest.clinvar_single_gene_variations` csgv - -- ON - -- cv.id = csgv.variation_id - -- LEFT JOIN `clinvar_ingest.clinvar_genes` cg - -- ON - -- cg.id = csgv.gene_id - -- ; - - -- CREATE OR REPLACE TABLE `clinvar_ingest.voi_vcv` - -- AS - -- SELECT - -- cv.variation_id, - -- cv.id, - -- cv.version, - -- FORMAT('%s.%i', cv.id, cv.version) as full_vcv_id, - -- cv.rank, - -- cv.last_evaluated, - -- cv.agg_classification, - -- cv.start_release_date, - -- cv.end_release_date, - -- cv.deleted_release_date, - -- cv.deleted_count - -- FROM `clinvar_ingest.clinvar_vcvs` cv; - - -- CREATE OR REPLACE TABLE `clinvar_ingest.voi_scv` - -- AS - -- SELECT - -- cs.variation_id, - -- cs.id, - -- cs.version, - -- FORMAT('%s.%i', cs.id, cs.version) as full_scv_id, - -- cs.rpt_stmt_type, - -- cs.rank, - -- cs.last_evaluated, - -- cs.classif_type, - -- cs.submitted_classification, - -- cs.clinsig_type, - -- FORMAT( '%s, %s, %t', - -- cct.label, - -- if(cs.rank > 0,format("%i%s", cs.rank, CHR(9733)), IF(cs.rank = 0, format("%i%s", cs.rank, CHR(9734)), "n/a")), - -- if(cs.last_evaluated is null, "", format("%t", cs.last_evaluated))) as classification_label, - -- FORMAT( '%s, %s, %t', - -- UPPER(cs.classif_type), - -- if(cs.rank > 0,format("%i%s", cs.rank, CHR(9733)), IF(cs.rank = 0, format("%i%s", cs.rank, CHR(9734)), "n/a")), - -- if(cs.last_evaluated is null, "", format("%t", cs.last_evaluated))) as classification_abbrev, - -- cs.submitter_id, - -- s.current_name as submitter_name, - -- s.cvc_abbrev as submitter_abbrev, - -- cs.submission_date, - -- cs.origin, - -- cs.affected_status, - -- cs.method_type, - -- cs.start_release_date, - -- cs.end_release_date, - -- cs.deleted_release_date, - -- cs.deleted_count - -- FROM `clinvar_ingest.clinvar_scvs` cs - -- LEFT JOIN `clinvar_ingest.clinvar_submitters` s - -- on - -- cs.submitter_id = s.id - -- LEFT JOIN 
`clinvar_ingest.clinvar_clinsig_types` cct - -- on - -- cct.code = cs.classif_type - -- ; - -END; \ No newline at end of file diff --git a/scripts/temporal-data-collection/12-voi-and-voi-scv-group-v2-proc.sql b/scripts/temporal-data-collection/12-voi-and-voi-scv-group-v2-proc.sql deleted file mode 100644 index 30d8404..0000000 --- a/scripts/temporal-data-collection/12-voi-and-voi-scv-group-v2-proc.sql +++ /dev/null @@ -1,120 +0,0 @@ -CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_and_voi_scv_group_v2`() -BEGIN - - -- CREATE OR REPLACE TABLE `clinvar_ingest.voi_group` - -- AS - -- WITH x AS - -- ( - -- SELECT - -- vs.variation_id, - -- vsc.start_release_date, - -- vsc.end_release_date, - -- vs.rpt_stmt_type, - -- vs.rank, - -- vs.clinsig_type, - -- vs.classif_type, - -- (vs.classif_type||'('||count(DISTINCT vs.id)||')') AS classif_type_w_count - -- FROM `clinvar_ingest.voi_scv` vs - -- JOIN `clinvar_ingest.clinvar_var_scv_change` vsc - -- ON - -- vs.variation_id = vsc.variation_id AND - -- (vs.start_release_date <= vsc.end_release_date) AND - -- (vs.end_release_date >= vsc.start_release_date) - -- GROUP BY - -- vs.variation_id, - -- vsc.start_release_date, - -- vsc.end_release_date, - -- vs.rpt_stmt_type, - -- vs.rank, - -- vs.classif_type, - -- vs.clinsig_type - -- ) - -- select - -- x.start_release_date, - -- x.end_release_date, - -- x.variation_id, - -- x.rpt_stmt_type, - -- x.rank, - -- COUNT(DISTINCT vs.clinsig_type) as unique_clinsig_type_count, - -- SUM(DISTINCT IF(vs.clinsig_type=2,4,IF(vs.clinsig_type=1,2,1))) as agg_sig_type, - -- `clinvar_ingest.createSigType`( - -- COUNT(DISTINCT IF(vs.clinsig_type = 0, vs.submitter_id, NULL)), - -- COUNT(DISTINCT IF(vs.clinsig_type = 1, vs.submitter_id, NULL)), - -- COUNT(DISTINCT IF(vs.clinsig_type = 2, vs.submitter_id, NULL)) - -- ) as sig_type, - -- MAX(vs.last_evaluated) as max_last_evaluated, - -- MAX(vs.submission_date) as max_submission_date, - -- count(DISTINCT vs.id) as submission_count, - -- count(DISTINCT 
vs.submitter_id) as submitter_count, - -- string_agg(distinct x.classif_type, '/' order by x.classif_type) AS agg_classif, - -- string_agg(distinct x.classif_type_w_count, '/' order by x.classif_type_w_count) AS agg_classif_w_count - -- from x - -- JOIN `clinvar_ingest.voi_scv` vs - -- ON - -- vs.variation_id = x.variation_id AND - -- vs.rpt_stmt_type = x.rpt_stmt_type AND - -- vs.rank = x.rank AND - -- (vs.start_release_date <= x.end_release_date) AND - -- (vs.end_release_date >= x.start_release_date) - -- group by - -- x.variation_id, - -- x.start_release_date, - -- x.end_release_date, - -- x.rpt_stmt_type, - -- x.rank - -- ; - - -- -- voi_scv_release_type_rank - -- CREATE OR REPLACE TABLE `clinvar_ingest.voi_scv_group` - -- AS - -- SELECT - -- vg.start_release_date, - -- vg.end_release_date, - -- vs.variation_id, - -- vs.id, - -- vs.version, - -- vg.rpt_stmt_type, - -- vg.rank, - -- vg.sig_type[OFFSET(vs.clinsig_type)].percent as outlier_pct, - -- -- vg.cvc_sig_type[OFFSET(vs.clinsig_type)].percent as cvc_outlier_pct, - -- FORMAT("%s (%s) %3.0f%% %s", - -- IFNULL(vs.submitter_abbrev,LEFT(vs.submitter_name,15)), - -- vs.classification_abbrev, - -- vg.sig_type[OFFSET(vs.clinsig_type)].percent*100, - -- vs.full_scv_id) as scv_label, - -- CASE vs.rpt_stmt_type - -- WHEN 'path' THEN - -- CASE vs.clinsig_type - -- WHEN 2 THEN '1-PLP' - -- WHEN 1 THEN '2-VUS' - -- WHEN 0 THEN '3-BLB' - -- ELSE '5-???' 
END - -- WHEN 'dr' THEN "4-ADDT'L" - -- ELSE "4-ADDT'L" END as scv_group_type - -- FROM `clinvar_ingest.voi_group` vg - -- JOIN `clinvar_ingest.voi_scv` vs on - -- vg.variation_id = vs.variation_id AND - -- vg.rpt_stmt_type=vs.rpt_stmt_type AND - -- vg.rank = vs.rank AND - -- (vg.start_release_date <= vs.end_release_date) AND - -- (vg.end_release_date >= vs.start_release_date) - -- ; - -END; - - --- find intersection between voi and voi_scv windows for the same variant to create the voi_group records --- date window intersection is found by using the condition ((start_window1 <= end_window2) AND (end_window1 >= start_window2)) --- the start and end dates are always inclusive, meaning the start date is the date that the record is first available and --- the end date is the date that the record is last available. --- https://stackoverflow.com/questions/325933/determine-whether-two-date-ranges-overlap --- (s1 <= eX) AND (e1 >= sX) - --- A s1--------------e1 --- |----|----|----|----|----|----|----| --- B s2------e2 s1 <= e2 AND e1 >= s2. TRUE --- C s3------------------------e3 s1 <= e3 AND e1 >= s3. TRUE --- D s4------e4 s1 <= e4 AND e1 >= s4. TRUE --- E s5----------e5 s1 <= e5 AND e1 >= s5. TRUE --- F s6--e6 s1 <= e6 AND e1 >= s6. FALSE --- G. s7--e7 s1 <= e7 AND e1 >= s7. 
FALSE \ No newline at end of file diff --git a/scripts/temporal-data-collection/13-voi-group-change-v2-proc.sql b/scripts/temporal-data-collection/13-voi-group-change-v2-proc.sql deleted file mode 100644 index eb6137b..0000000 --- a/scripts/temporal-data-collection/13-voi-group-change-v2-proc.sql +++ /dev/null @@ -1,81 +0,0 @@ --- create or replace voi_group_change_recalc table -CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_group_change_v2`() -BEGIN - - -- CREATE TEMP TABLE _SESSION.release_start_vg AS - -- select - -- st.start_release_date, - -- st.variation_id, - -- st.rpt_stmt_type, - -- st.rank, - -- row_number () over (order by st.variation_id, st.rpt_stmt_type, st.rank, st.start_release_date asc nulls first) as rownum - -- from ( - -- select - -- vg.start_release_date, - -- vg.variation_id, - -- vg.rpt_stmt_type, - -- vg.rank - -- from `clinvar_ingest.voi_group` vg - -- UNION DISTINCT - -- select - -- MIN(r.release_date) as start_release_date, - -- variation_id, - -- rpt_stmt_type, - -- rank - -- from `clinvar_ingest.voi_group` vg - -- left join `clinvar_ingest.clinvar_releases` r - -- on - -- r.release_date > vg.end_release_date - -- group by - -- vg.end_release_date, - -- vg.variation_id, - -- vg.rpt_stmt_type, - -- vg.rank - -- ) st; - - -- CREATE TEMP TABLE _SESSION.release_end_vg AS - -- select - -- en.end_release_date, - -- en.variation_id, - -- en.rpt_stmt_type, - -- en.rank, - -- row_number () over (order by en.variation_id, en.rpt_stmt_type, en.rank, en.end_release_date asc nulls last) as rownum - -- from ( - -- select - -- end_release_date, - -- variation_id, - -- rpt_stmt_type, - -- rank - -- from `clinvar_ingest.voi_group` vg - -- UNION DISTINCT - -- select - -- MAX(r.release_date) as end_release_date, - -- variation_id, - -- rpt_stmt_type, - -- rank - -- from `clinvar_ingest.voi_group` vg - -- left join `clinvar_ingest.clinvar_releases` r - -- on - -- r.release_date < vg.start_release_date - -- group by - -- vg.start_release_date, - -- 
vg.variation_id, - -- vg.rpt_stmt_type, - -- vg.rank - -- ) en; - - -- CREATE OR REPLACE TABLE `clinvar_ingest.voi_group_change` AS - -- select - -- e.variation_id, - -- e.rpt_stmt_type, - -- e.rank, - -- s.start_release_date, - -- e.end_release_date - -- from _SESSION.release_start_vg s - -- join _SESSION.release_end_vg e on e.rownum = s.rownum + 1 - -- where e.variation_id = s.variation_id; - - -- DROP TABLE _SESSION.release_start_vg; - -- DROP TABLE _SESSION.release_end_vg; - -END; diff --git a/scripts/temporal-data-collection/14-voi-top-group-change-v2-proc.sql b/scripts/temporal-data-collection/14-voi-top-group-change-v2-proc.sql deleted file mode 100644 index fa7475d..0000000 --- a/scripts/temporal-data-collection/14-voi-top-group-change-v2-proc.sql +++ /dev/null @@ -1,106 +0,0 @@ --- create or replace voi_top_group_change -CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_top_group_change_v2`() -BEGIN - - -- CREATE TEMP TABLE _SESSION.voi_top_group AS - -- WITH - -- x AS ( - -- SELECT - -- variation_id, - -- rpt_stmt_type, - -- start_release_date, - -- end_release_date, - -- MAX(rank) AS top_rank - -- FROM - -- `clinvar_ingest.voi_group` vg - -- GROUP BY - -- variation_id, - -- start_release_date, - -- end_release_date, - -- rpt_stmt_type) - -- SELECT - -- x.start_release_date, - -- x.end_release_date, - -- x.variation_id, - -- x.rpt_stmt_type, - -- x.top_rank - -- FROM x; - - -- CREATE TEMP TABLE _SESSION.release_start_tg AS - -- select - -- st.start_release_date, - -- st.variation_id, - -- st.rpt_stmt_type, - -- st.top_rank, - -- row_number () over (order by st.variation_id, st.rpt_stmt_type, st.start_release_date asc nulls first) as rownum - -- from ( - -- select - -- vtg.start_release_date, - -- vtg.variation_id, - -- vtg.rpt_stmt_type, - -- vtg.top_rank - -- from _SESSION.voi_top_group vtg - -- UNION DISTINCT - -- select - -- MIN(r.release_date) as start_release_date, - -- vtg.variation_id, - -- vtg.rpt_stmt_type, - -- vtg.top_rank - -- from 
_SESSION.voi_top_group vtg - -- left join `clinvar_ingest.clinvar_releases` r - -- on - -- r.release_date > vtg.end_release_date - -- group by - -- vtg.end_release_date, - -- vtg.variation_id, - -- vtg.rpt_stmt_type, - -- vtg.top_rank - -- ) st; - - -- CREATE TEMP TABLE _SESSION.release_end_tg AS - -- select - -- en.end_release_date, - -- en.variation_id, - -- en.rpt_stmt_type, - -- en.top_rank, - -- row_number () over (order by en.variation_id, en.rpt_stmt_type, en.end_release_date asc nulls last) as rownum - -- from ( - -- select - -- vtg.end_release_date, - -- vtg.variation_id, - -- vtg.rpt_stmt_type, - -- vtg.top_rank - -- from _SESSION.voi_top_group vtg - -- UNION DISTINCT - -- select - -- MAX(r.release_date) as end_release_date, - -- vtg.variation_id, - -- vtg.rpt_stmt_type, - -- vtg.top_rank - -- from _SESSION.voi_top_group vtg - -- left join `clinvar_ingest.clinvar_releases` r - -- on - -- r.release_date < vtg.start_release_date - -- group by - -- vtg.start_release_date, - -- vtg.variation_id, - -- vtg.rpt_stmt_type, - -- vtg.top_rank - -- ) en; - - -- CREATE OR REPLACE TABLE `clinvar_ingest.voi_top_group_change` AS - -- select - -- e.variation_id, - -- e.rpt_stmt_type, - -- e.top_rank, - -- s.start_release_date, - -- e.end_release_date - -- from _SESSION.release_start_tg s - -- join _SESSION.release_end_tg e on e.rownum = s.rownum + 1 - -- where e.variation_id = s.variation_id; - - -- DROP TABLE _SESSION.voi_top_group; - -- DROP TABLE _SESSION.release_start_tg; - -- DROP TABLE _SESSION.release_end_tg; - -END; \ No newline at end of file diff --git a/scripts/temporal-data-collection/15-voi-summary-change-proc.sql b/scripts/temporal-data-collection/15-voi-summary-change-proc.sql deleted file mode 100644 index e8c3863..0000000 --- a/scripts/temporal-data-collection/15-voi-summary-change-proc.sql +++ /dev/null @@ -1,77 +0,0 @@ --- calc voi summary -CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_summary_change`() -BEGIN - - CREATE TEMP TABLE 
_SESSION.voi_summary AS - SELECT - variation_id, - start_release_date, - end_release_date - FROM - `clinvar_ingest.voi_top_group_change` vtg - GROUP BY - variation_id, - start_release_date, - end_release_date; - - CREATE TEMP TABLE _SESSION.release_start_vs AS - select - st.start_release_date, - st.variation_id, - row_number () over (order by st.variation_id, st.start_release_date asc nulls first) as rownum - from ( - select - vs.start_release_date, - vs.variation_id - from _SESSION.voi_summary vs - UNION DISTINCT - select - MIN(r.release_date) as start_release_date, - vs.variation_id - from _SESSION.voi_summary vs - left join `clinvar_ingest.clinvar_releases` r - on - r.release_date > vs.end_release_date - group by - vs.end_release_date, - vs.variation_id - ) st; - - CREATE TEMP TABLE _SESSION.release_end_vs AS - select - en.end_release_date, - en.variation_id, - row_number () over (order by en.variation_id, en.end_release_date asc nulls last) as rownum - from ( - select - vs.end_release_date, - vs.variation_id - from _SESSION.voi_summary vs - UNION DISTINCT - select - MAX(r.release_date) as end_release_date, - vs.variation_id - from _SESSION.voi_summary vs - left join `clinvar_ingest.clinvar_releases` r - on - r.release_date < vs.start_release_date - group by - vs.start_release_date, - vs.variation_id - ) en; - - - CREATE OR REPLACE TABLE `clinvar_ingest.voi_summary_change` AS - select - e.variation_id, - s.start_release_date, - e.end_release_date - from _SESSION.release_start_vs s - join _SESSION.release_end_vs e on e.rownum = s.rownum + 1 - where e.variation_id = s.variation_id; - - DROP TABLE _SESSION.voi_summary; - DROP TABLE _SESSION.release_start_vs; - DROP TABLE _SESSION.release_end_vs; - -END; \ No newline at end of file diff --git a/scripts/temporal-data-collection/15-voi-summary-change-v2-proc.sql b/scripts/temporal-data-collection/15-voi-summary-change-v2-proc.sql deleted file mode 100644 index 691a219..0000000 --- 
a/scripts/temporal-data-collection/15-voi-summary-change-v2-proc.sql +++ /dev/null @@ -1,77 +0,0 @@ --- calc voi summary -CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_summary_change_v2`() -BEGIN - - CREATE TEMP TABLE _SESSION.voi_summary AS - SELECT - variation_id, - start_release_date, - end_release_date - FROM - `clinvar_ingest.voi_top_group_change` vtg - GROUP BY - variation_id, - start_release_date, - end_release_date; - - CREATE TEMP TABLE _SESSION.release_start_vs AS - select - st.start_release_date, - st.variation_id, - row_number () over (order by st.variation_id, st.start_release_date asc nulls first) as rownum - from ( - select - vs.start_release_date, - vs.variation_id - from _SESSION.voi_summary vs - UNION DISTINCT - select - MIN(r.release_date) as start_release_date, - vs.variation_id - from _SESSION.voi_summary vs - left join `clinvar_ingest.clinvar_releases` r - on - r.release_date > vs.end_release_date - group by - vs.end_release_date, - vs.variation_id - ) st; - - CREATE TEMP TABLE _SESSION.release_end_vs AS - select - en.end_release_date, - en.variation_id, - row_number () over (order by en.variation_id, en.end_release_date asc nulls last) as rownum - from ( - select - vs.end_release_date, - vs.variation_id - from _SESSION.voi_summary vs - UNION DISTINCT - select - MAX(r.release_date) as end_release_date, - vs.variation_id - from _SESSION.voi_summary vs - left join `clinvar_ingest.clinvar_releases` r - on - r.release_date < vs.start_release_date - group by - vs.start_release_date, - vs.variation_id - ) en; - - - CREATE OR REPLACE TABLE `clinvar_ingest.voi_summary_change` AS - select - e.variation_id, - s.start_release_date, - e.end_release_date - from _SESSION.release_start_vs s - join _SESSION.release_end_vs e on e.rownum = s.rownum + 1 - where e.variation_id = s.variation_id; - - DROP TABLE _SESSION.voi_summary; - DROP TABLE _SESSION.release_start_vs; - DROP TABLE _SESSION.release_end_vs; - -END; \ No newline at end of file diff --git 
a/scripts/temporal-data-collection/temporal-data-collection-proc.sql b/scripts/temporal-data-collection/temporal-data-collection-proc.sql index 373c9fa..13c89f3 100644 --- a/scripts/temporal-data-collection/temporal-data-collection-proc.sql +++ b/scripts/temporal-data-collection/temporal-data-collection-proc.sql @@ -11,19 +11,11 @@ BEGIN FROM clinvar_ingest.schema_on(on_date) as s ) DO - CALL `clinvar_ingest.clinvar_genes`(rec.schema_name, rec.release_date); - CALL `clinvar_ingest.clinvar_submitters`(rec.schema_name, rec.release_date); - CALL `clinvar_ingest.clinvar_variations`(rec.schema_name, rec.release_date); - CALL `clinvar_ingest.clinvar_vcvs`(rec.schema_name, rec.release_date); - CALL `clinvar_ingest.clinvar_scvs`(rec.schema_name, rec.release_date); - CALL `clinvar_ingest.clinvar_gc_scvs`(rec.schema_name, rec.release_date); - - CALL `clinvar_ingest.clinvar_var_scv_change`(); - CALL `clinvar_ingest.voi_vcv_scv`(); - CALL `clinvar_ingest.voi_and_voi_scv_group`(); - CALL `clinvar_ingest.voi_group_change`(); - CALL `clinvar_ingest.voi_top_group_change`(); - CALL `clinvar_ingest.voi_summary_change_proc`(); + CALL `clinvar_ingest.clinvar_genes`(rec.schema_name, rec.release_date, rec.prev_release_date); + CALL `clinvar_ingest.clinvar_submitters`(rec.schema_name, rec.release_date, rec.prev_release_date); + CALL `clinvar_ingest.clinvar_variations`(rec.schema_name, rec.release_date, rec.prev_release_date); + CALL `clinvar_ingest.clinvar_vcvs`(rec.schema_name, rec.release_date, rec.prev_release_date); + CALL `clinvar_ingest.clinvar_scvs`(rec.schema_name, rec.release_date, rec.prev_release_date); + CALL `clinvar_ingest.clinvar_gc_scvs`(rec.schema_name, rec.release_date, rec.prev_release_date); END FOR; - END; \ No newline at end of file diff --git a/scripts/temporal-data-collection/temporal-data-collection-v2-proc.sql b/scripts/temporal-data-collection/temporal-data-collection-v2-proc.sql index 3c719f4..9f4e6ef 100644 --- 
a/scripts/temporal-data-collection/temporal-data-collection-v2-proc.sql
+++ b/scripts/temporal-data-collection/temporal-data-collection-v2-proc.sql
@@ -1,6 +1,6 @@
-CREATE OR REPLACE PROCEDURE `clinvar_ingest.temporal_data_collection_v2`(
-  on_date DATE
-)
+-- Runs the per-release collection procedures (v2) for each schema from schema_on_v2(on_date).
+-- No project id in the name: the procedure is created in the project the script runs in.
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.temporal_data_collection_v2`(on_date DATE)
 BEGIN
   FOR rec IN (
     select
@@ -11,19 +11,11 @@ BEGIN
     FROM clinvar_ingest.schema_on_v2(on_date) as s
   )
   DO
-    CALL `clinvar_ingest.clinvar_genes`(rec.schema_name, rec.release_date);
-    CALL `clinvar_ingest.clinvar_submitters`(rec.schema_name, rec.release_date);
-    CALL `clinvar_ingest.clinvar_variations`(rec.schema_name, rec.release_date);
-    CALL `clinvar_ingest.clinvar_vcvs_v2`(rec.schema_name, rec.release_date);
-    CALL `clinvar_ingest.clinvar_scvs_v2`(rec.schema_name, rec.release_date);
-    CALL `clinvar_ingest.clinvar_gc_scvs`(rec.schema_name, rec.release_date);
-
-    CALL `clinvar_ingest.clinvar_var_scv_change`();
-    CALL `clinvar_ingest.voi_vcv_scv_v2`();
-    CALL `clinvar_ingest.voi_and_voi_scv_group_v2`();
-    CALL `clinvar_ingest.voi_group_change_v2`();
-    CALL `clinvar_ingest.voi_top_group_change_v2`();
-    CALL `clinvar_ingest.voi_summary_change_proc_v2`();
+    CALL `clinvar_ingest.clinvar_genes`(rec.schema_name, rec.release_date, rec.prev_release_date);
+    CALL `clinvar_ingest.clinvar_submitters`(rec.schema_name, rec.release_date, rec.prev_release_date);
+    CALL `clinvar_ingest.clinvar_variations`(rec.schema_name, rec.release_date, rec.prev_release_date);
+    CALL `clinvar_ingest.clinvar_vcvs_v2`(rec.schema_name, rec.release_date, rec.prev_release_date);
+    CALL `clinvar_ingest.clinvar_scvs_v2`(rec.schema_name, rec.release_date, rec.prev_release_date);
+    CALL `clinvar_ingest.clinvar_gc_scvs`(rec.schema_name, rec.release_date, rec.prev_release_date);
   END FOR;
-
 END;
\ No newline at end of file
diff --git a/scripts/temporal-data-summation/01-clinvar-var-scv-change-proc.sql
b/scripts/temporal-data-summation/01-clinvar-var-scv-change-proc.sql
new file mode 100644
index 0000000..a3d3d54
--- /dev/null
+++ b/scripts/temporal-data-summation/01-clinvar-var-scv-change-proc.sql
@@ -0,0 +1,89 @@
+-- Rebuilds `clinvar_ingest.clinvar_var_scv_change`: (variation_id, start_release_date,
+-- end_release_date) rows derived from the SCV date boundaries in `clinvar_ingest.clinvar_scvs`
+-- and the release dates in `clinvar_ingest.clinvar_releases`. Takes no arguments.
+-- NOTE(review): clinvar_releases is inner-JOINed here, so an SCV with no later (or earlier)
+-- release adds no derived boundary row -- confirm that is intended.
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.clinvar_var_scv_change`()
+BEGIN
+
+  -- Candidate starts per variation: every SCV start_release_date plus the first
+  -- release after each SCV end_release_date, numbered in (variation_id, date) order.
+  CREATE TEMP TABLE _SESSION.release_start_vsc
+  AS
+  SELECT
+    st.start_release_date,
+    st.variation_id,
+    row_number () OVER (
+      ORDER BY
+        st.variation_id,
+        st.start_release_date ASC NULLS FIRST
+    ) as rownum
+  FROM (
+    SELECT
+      start_release_date as start_release_date,
+      variation_id
+    FROM `clinvar_ingest.clinvar_scvs` vs
+    UNION DISTINCT
+    SELECT
+      MIN(r.release_date) as start_release_date,
+      vs.variation_id
+    FROM `clinvar_ingest.clinvar_scvs` vs
+    JOIN `clinvar_ingest.clinvar_releases` r
+    ON
+      r.release_date > vs.end_release_date
+    GROUP BY
+      vs.end_release_date,
+      vs.variation_id
+  ) st;
+
+  -- Candidate ends per variation: every SCV end_release_date plus the last
+  -- release before each SCV start_release_date, numbered in (variation_id, date) order.
+  CREATE TEMP TABLE _SESSION.release_end_vsc
+  AS
+  SELECT
+    en.end_release_date,
+    en.variation_id,
+    row_number () OVER (
+      ORDER BY
+        en.variation_id,
+        en.end_release_date ASC NULLS LAST
+    ) as rownum
+  FROM (
+    SELECT
+      end_release_date as end_release_date,
+      variation_id
+    FROM `clinvar_ingest.clinvar_scvs` vs
+    UNION DISTINCT
+    SELECT
+      MAX(r.release_date) as end_release_date,
+      vs.variation_id
+    FROM `clinvar_ingest.clinvar_scvs` vs
+    JOIN `clinvar_ingest.clinvar_releases` r
+    ON
+      r.release_date < vs.start_release_date
+    GROUP BY
+      vs.start_release_date,
+      vs.variation_id
+  ) en;
+
+  -- Pair start row n with end row n+1 (row numbers are global, not per variation)
+  -- and keep only the pairs whose two rows belong to the same variation.
+  CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_var_scv_change`
+  AS
+  SELECT
+    e.variation_id,
+    s.start_release_date,
+    e.end_release_date
+  FROM _SESSION.release_start_vsc s
+  JOIN _SESSION.release_end_vsc e
+  ON
+    e.rownum = s.rownum+1
+  WHERE
+    e.variation_id = s.variation_id
+  ;
+
+  -- Remove the session temp tables created above.
+  DROP TABLE _SESSION.release_start_vsc;
+  DROP TABLE _SESSION.release_end_vsc;
+
+END;
diff --git a/scripts/temporal-data-collection/11-voi-vcv-scv-proc.sql
b/scripts/temporal-data-summation/02-voi-vcv-scv-proc.sql
similarity index 99%
rename from scripts/temporal-data-collection/11-voi-vcv-scv-proc.sql
rename to scripts/temporal-data-summation/02-voi-vcv-scv-proc.sql
index 9d40c06..1d43bcf 100644
--- a/scripts/temporal-data-collection/11-voi-vcv-scv-proc.sql
+++ b/scripts/temporal-data-summation/02-voi-vcv-scv-proc.sql
@@ -72,10 +72,10 @@ BEGIN
     cs.deleted_count
   FROM `clinvar_ingest.clinvar_scvs` cs
   LEFT JOIN `clinvar_ingest.clinvar_submitters` s
-  on
+  ON
     cs.submitter_id = s.id
   LEFT JOIN `clinvar_ingest.clinvar_clinsig_types` cct
-  on
+  ON
     cct.code = cs.classif_type
   ;
 
diff --git a/scripts/temporal-data-summation/02-voi-vcv-scv-v2-proc.sql b/scripts/temporal-data-summation/02-voi-vcv-scv-v2-proc.sql
new file mode 100644
index 0000000..91f22e3
--- /dev/null
+++ b/scripts/temporal-data-summation/02-voi-vcv-scv-v2-proc.sql
@@ -0,0 +1,116 @@
+-- Rebuilds the voi, voi_vcv, voi_vcv_classification and voi_scv tables (v2 variant),
+-- each projected from the matching clinvar_* table. Takes no arguments.
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_vcv_scv_v2`()
+BEGIN
+
+  -- voi: variations left-joined to their single-gene annotation and gene symbol.
+  CREATE OR REPLACE TABLE `clinvar_ingest.voi`
+  AS
+  SELECT
+    v.id AS variation_id,
+    v.name,
+    sg.mane_select,
+    sg.gene_id,
+    g.symbol,
+    v.start_release_date,
+    v.end_release_date,
+    v.deleted_release_date,
+    v.deleted_count
+  FROM `clinvar_ingest.clinvar_variations` AS v
+  LEFT JOIN `clinvar_ingest.clinvar_single_gene_variations` AS sg
+    ON sg.variation_id = v.id
+  LEFT JOIN `clinvar_ingest.clinvar_genes` AS g
+    ON g.id = sg.gene_id;
+
+  -- voi_vcv: VCV rows plus the dotted "<id>.<version>" accession.
+  CREATE OR REPLACE TABLE `clinvar_ingest.voi_vcv`
+  AS
+  SELECT
+    vcv.variation_id,
+    vcv.id,
+    vcv.version,
+    FORMAT('%s.%i', vcv.id, vcv.version) AS full_vcv_id,
+    vcv.start_release_date,
+    vcv.end_release_date,
+    vcv.deleted_release_date,
+    vcv.deleted_count
+  FROM `clinvar_ingest.clinvar_vcvs` AS vcv;
+
+  -- voi_vcv_classification: straight projection of the VCV classification rows.
+  CREATE OR REPLACE TABLE `clinvar_ingest.voi_vcv_classification`
+  AS
+  SELECT
+    vc.vcv_id,
+    vc.rank,
+    vc.last_evaluated,
+    vc.agg_classification_description,
+    vc.clinical_impact_assertion_type,
+    vc.clinical_impact_clinical_significance,
+    vc.start_release_date,
+    vc.end_release_date,
+    vc.deleted_release_date,
+    vc.deleted_count
+  FROM `clinvar_ingest.clinvar_vcv_classifications` AS vc;
+
+  -- voi_scv: SCV rows with submitter names and two display labels shaped as
+  -- "<clinsig label | UPPER(classif_type)>, <rank + star | n/a>, <last_evaluated | empty>".
+  CREATE OR REPLACE TABLE `clinvar_ingest.voi_scv`
+  AS
+  SELECT
+    scv.variation_id,
+    scv.id,
+    scv.version,
+    FORMAT('%s.%i', scv.id, scv.version) AS full_scv_id,
+    scv.statement_type,
+    scv.gks_proposition_type,
+    scv.clinical_impact_assertion_type,
+    scv.clinical_impact_clinical_significance,
+    scv.rank,
+    scv.last_evaluated,
+    scv.classif_type,
+    scv.submitted_classification,
+    scv.clinsig_type,
+    FORMAT(
+      '%s, %s, %t',
+      sig.label,
+      CASE
+        WHEN scv.rank > 0 THEN FORMAT("%i%s", scv.rank, CHR(9733))
+        WHEN scv.rank = 0 THEN FORMAT("%i%s", scv.rank, CHR(9734))
+        ELSE "n/a"
+      END,
+      CASE
+        WHEN scv.last_evaluated IS NULL THEN ""
+        ELSE FORMAT("%t", scv.last_evaluated)
+      END
+    ) AS classification_label,
+    FORMAT(
+      '%s, %s, %t',
+      UPPER(scv.classif_type),
+      CASE
+        WHEN scv.rank > 0 THEN FORMAT("%i%s", scv.rank, CHR(9733))
+        WHEN scv.rank = 0 THEN FORMAT("%i%s", scv.rank, CHR(9734))
+        ELSE "n/a"
+      END,
+      CASE
+        WHEN scv.last_evaluated IS NULL THEN ""
+        ELSE FORMAT("%t", scv.last_evaluated)
+      END
+    ) AS classification_abbrev,
+    scv.submitter_id,
+    sub.current_name AS submitter_name,
+    sub.cvc_abbrev AS submitter_abbrev,
+    scv.submission_date,
+    scv.origin,
+    scv.affected_status,
+    scv.method_type,
+    scv.start_release_date,
+    scv.end_release_date,
+    scv.deleted_release_date,
+    scv.deleted_count
+  FROM `clinvar_ingest.clinvar_scvs` AS scv
+  LEFT JOIN `clinvar_ingest.clinvar_submitters` AS sub
+    ON sub.id = scv.submitter_id
+  LEFT JOIN `clinvar_ingest.clinvar_clinsig_types` AS sig
+    ON sig.code = scv.classif_type;
+
+END;
\ No newline at end of file
diff --git a/scripts/temporal-data-collection/12-voi-and-voi-scv-group-proc.sql b/scripts/temporal-data-summation/03-voi-and-voi-scv-group-proc.sql
similarity index 75%
rename from scripts/temporal-data-collection/12-voi-and-voi-scv-group-proc.sql
rename to scripts/temporal-data-summation/03-voi-and-voi-scv-group-proc.sql
index 95fff46..c7b6349 100644
--- a/scripts/temporal-data-collection/12-voi-and-voi-scv-group-proc.sql
+++
b/scripts/temporal-data-summation/03-voi-and-voi-scv-group-proc.sql @@ -3,8 +3,7 @@ BEGIN CREATE OR REPLACE TABLE `clinvar_ingest.voi_group` AS - WITH x AS - ( + WITH x AS ( SELECT vs.variation_id, vsc.start_release_date, @@ -13,13 +12,15 @@ BEGIN vs.rank, vs.clinsig_type, vs.classif_type, - (vs.classif_type||'('||count(DISTINCT vs.id)||')') AS classif_type_w_count + (vs.classif_type||'('||COUNT(DISTINCT vs.id)||')') AS classif_type_w_count FROM `clinvar_ingest.voi_scv` vs JOIN `clinvar_ingest.clinvar_var_scv_change` vsc ON - vs.variation_id = vsc.variation_id AND - (vs.start_release_date <= vsc.end_release_date) AND - (vs.end_release_date >= vsc.start_release_date) + vs.variation_id = vsc.variation_id + AND + vs.start_release_date <= vsc.end_release_date + AND + vs.end_release_date >= vsc.start_release_date GROUP BY vs.variation_id, vsc.start_release_date, @@ -29,7 +30,7 @@ BEGIN vs.classif_type, vs.clinsig_type ) - select + SELECT x.start_release_date, x.end_release_date, x.variation_id, @@ -44,19 +45,23 @@ BEGIN ) as sig_type, MAX(vs.last_evaluated) as max_last_evaluated, MAX(vs.submission_date) as max_submission_date, - count(DISTINCT vs.id) as submission_count, - count(DISTINCT vs.submitter_id) as submitter_count, - string_agg(distinct x.classif_type, '/' order by x.classif_type) AS agg_classif, - string_agg(distinct x.classif_type_w_count, '/' order by x.classif_type_w_count) AS agg_classif_w_count - from x + COUNT(DISTINCT vs.id) as submission_count, + COUNT(DISTINCT vs.submitter_id) as submitter_count, + STRING_AGG(DISTINCT x.classif_type, '/' ORDER BY x.classif_type) AS agg_classif, + STRING_AGG(DISTINCT x.classif_type_w_count, '/' ORDER BY x.classif_type_w_count) AS agg_classif_w_count + FROM x JOIN `clinvar_ingest.voi_scv` vs ON - vs.variation_id = x.variation_id AND - vs.rpt_stmt_type = x.rpt_stmt_type AND - vs.rank = x.rank AND - (vs.start_release_date <= x.end_release_date) AND - (vs.end_release_date >= x.start_release_date) - group by + 
vs.variation_id = x.variation_id
+    AND
+    vs.rpt_stmt_type IS NOT DISTINCT FROM x.rpt_stmt_type
+    AND
+    vs.rank IS NOT DISTINCT FROM x.rank
+    AND
+    vs.start_release_date <= x.end_release_date
+    AND
+    vs.end_release_date >= x.start_release_date
+  GROUP BY
     x.variation_id,
     x.start_release_date,
     x.end_release_date,
@@ -92,12 +97,17 @@ BEGIN
       WHEN 'dr' THEN "4-ADDT'L"
       ELSE "4-ADDT'L" END as scv_group_type
   FROM `clinvar_ingest.voi_group` vg
-  JOIN `clinvar_ingest.voi_scv` vs on
-    vg.variation_id = vs.variation_id AND
-    vg.rpt_stmt_type=vs.rpt_stmt_type AND
-    vg.rank = vs.rank AND
-    (vg.start_release_date <= vs.end_release_date) AND
-    (vg.end_release_date >= vs.start_release_date)
+  JOIN `clinvar_ingest.voi_scv` vs
+  ON
+    vg.variation_id = vs.variation_id
+    AND
+    vg.rpt_stmt_type IS NOT DISTINCT FROM vs.rpt_stmt_type
+    AND
+    vg.rank IS NOT DISTINCT FROM vs.rank
+    AND
+    vg.start_release_date <= vs.end_release_date
+    AND
+    vg.end_release_date >= vs.start_release_date
   ;
 
 END;
diff --git a/scripts/temporal-data-summation/03-voi-and-voi-scv-group-v2-proc.sql b/scripts/temporal-data-summation/03-voi-and-voi-scv-group-v2-proc.sql
new file mode 100644
index 0000000..3bb5df5
--- /dev/null
+++ b/scripts/temporal-data-summation/03-voi-and-voi-scv-group-v2-proc.sql
@@ -0,0 +1,168 @@
+-- Rebuilds `clinvar_ingest.voi_group` and `clinvar_ingest.voi_scv_group` (v2 variant).
+-- Group key: variation_id, statement_type, gks_proposition_type, clinical_impact_assertion_type,
+-- clinical_impact_clinical_significance and rank, per clinvar_var_scv_change date window.
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_and_voi_scv_group_v2`()
+BEGIN
+
+  CREATE OR REPLACE TABLE `clinvar_ingest.voi_group`
+  AS
+  -- x: per date window and group key, each classif_type with its distinct-SCV count
+  -- ("<classif_type>(<n>)"), taken from SCVs whose date range overlaps the window.
+  WITH x AS
+  (
+    SELECT
+      vs.variation_id,
+      vsc.start_release_date,
+      vsc.end_release_date,
+      vs.statement_type,
+      vs.gks_proposition_type,
+      vs.clinical_impact_assertion_type,
+      vs.clinical_impact_clinical_significance,
+      vs.rank,
+      vs.clinsig_type,
+      vs.classif_type,
+      (vs.classif_type||'('||COUNT(DISTINCT vs.id)||')') AS classif_type_w_count
+    FROM `clinvar_ingest.voi_scv` vs
+    JOIN `clinvar_ingest.clinvar_var_scv_change` vsc
+    ON
+      vs.variation_id = vsc.variation_id
+      AND
+      vs.start_release_date <= vsc.end_release_date
+      AND
+      vs.end_release_date >= vsc.start_release_date
+    GROUP BY
+      vs.variation_id,
+      vsc.start_release_date,
+      vsc.end_release_date,
+      vs.statement_type,
+      vs.gks_proposition_type,
+      vs.clinical_impact_assertion_type,
+      vs.clinical_impact_clinical_significance,
+      vs.rank,
+      vs.classif_type,
+      vs.clinsig_type
+  )
+  -- Re-join the SCVs to each group to aggregate counts, latest dates and the
+  -- createSigType() result across every classif_type of the group.
+  SELECT
+    x.start_release_date,
+    x.end_release_date,
+    x.variation_id,
+    x.statement_type,
+    x.gks_proposition_type,
+    x.clinical_impact_assertion_type,
+    x.clinical_impact_clinical_significance,
+    x.rank,
+    COUNT(DISTINCT vs.clinsig_type) as unique_clinsig_type_count,
+    SUM(DISTINCT IF(vs.clinsig_type=2,4,IF(vs.clinsig_type=1,2,1))) as agg_sig_type,
+    `clinvar_ingest.createSigType`(
+      COUNT(DISTINCT IF(vs.clinsig_type = 0, vs.submitter_id, NULL)),
+      COUNT(DISTINCT IF(vs.clinsig_type = 1, vs.submitter_id, NULL)),
+      COUNT(DISTINCT IF(vs.clinsig_type = 2, vs.submitter_id, NULL))
+    ) as sig_type,
+    MAX(vs.last_evaluated) as max_last_evaluated,
+    MAX(vs.submission_date) as max_submission_date,
+    COUNT(DISTINCT vs.id) as submission_count,
+    COUNT(DISTINCT vs.submitter_id) as submitter_count,
+    STRING_AGG(DISTINCT x.classif_type, '/' ORDER BY x.classif_type) AS agg_classif,
+    STRING_AGG(DISTINCT x.classif_type_w_count, '/' ORDER BY x.classif_type_w_count) AS agg_classif_w_count
+  FROM x
+  JOIN `clinvar_ingest.voi_scv` vs
+  ON
+    vs.variation_id = x.variation_id
+    AND
+    vs.statement_type IS NOT DISTINCT FROM x.statement_type
+    AND
+    vs.gks_proposition_type IS NOT DISTINCT FROM x.gks_proposition_type
+    AND
+    vs.clinical_impact_assertion_type IS NOT DISTINCT FROM x.clinical_impact_assertion_type
+    AND
+    vs.clinical_impact_clinical_significance IS NOT DISTINCT FROM x.clinical_impact_clinical_significance
+    AND
+    vs.rank IS NOT DISTINCT FROM x.rank
+    AND
+    vs.start_release_date <= x.end_release_date
+    AND
+    vs.end_release_date >= x.start_release_date
+  GROUP BY
+    x.variation_id,
+    x.start_release_date,
+    x.end_release_date,
+    x.statement_type,
+    x.gks_proposition_type,
+    x.clinical_impact_assertion_type,
+    x.clinical_impact_clinical_significance,
+    x.rank
+  ;
+
+  -- voi_scv_release_type_rank
+  -- voi_scv_group: one row per (voi_group row, overlapping SCV) with a display label and bucket.
+  CREATE OR REPLACE TABLE `clinvar_ingest.voi_scv_group`
+  AS
+  SELECT
+    vg.start_release_date,
+    vg.end_release_date,
+    vs.variation_id,
+    vs.id,
+    vs.version,
+    vs.statement_type,
+    vs.gks_proposition_type,
+    vs.clinical_impact_assertion_type,
+    vs.clinical_impact_clinical_significance,
+    vg.rank,
+    vg.sig_type[OFFSET(vs.clinsig_type)].percent as outlier_pct,
+    -- vg.cvc_sig_type[OFFSET(vs.clinsig_type)].percent as cvc_outlier_pct,
+    FORMAT("%s (%s) %3.0f%% %s",
+      IFNULL(vs.submitter_abbrev,LEFT(vs.submitter_name,15)),
+      vs.classification_abbrev,
+      vg.sig_type[OFFSET(vs.clinsig_type)].percent*100,
+      vs.full_scv_id) as scv_label,
+    -- NOTE(review): the v2 voi_scv table has no rpt_stmt_type column; gks_proposition_type is
+    -- assumed to carry the same 'path'/'dr' codes -- confirm against clinvar_clinsig_types.
+    CASE vs.gks_proposition_type
+      WHEN 'path' THEN
+        CASE vs.clinsig_type
+          WHEN 2 THEN '1-PLP'
+          WHEN 1 THEN '2-VUS'
+          WHEN 0 THEN '3-BLB'
+          ELSE '5-???' END
+      WHEN 'dr' THEN "4-ADDT'L"
+      ELSE "4-ADDT'L" END as scv_group_type
+  FROM `clinvar_ingest.voi_group` vg
+  JOIN `clinvar_ingest.voi_scv` vs
+  ON
+    vg.variation_id = vs.variation_id
+    AND
+    vg.statement_type IS NOT DISTINCT FROM vs.statement_type
+    AND
+    vg.gks_proposition_type IS NOT DISTINCT FROM vs.gks_proposition_type
+    AND
+    vg.clinical_impact_assertion_type IS NOT DISTINCT FROM vs.clinical_impact_assertion_type
+    AND
+    vg.clinical_impact_clinical_significance IS NOT DISTINCT FROM vs.clinical_impact_clinical_significance
+    AND
+    vg.rank IS NOT DISTINCT FROM vs.rank
+    AND
+    vg.start_release_date <= vs.end_release_date
+    AND
+    vg.end_release_date >= vs.start_release_date
+  ;
+
+END;
+
+
+-- find intersection between voi and voi_scv windows for the same variant to create the voi_group records
+-- date window intersection is found by using the condition ((start_window1 <= end_window2) AND (end_window1 >= start_window2))
+-- the start and end dates are always inclusive, meaning the start date is the date that the record is first available and
+-- the end date is the date that the record is last available.
+-- https://stackoverflow.com/questions/325933/determine-whether-two-date-ranges-overlap
+-- (s1 <= eX) AND (e1 >= sX)
+
+-- A             s1--------------e1
+--     |----|----|----|----|----|----|----|
+-- B        s2------e2                         s1 <= e2 AND e1 >= s2. TRUE
+-- C       s3------------------------e3        s1 <= e3 AND e1 >= s3. TRUE
+-- D                 s4------e4                s1 <= e4 AND e1 >= s4. TRUE
+-- E                       s5----------e5      s1 <= e5 AND e1 >= s5. TRUE
+-- F   s6--e6                                  s1 <= e6 AND e1 >= s6. FALSE
+-- G.                                s7--e7    s1 <= e7 AND e1 >= s7. FALSE
\ No newline at end of file
diff --git a/scripts/temporal-data-collection/13-voi-group-change-proc.sql b/scripts/temporal-data-summation/04-voi-group-change-proc.sql
similarity index 52%
rename from scripts/temporal-data-collection/13-voi-group-change-proc.sql
rename to scripts/temporal-data-summation/04-voi-group-change-proc.sql
index b36acb4..07da127 100644
--- a/scripts/temporal-data-collection/13-voi-group-change-proc.sql
+++ b/scripts/temporal-data-summation/04-voi-group-change-proc.sql
@@ -2,78 +2,96 @@
 CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_group_change`()
 BEGIN
 
-  CREATE TEMP TABLE _SESSION.release_start_vg AS
-  select
+  CREATE TEMP TABLE _SESSION.release_start_vg
+  AS
+  SELECT
     st.start_release_date,
     st.variation_id,
     st.rpt_stmt_type,
     st.rank,
-    row_number () over (order by st.variation_id, st.rpt_stmt_type, st.rank, st.start_release_date asc nulls first) as rownum
-  from (
-    select
+    row_number () OVER (
+      ORDER BY
+        st.variation_id,
+        st.rpt_stmt_type,
+        st.rank,
+        st.start_release_date ASC NULLS FIRST
+    ) as rownum
+  FROM (
+    SELECT
       vg.start_release_date,
       vg.variation_id,
       vg.rpt_stmt_type,
       vg.rank
-    from `clinvar_ingest.voi_group` vg
+    FROM `clinvar_ingest.voi_group` vg
     UNION DISTINCT
-    select
+    SELECT
       MIN(r.release_date) as start_release_date,
       variation_id,
       rpt_stmt_type,
       rank
-    from `clinvar_ingest.voi_group` vg
-    left join `clinvar_ingest.clinvar_releases` r
+    FROM `clinvar_ingest.voi_group` vg
+    LEFT JOIN `clinvar_ingest.clinvar_releases`
r + ON r.release_date > vg.end_release_date - group by + GROUP BY vg.end_release_date, vg.variation_id, vg.rpt_stmt_type, vg.rank ) st; - CREATE TEMP TABLE _SESSION.release_end_vg AS - select + CREATE TEMP TABLE _SESSION.release_end_vg + AS + SELECT en.end_release_date, en.variation_id, en.rpt_stmt_type, en.rank, - row_number () over (order by en.variation_id, en.rpt_stmt_type, en.rank, en.end_release_date asc nulls last) as rownum - from ( - select + row_number () OVER ( + ORDER BY + en.variation_id, + en.rpt_stmt_type, + en.rank, + en.end_release_date ASC NULLS LAST + ) as rownum + FROM ( + SELECT end_release_date, variation_id, rpt_stmt_type, rank - from `clinvar_ingest.voi_group` vg + FROM `clinvar_ingest.voi_group` vg UNION DISTINCT - select + SELECT MAX(r.release_date) as end_release_date, variation_id, rpt_stmt_type, rank - from `clinvar_ingest.voi_group` vg - left join `clinvar_ingest.clinvar_releases` r - on + FROM `clinvar_ingest.voi_group` vg + LEFT JOIN `clinvar_ingest.clinvar_releases` r + ON r.release_date < vg.start_release_date - group by + GROUP BY vg.start_release_date, vg.variation_id, vg.rpt_stmt_type, vg.rank ) en; - CREATE OR REPLACE TABLE `clinvar_ingest.voi_group_change` AS - select + CREATE OR REPLACE TABLE `clinvar_ingest.voi_group_change` + AS + SELECT e.variation_id, e.rpt_stmt_type, e.rank, s.start_release_date, e.end_release_date - from _SESSION.release_start_vg s - join _SESSION.release_end_vg e on e.rownum = s.rownum + 1 - where e.variation_id = s.variation_id; + FROM _SESSION.release_start_vg s + JOIN _SESSION.release_end_vg e + ON + e.rownum = s.rownum + 1 + WHERE + e.variation_id = s.variation_id; DROP TABLE _SESSION.release_start_vg; DROP TABLE _SESSION.release_end_vg; diff --git a/scripts/temporal-data-summation/04-voi-group-change-v2-proc.sql b/scripts/temporal-data-summation/04-voi-group-change-v2-proc.sql new file mode 100644 index 0000000..c558484 --- /dev/null +++ 
b/scripts/temporal-data-summation/04-voi-group-change-v2-proc.sql
@@ -0,0 +1,144 @@
+-- Rebuilds `clinvar_ingest.voi_group_change` (v2 variant) from `clinvar_ingest.voi_group`.
+-- Group key: variation_id, statement_type, gks_proposition_type,
+-- clinical_impact_assertion_type, clinical_impact_clinical_significance, rank.
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_group_change_v2`()
+BEGIN
+
+  -- Candidate starts per group: every voi_group start date plus the first release
+  -- after each voi_group end date (NULL when no later release exists).
+  CREATE TEMP TABLE _SESSION.release_start_vg
+  AS
+  SELECT
+    st.start_release_date,
+    st.variation_id,
+    st.statement_type,
+    st.gks_proposition_type,
+    st.clinical_impact_assertion_type,
+    st.clinical_impact_clinical_significance,
+    st.rank,
+    row_number () OVER (
+      -- order by the full v2 group key, mirroring release_end_vg below
+      ORDER BY
+        st.variation_id,
+        st.statement_type,
+        st.gks_proposition_type,
+        st.clinical_impact_assertion_type,
+        st.clinical_impact_clinical_significance,
+        st.rank,
+        st.start_release_date ASC NULLS FIRST
+    ) as rownum
+  FROM (
+    SELECT
+      vg.start_release_date,
+      vg.variation_id,
+      vg.statement_type,
+      vg.gks_proposition_type,
+      vg.clinical_impact_assertion_type,
+      vg.clinical_impact_clinical_significance,
+      vg.rank
+    FROM `clinvar_ingest.voi_group` vg
+    UNION DISTINCT
+    SELECT
+      MIN(r.release_date) as start_release_date,
+      vg.variation_id,
+      vg.statement_type,
+      vg.gks_proposition_type,
+      vg.clinical_impact_assertion_type,
+      vg.clinical_impact_clinical_significance,
+      vg.rank
+    FROM `clinvar_ingest.voi_group` vg
+    LEFT JOIN `clinvar_ingest.clinvar_releases` r
+    ON
+      r.release_date > vg.end_release_date
+    GROUP BY
+      vg.end_release_date,
+      vg.variation_id,
+      vg.statement_type,
+      vg.gks_proposition_type,
+      vg.clinical_impact_assertion_type,
+      vg.clinical_impact_clinical_significance,
+      vg.rank
+  ) st;
+
+  -- Candidate ends per group: every voi_group end date plus the last release
+  -- before each voi_group start date (NULL when no earlier release exists).
+  CREATE TEMP TABLE _SESSION.release_end_vg
+  AS
+  SELECT
+    en.end_release_date,
+    en.variation_id,
+    en.statement_type,
+    en.gks_proposition_type,
+    en.clinical_impact_assertion_type,
+    en.clinical_impact_clinical_significance,
+    en.rank,
+    row_number () OVER (
+      ORDER BY
+        en.variation_id,
+        en.statement_type,
+        en.gks_proposition_type,
+        en.clinical_impact_assertion_type,
+        en.clinical_impact_clinical_significance,
+        en.rank,
+        en.end_release_date ASC NULLS LAST
+    ) as rownum
+  FROM (
+    SELECT
+      end_release_date,
+      variation_id,
+      statement_type,
+      gks_proposition_type,
+      clinical_impact_assertion_type,
+      clinical_impact_clinical_significance,
+      rank
+    FROM `clinvar_ingest.voi_group` vg
+    UNION DISTINCT
+    SELECT
+      MAX(r.release_date) as end_release_date,
+      variation_id,
+      statement_type,
+      gks_proposition_type,
+      clinical_impact_assertion_type,
+      clinical_impact_clinical_significance,
+      rank
+    FROM `clinvar_ingest.voi_group` vg
+    LEFT JOIN `clinvar_ingest.clinvar_releases` r
+    ON
+      r.release_date < vg.start_release_date
+    GROUP BY
+      vg.start_release_date,
+      vg.variation_id,
+      vg.statement_type,
+      vg.gks_proposition_type,
+      vg.clinical_impact_assertion_type,
+      vg.clinical_impact_clinical_significance,
+      vg.rank
+  ) en;
+
+  -- Pair start row n with end row n+1 (row numbers are global) and keep pairs
+  -- from the same variation.
+  -- NOTE(review): only variation_id is compared although rows are ordered by the
+  -- full group key -- confirm pairs that straddle two groups are intended.
+  CREATE OR REPLACE TABLE `clinvar_ingest.voi_group_change`
+  AS
+  SELECT
+    e.variation_id,
+    e.statement_type,
+    e.gks_proposition_type,
+    e.clinical_impact_assertion_type,
+    e.clinical_impact_clinical_significance,
+    e.rank,
+    s.start_release_date,
+    e.end_release_date
+  FROM _SESSION.release_start_vg s
+  JOIN _SESSION.release_end_vg e
+  ON
+    e.rownum = s.rownum + 1
+  WHERE
+    e.variation_id = s.variation_id;
+
+  -- Remove the session temp tables created above.
+  DROP TABLE _SESSION.release_start_vg;
+  DROP TABLE _SESSION.release_end_vg;
+
+END;
diff --git a/scripts/temporal-data-collection/14-voi-top-group-change-proc.sql b/scripts/temporal-data-summation/05-voi-top-group-change-proc.sql
similarity index 64%
rename from scripts/temporal-data-collection/14-voi-top-group-change-proc.sql
rename to scripts/temporal-data-summation/05-voi-top-group-change-proc.sql
index 7280fd0..569610d 100644
--- a/scripts/temporal-data-collection/14-voi-top-group-change-proc.sql
+++ b/scripts/temporal-data-summation/05-voi-top-group-change-proc.sql
@@ -2,9 +2,9 @@
 CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_top_group_change`()
 BEGIN
 
-  CREATE TEMP TABLE _SESSION.voi_top_group AS
-  WITH
-  x AS (
+  CREATE TEMP TABLE _SESSION.voi_top_group
+  AS
+  WITH x AS (
     SELECT
       variation_id,
       rpt_stmt_type,
@@ -17,7 +17,8 @@ BEGIN
       variation_id,
       start_release_date,
       end_release_date,
-      rpt_stmt_type)
+      rpt_stmt_type
+  )
   SELECT
     x.start_release_date,
     x.end_release_date,
@@ -26,31 +27,37 @@ BEGIN
     x.top_rank
   FROM x;
 
-  CREATE TEMP TABLE
_SESSION.release_start_tg AS - select + CREATE TEMP TABLE _SESSION.release_start_tg + AS + SELECT st.start_release_date, st.variation_id, st.rpt_stmt_type, st.top_rank, - row_number () over (order by st.variation_id, st.rpt_stmt_type, st.start_release_date asc nulls first) as rownum - from ( - select + row_number () OVER ( + ORDER BY + st.variation_id, + st.rpt_stmt_type, + st.start_release_date ASC NULLS FIRST + ) as rownum + FROM ( + SELECT vtg.start_release_date, vtg.variation_id, vtg.rpt_stmt_type, vtg.top_rank - from _SESSION.voi_top_group vtg + FROM _SESSION.voi_top_group vtg UNION DISTINCT - select + SELECT MIN(r.release_date) as start_release_date, vtg.variation_id, vtg.rpt_stmt_type, vtg.top_rank - from _SESSION.voi_top_group vtg - left join `clinvar_ingest.clinvar_releases` r - on + FROM _SESSION.voi_top_group vtg + LEFT JOIN `clinvar_ingest.clinvar_releases` r + ON r.release_date > vtg.end_release_date - group by + GROUP BY vtg.end_release_date, vtg.variation_id, vtg.rpt_stmt_type, @@ -63,41 +70,49 @@ BEGIN en.variation_id, en.rpt_stmt_type, en.top_rank, - row_number () over (order by en.variation_id, en.rpt_stmt_type, en.end_release_date asc nulls last) as rownum - from ( - select + row_number () OVER ( + ORDER BY + en.variation_id, + en.rpt_stmt_type, + en.end_release_date ASC NULLS LAST + ) as rownum + FROM ( + SELECT vtg.end_release_date, vtg.variation_id, vtg.rpt_stmt_type, vtg.top_rank - from _SESSION.voi_top_group vtg + FROM _SESSION.voi_top_group vtg UNION DISTINCT - select + SELECT MAX(r.release_date) as end_release_date, vtg.variation_id, vtg.rpt_stmt_type, vtg.top_rank - from _SESSION.voi_top_group vtg - left join `clinvar_ingest.clinvar_releases` r - on + FROM _SESSION.voi_top_group vtg + LEFT JOIN `clinvar_ingest.clinvar_releases` r + ON r.release_date < vtg.start_release_date - group by + GROUP BY vtg.start_release_date, vtg.variation_id, vtg.rpt_stmt_type, vtg.top_rank ) en; - CREATE OR REPLACE TABLE `clinvar_ingest.voi_top_group_change` 
AS
-  select
+  CREATE OR REPLACE TABLE `clinvar_ingest.voi_top_group_change`
+  AS
+  SELECT
     e.variation_id,
     e.rpt_stmt_type,
     e.top_rank,
     s.start_release_date,
     e.end_release_date
-  from _SESSION.release_start_tg s
-  join _SESSION.release_end_tg e on e.rownum = s.rownum + 1
-  where e.variation_id = s.variation_id;
+  FROM _SESSION.release_start_tg s
+  JOIN _SESSION.release_end_tg e
+  ON
+    e.rownum = s.rownum + 1
+  WHERE e.variation_id = s.variation_id;
 
   DROP TABLE _SESSION.voi_top_group;
   DROP TABLE _SESSION.release_start_tg;
diff --git a/scripts/temporal-data-summation/05-voi-top-group-change-v2-proc.sql b/scripts/temporal-data-summation/05-voi-top-group-change-v2-proc.sql
new file mode 100644
index 0000000..e46821d
--- /dev/null
+++ b/scripts/temporal-data-summation/05-voi-top-group-change-v2-proc.sql
@@ -0,0 +1,174 @@
+-- Rebuilds `clinvar_ingest.voi_top_group_change` (v2 variant) from `clinvar_ingest.voi_group`.
+-- Statement key: statement_type, gks_proposition_type, clinical_impact_assertion_type,
+-- clinical_impact_clinical_significance. Takes no arguments.
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_top_group_change_v2`()
+BEGIN
+
+  -- Top (maximum) rank per variation, statement key and voi_group date window.
+  CREATE TEMP TABLE _SESSION.voi_top_group
+  AS
+  WITH x AS (
+    SELECT
+      variation_id,
+      statement_type,
+      gks_proposition_type,
+      clinical_impact_assertion_type,
+      clinical_impact_clinical_significance,
+      start_release_date,
+      end_release_date,
+      MAX(rank) AS top_rank
+    FROM
+      `clinvar_ingest.voi_group` vg
+    GROUP BY
+      variation_id,
+      start_release_date,
+      end_release_date,
+      statement_type,
+      gks_proposition_type,
+      clinical_impact_assertion_type,
+      clinical_impact_clinical_significance
+  )
+  SELECT
+    x.start_release_date,
+    x.end_release_date,
+    x.variation_id,
+    x.statement_type,
+    x.gks_proposition_type,
+    x.clinical_impact_assertion_type,
+    x.clinical_impact_clinical_significance,
+    x.top_rank
+  FROM x;
+
+  -- Candidate starts: every window start plus the first release after each window end
+  -- (NULL when no later release exists); top_rank is carried but is not part of the ordering.
+  CREATE TEMP TABLE _SESSION.release_start_tg
+  AS
+  SELECT
+    st.start_release_date,
+    st.variation_id,
+    st.statement_type,
+    st.gks_proposition_type,
+    st.clinical_impact_assertion_type,
+    st.clinical_impact_clinical_significance,
+    st.top_rank,
+    row_number () OVER (
+      ORDER BY
+        st.variation_id,
+        st.statement_type,
+        st.gks_proposition_type,
+        st.clinical_impact_assertion_type,
+        st.clinical_impact_clinical_significance,
+        st.start_release_date asc nulls first
+    ) as rownum
+  FROM (
+    SELECT
+      vtg.start_release_date,
+      vtg.variation_id,
+      vtg.statement_type,
+      vtg.gks_proposition_type,
+      vtg.clinical_impact_assertion_type,
+      vtg.clinical_impact_clinical_significance,
+      vtg.top_rank
+    FROM _SESSION.voi_top_group vtg
+    UNION DISTINCT
+    SELECT
+      MIN(r.release_date) as start_release_date,
+      vtg.variation_id,
+      vtg.statement_type,
+      vtg.gks_proposition_type,
+      vtg.clinical_impact_assertion_type,
+      vtg.clinical_impact_clinical_significance,
+      vtg.top_rank
+    FROM _SESSION.voi_top_group vtg
+    LEFT JOIN `clinvar_ingest.clinvar_releases` r
+    ON
+      r.release_date > vtg.end_release_date
+    GROUP BY
+      vtg.end_release_date,
+      vtg.variation_id,
+      vtg.statement_type,
+      vtg.gks_proposition_type,
+      vtg.clinical_impact_assertion_type,
+      vtg.clinical_impact_clinical_significance,
+      vtg.top_rank
+  ) st;
+
+  -- Candidate ends: every window end plus the last release before each window start
+  -- (NULL when no earlier release exists); same ordering key as the starts.
+  CREATE TEMP TABLE _SESSION.release_end_tg AS
+  SELECT
+    en.end_release_date,
+    en.variation_id,
+    en.statement_type,
+    en.gks_proposition_type,
+    en.clinical_impact_assertion_type,
+    en.clinical_impact_clinical_significance,
+    en.top_rank,
+    row_number () over (
+      ORDER BY
+        en.variation_id,
+        en.statement_type,
+        en.gks_proposition_type,
+        en.clinical_impact_assertion_type,
+        en.clinical_impact_clinical_significance,
+        en.end_release_date asc nulls last
+    ) as rownum
+  FROM (
+    SELECT
+      vtg.end_release_date,
+      vtg.variation_id,
+      vtg.statement_type,
+      vtg.gks_proposition_type,
+      vtg.clinical_impact_assertion_type,
+      vtg.clinical_impact_clinical_significance,
+      vtg.top_rank
+    FROM _SESSION.voi_top_group vtg
+    UNION DISTINCT
+    SELECT
+      MAX(r.release_date) as end_release_date,
+      vtg.variation_id,
+      vtg.statement_type,
+      vtg.gks_proposition_type,
+      vtg.clinical_impact_assertion_type,
+      vtg.clinical_impact_clinical_significance,
+      vtg.top_rank
+    FROM _SESSION.voi_top_group vtg
+    LEFT JOIN `clinvar_ingest.clinvar_releases` r
+    ON
+      r.release_date < vtg.start_release_date
+    GROUP BY
+      vtg.start_release_date,
+      vtg.variation_id,
+      vtg.statement_type,
+      vtg.gks_proposition_type,
+      vtg.clinical_impact_assertion_type,
+      vtg.clinical_impact_clinical_significance,
+      vtg.top_rank
+  ) en;
+
+  -- Pair start row n with end row n+1 (row numbers are global) and keep the
+  -- pairs whose two rows belong to the same variation.
+  CREATE OR REPLACE TABLE `clinvar_ingest.voi_top_group_change`
+  AS
+  SELECT
+    e.variation_id,
+    e.statement_type,
+    e.gks_proposition_type,
+    e.clinical_impact_assertion_type,
+    e.clinical_impact_clinical_significance,
+    e.top_rank,
+    s.start_release_date,
+    e.end_release_date
+  FROM _SESSION.release_start_tg s
+  JOIN _SESSION.release_end_tg e
+  ON
+    e.rownum = s.rownum + 1
+  WHERE
+    e.variation_id = s.variation_id;
+
+  -- Remove the session temp tables created above.
+  DROP TABLE _SESSION.voi_top_group;
+  DROP TABLE _SESSION.release_start_tg;
+  DROP TABLE _SESSION.release_end_tg;
+
+END;
\ No newline at end of file
diff --git a/scripts/temporal-data-summation/06-voi-summary-change-proc.sql b/scripts/temporal-data-summation/06-voi-summary-change-proc.sql
new file mode 100644
index 0000000..872c069
--- /dev/null
+++ b/scripts/temporal-data-summation/06-voi-summary-change-proc.sql
@@ -0,0 +1,100 @@
+-- Rebuilds `clinvar_ingest.voi_summary_change`: per-variation date windows derived from the
+-- distinct (variation_id, start, end) windows of `clinvar_ingest.voi_top_group_change`.
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_summary_change`()
+BEGIN
+
+  -- Distinct (variation_id, start_release_date, end_release_date) windows.
+  CREATE TEMP TABLE _SESSION.voi_summary
+  AS
+  SELECT
+    variation_id,
+    start_release_date,
+    end_release_date
+  FROM
+    `clinvar_ingest.voi_top_group_change` vtg
+  GROUP BY
+    variation_id,
+    start_release_date,
+    end_release_date;
+
+  -- Candidate starts per variation: every window start plus the first release
+  -- after each window end (NULL when no later release exists).
+  CREATE TEMP TABLE _SESSION.release_start_vs
+  AS
+  SELECT
+    st.start_release_date,
+    st.variation_id,
+    row_number () OVER (
+      ORDER BY
+        st.variation_id,
+        st.start_release_date ASC NULLS FIRST
+    ) as rownum
+  FROM (
+    SELECT
+      vs.start_release_date,
+      vs.variation_id
+    FROM _SESSION.voi_summary vs
+    UNION DISTINCT
+    SELECT
+      MIN(r.release_date) as start_release_date,
+      vs.variation_id
+    FROM _SESSION.voi_summary vs
+    LEFT JOIN `clinvar_ingest.clinvar_releases` r
+    ON
+      r.release_date > vs.end_release_date
+    GROUP BY
+      vs.end_release_date,
+      vs.variation_id
+  ) st;
+
+  -- Candidate ends per variation: every window end plus the last release
+  -- before each window start (NULL when no earlier release exists).
+  CREATE TEMP TABLE _SESSION.release_end_vs
+  AS
+  SELECT
+    en.end_release_date,
+    en.variation_id,
+    row_number () OVER (
+      ORDER BY
+        en.variation_id,
+        en.end_release_date ASC NULLS LAST
+    ) as rownum
+  FROM (
+    SELECT
+      vs.end_release_date,
+      vs.variation_id
+    FROM _SESSION.voi_summary vs
+    UNION DISTINCT
+    SELECT
+      MAX(r.release_date) as end_release_date,
+      vs.variation_id
+    FROM _SESSION.voi_summary vs
+    LEFT JOIN `clinvar_ingest.clinvar_releases` r
+    ON
+      r.release_date < vs.start_release_date
+    GROUP BY
+      vs.start_release_date,
+      vs.variation_id
+  ) en;
+
+
+  -- Pair start row n with end row n+1 and keep pairs from the same variation.
+  CREATE OR REPLACE TABLE `clinvar_ingest.voi_summary_change`
+  AS
+  SELECT
+    e.variation_id,
+    s.start_release_date,
+    e.end_release_date
+  FROM _SESSION.release_start_vs s
+  JOIN _SESSION.release_end_vs e
+  ON
+    e.rownum = s.rownum + 1
+  WHERE
+    e.variation_id = s.variation_id;
+
+  -- Remove the session temp tables created above.
+  DROP TABLE _SESSION.voi_summary;
+  DROP TABLE _SESSION.release_start_vs;
+  DROP TABLE _SESSION.release_end_vs;
+
+END;
\ No newline at end of file
diff --git a/scripts/temporal-data-summation/06-voi-summary-change-v2-proc.sql b/scripts/temporal-data-summation/06-voi-summary-change-v2-proc.sql
new file mode 100644
index 0000000..b954855
--- /dev/null
+++ b/scripts/temporal-data-summation/06-voi-summary-change-v2-proc.sql
@@ -0,0 +1,96 @@
+-- Rebuilds `clinvar_ingest.voi_summary_change` (v2 entry point): per-variation date windows
+-- derived from the distinct windows of `clinvar_ingest.voi_top_group_change`, then paired
+-- start/end rows by row number, as in `clinvar_ingest.voi_summary_change`.
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.voi_summary_change_v2`()
+BEGIN
+
+  -- Distinct (variation_id, start_release_date, end_release_date) windows.
+  CREATE TEMP TABLE _SESSION.voi_summary
+  AS
+  SELECT
+    variation_id,
+    start_release_date,
+    end_release_date
+  FROM
+    `clinvar_ingest.voi_top_group_change` vtg
+  GROUP BY
+    variation_id,
+    start_release_date,
+    end_release_date;
+
+  -- Candidate starts per variation: every window start plus the first release
+  -- after each window end (NULL when no later release exists).
+  CREATE TEMP TABLE _SESSION.release_start_vs
+  AS
+  SELECT
+    st.start_release_date,
+    st.variation_id,
+    row_number () OVER (
+      ORDER BY
+        st.variation_id,
+        st.start_release_date ASC NULLS FIRST
+    ) as rownum
+  FROM (
+    SELECT
+      vs.start_release_date,
+      vs.variation_id
+    FROM
_SESSION.voi_summary vs + UNION DISTINCT + SELECT + MIN(r.release_date) as start_release_date, + vs.variation_id + FROM _SESSION.voi_summary vs + LEFT join `clinvar_ingest.clinvar_releases` r + ON + r.release_date > vs.end_release_date + GROUP BY + vs.end_release_date, + vs.variation_id + ) st; + + CREATE TEMP TABLE _SESSION.release_end_vs + AS + SELECT + en.end_release_date, + en.variation_id, + row_number () OVER ( + ORDER BY + en.variation_id, + en.end_release_date ASC NULLS LAST + ) as rownum + FROM ( + SELECT + vs.end_release_date, + vs.variation_id + FROM _SESSION.voi_summary vs + UNION DISTINCT + SELECT + MAX(r.release_date) as end_release_date, + vs.variation_id + FROM _SESSION.voi_summary vs + LEFT JOIN `clinvar_ingest.clinvar_releases` r + ON + r.release_date < vs.start_release_date + GROUP BY + vs.start_release_date, + vs.variation_id + ) en; + + CREATE OR REPLACE TABLE `clinvar_ingest.voi_summary_change` + AS + SELECT + e.variation_id, + s.start_release_date, + e.end_release_date + FROM _SESSION.release_start_vs s + JOIN _SESSION.release_end_vs e + ON + e.rownum = s.rownum + 1 + WHERE + e.variation_id = s.variation_id; + + DROP TABLE _SESSION.voi_summary; + DROP TABLE _SESSION.release_start_vs; + DROP TABLE _SESSION.release_end_vs; + +END; \ No newline at end of file diff --git a/scripts/temporal-data-summation/temporal-data-summation-proc.sql b/scripts/temporal-data-summation/temporal-data-summation-proc.sql new file mode 100644 index 0000000..0f49498 --- /dev/null +++ b/scripts/temporal-data-summation/temporal-data-summation-proc.sql @@ -0,0 +1,9 @@ +CREATE OR REPLACE PROCEDURE `clingen-dev.clinvar_ingest.temporal_data_summation`() +BEGIN + CALL `clinvar_ingest.clinvar_var_scv_change`(); + CALL `clinvar_ingest.voi_vcv_scv`(); + CALL `clinvar_ingest.voi_and_voi_scv_group`(); + CALL `clinvar_ingest.voi_group_change`(); + CALL `clinvar_ingest.voi_top_group_change`(); + CALL `clinvar_ingest.voi_summary_change`(); +END; \ No newline at end of file diff 
--git a/scripts/temporal-data-summation/temporal-data-summation-v2-proc.sql b/scripts/temporal-data-summation/temporal-data-summation-v2-proc.sql
new file mode 100644
index 0000000..f7406e0
--- /dev/null
+++ b/scripts/temporal-data-summation/temporal-data-summation-v2-proc.sql
@@ -0,0 +1,9 @@
+-- Runs the v2 temporal data summation procedures one after another, in the order listed.
+-- `clinvar_var_scv_change` has no _v2 suffix here; every later step calls its _v2 procedure.
+-- NOTE(review): the procedure name is pinned to the `clingen-dev` project while the
+-- CALLs below are project-relative; confirm this is intended for other projects.
+CREATE OR REPLACE PROCEDURE `clingen-dev.clinvar_ingest.temporal_data_summation_v2`()
+BEGIN
+  CALL `clinvar_ingest.clinvar_var_scv_change`();
+  CALL `clinvar_ingest.voi_vcv_scv_v2`();
+  CALL `clinvar_ingest.voi_and_voi_scv_group_v2`();
+  CALL `clinvar_ingest.voi_group_change_v2`();
+  CALL `clinvar_ingest.voi_top_group_change_v2`();
+  CALL `clinvar_ingest.voi_summary_change_v2`();
+END;
\ No newline at end of file
diff --git a/scripts/tracker-report-update/02-tracker-reports-rebuild-proc.sql b/scripts/tracker-report-update/02-tracker-reports-rebuild-proc.sql index 5335b4d..4612c3a 100644 --- a/scripts/tracker-report-update/02-tracker-reports-rebuild-proc.sql +++ b/scripts/tracker-report-update/02-tracker-reports-rebuild-proc.sql @@ -3,43 +3,43 @@ BEGIN DECLARE disable_out_of_date_alerts BOOLEAN DEFAULT FALSE; FOR rec IN ( - select + SELECT r.id, r.name, r.abbrev, lower(format("%s_%s", r.id, r.abbrev)) as tname, ARRAY_AGG( STRUCT(ro.name, ro.value) ) as opts - from `variation_tracker.report` r - join `variation_tracker.report_submitter` rs - on + FROM `variation_tracker.report` r + JOIN `variation_tracker.report_submitter` rs + ON rs.report_id = r.id and rs.active - left join `variation_tracker.report_option` ro - on + LEFT JOIN `variation_tracker.report_option` ro + ON ro.report_id = r.id - group by + GROUP BY r.id, r.name, r.abbrev ) DO - SET disable_out_of_date_alerts = - ( - SELECT - CAST( - IFNULL( - ( - SELECT - opt.value - FROM UNNEST(rec.opts) as opt - WHERE opt.name = "DISABLE_OUT_OF_DATE_ALERTS" - ), - "FALSE" - ) AS BOOL - ) - ); + SET disable_out_of_date_alerts = ( + SELECT + CAST( + IFNULL( + ( + SELECT + opt.value + FROM UNNEST(rec.opts) as opt + WHERE opt.name =
"DISABLE_OUT_OF_DATE_ALERTS" + ), + "FALSE" + ) AS BOOL + ) + ); EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `variation_tracker.%s_variation` AS + CREATE OR REPLACE TABLE `variation_tracker.%s_variation` + AS SELECT rv.report_id, cv.release_date as report_release_date, @@ -53,12 +53,14 @@ BEGIN vg.variation_id = rv.variation_id JOIN `clinvar_ingest.all_schemas`() cv ON - cv.release_date between vg.start_release_date and vg.end_release_date - WHERE rv.report_id = "%s" + cv.release_date BETWEEN vg.start_release_date AND vg.end_release_date + WHERE + rv.report_id = "%s" """, rec.tname, rec.id); EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `variation_tracker.%s_scv` AS + CREATE OR REPLACE TABLE `variation_tracker.%s_scv` + AS SELECT rv.report_id, rv.report_release_date, @@ -74,21 +76,30 @@ BEGIN FROM `variation_tracker.%s_variation` rv JOIN `clinvar_ingest.voi_scv_group` vsg ON - vsg.variation_id = rv.variation_id AND - vsg.rpt_stmt_type = rv.rpt_stmt_type AND - vsg.rank = rv.rank AND - rv.report_release_date between vsg.start_release_date AND vsg.end_release_date + vsg.variation_id = rv.variation_id + AND + vsg.rpt_stmt_type = rv.rpt_stmt_type + AND + vsg.rank = rv.rank + AND + rv.report_release_date BETWEEN vsg.start_release_date AND vsg.end_release_date JOIN `clinvar_ingest.voi_scv` vs ON - vs.variation_id = vsg.variation_id AND - vs.id = vsg.id AND - vs.version = vsg.version AND - vs.rpt_stmt_type = vsg.rpt_stmt_type AND - vs.rank = vsg.rank AND - rv.report_release_date between vs.start_release_date and vs.end_release_date + vs.variation_id = vsg.variation_id + AND + vs.id = vsg.id + AND + vs.version = vsg.version + AND + vs.rpt_stmt_type IS NOT DISTINCT FROM vsg.rpt_stmt_type + AND + vs.rank IS NOT DISTINCT FROM vsg.rank + AND + rv.report_release_date BETWEEN vs.start_release_date AND vs.end_release_date LEFT JOIN `variation_tracker.report_submitter` rs ON - rs.report_id = rv.report_id AND + rs.report_id = rv.report_id + AND vs.submitter_id = 
rs.submitter_id """, rec.tname, rec.tname); @@ -99,16 +110,19 @@ BEGIN WHERE EXISTS ( SELECT scv.variation_id FROM `variation_tracker.%s_scv` scv - WHERE scv.report_submitter_submission - AND scv.report_release_date = v.report_release_date - AND v.variation_id = scv.variation_id + WHERE + scv.report_submitter_submission + AND + scv.report_release_date = v.report_release_date + AND + v.variation_id = scv.variation_id ) """, rec.tname, rec.tname); EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `variation_tracker.%s_alerts` AS - WITH x AS - ( + CREATE OR REPLACE TABLE `variation_tracker.%s_alerts` + AS + WITH x AS ( SELECT v.symbol as gene_symbol, v.name, @@ -136,27 +150,37 @@ BEGIN FROM `variation_tracker.%s_scv` scv JOIN `variation_tracker.%s_variation` var ON - scv.variation_id = var.variation_id and - scv.report_release_date = var.report_release_date and - scv.rpt_stmt_type = var.rpt_stmt_type and - scv.rank = var.rank + scv.variation_id = var.variation_id + AND + scv.report_release_date = var.report_release_date + AND + scv.rpt_stmt_type IS NOT DISTINCT FROM var.rpt_stmt_type + AND + scv.rank IS NOT DISTINCT FROM var.rank JOIN `clinvar_ingest.clinvar_status` revstat ON revstat.rank = scv.rank and revstat.scv JOIN `clinvar_ingest.voi` v ON - v.variation_id = scv.variation_id AND - scv.report_release_date between v.start_release_date and v.end_release_date + v.variation_id = scv.variation_id + AND + scv.report_release_date BETWEEN v.start_release_date AND v.end_release_date LEFT JOIN `clinvar_ingest.voi_vcv` vv ON - scv.variation_id =vv.variation_id and - scv.report_release_date between vv.start_release_date and vv.end_release_date - JOIN `clinvar_ingest.voi_scv` vs on - vs.variation_id = scv.variation_id AND - vs.id = scv.id AND - vs.version = scv.version AND - scv.report_release_date between vs.start_release_date and vs.end_release_date - WHERE var.report_submitter_variation + scv.variation_id =vv.variation_id + AND + scv.report_release_date BETWEEN 
vv.start_release_date AND vv.end_release_date + JOIN `clinvar_ingest.voi_scv` vs + ON + vs.variation_id = scv.variation_id + AND + vs.id = scv.id + AND + vs.version = scv.version + AND + scv.report_release_date BETWEEN vs.start_release_date AND vs.end_release_date + WHERE + var.report_submitter_variation ) SELECT vcep.gene_symbol, @@ -181,18 +205,18 @@ BEGIN vcep.last_eval_age as submitted_last_eval_age, vcep.released_date as submitted_released_date, vcep.released_age as submitted_released_age, - case - when (vcep.clinsig_type = 2 AND other.clinsig_type <> 2) THEN - "P/LP vs Newer VUS/B/LB" - when (vcep.clinsig_type = 1 AND other.clinsig_type = 2) THEN - "VUS vs Newer P/LP" - when (vcep.clinsig_type = 1 AND other.clinsig_type = 0) THEN - "VUS vs Newer B/LB" - when (vcep.clinsig_type = 0 AND other.clinsig_type = 1) THEN - "B/LB vs Newer VUS" - when (vcep.clinsig_type = 0 AND other.clinsig_type = 2) THEN - "B/LB vs Newer P/LP" - end as alert_type, + CASE + WHEN (vcep.clinsig_type = 2 AND other.clinsig_type <> 2) THEN + "P/LP vs Newer VUS/B/LB" + WHEN (vcep.clinsig_type = 1 AND other.clinsig_type = 2) THEN + "VUS vs Newer P/LP" + WHEN (vcep.clinsig_type = 1 AND other.clinsig_type = 0) THEN + "VUS vs Newer B/LB" + WHEN (vcep.clinsig_type = 0 AND other.clinsig_type = 1) THEN + "B/LB vs Newer VUS" + WHEN (vcep.clinsig_type = 0 AND other.clinsig_type = 2) THEN + "B/LB vs Newer P/LP" + END as alert_type, other.id as other_scv_id, other.version as other_scv_version, other.full_scv_id as other_full_scv_id, @@ -215,14 +239,19 @@ BEGIN FROM x as vcep JOIN x as other ON - other.variation_id = vcep.variation_id AND - other.rpt_stmt_type = vcep.rpt_stmt_type AND - NOT other.report_submitter_submission AND - other.report_release_date = vcep.report_release_date AND + other.variation_id = vcep.variation_id + AND + other.rpt_stmt_type = vcep.rpt_stmt_type + AND + NOT other.report_submitter_submission + AND + other.report_release_date = vcep.report_release_date + AND 
other.clinsig_type <> vcep.clinsig_type -- -- find all other submissions that have a last eval that is newer than 1 year prior to the EPs submission's last eval date WHERE - vcep.report_submitter_submission AND + vcep.report_submitter_submission + AND (vcep.last_eval_age - other.last_eval_age) >= 0 UNION ALL SELECT @@ -270,14 +299,18 @@ BEGIN null as newer_released_age FROM x as vcep WHERE - NOT %t AND - vcep.report_submitter_submission AND - vcep.last_eval_age >= 730 AND + NOT %t + AND + vcep.report_submitter_submission + AND + vcep.last_eval_age >= 730 + AND vcep.classif_type NOT IN ('p','lb','b') """, rec.tname, rec.tname, rec.tname, disable_out_of_date_alerts); EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `variation_tracker.%s_var_priorities` AS + CREATE OR REPLACE TABLE `variation_tracker.%s_var_priorities` + AS WITH x AS ( SELECT @@ -288,33 +321,38 @@ BEGIN sum(vg.sig_type[OFFSET(1)].count) as unc_sig_cnt, sum(vg.sig_type[OFFSET(2)].count) as sig_cnt, CASE - WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN - 7 - WHEN (sum(vg.sig_type[OFFSET(0)].count)=0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN - 6 - WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)=0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN - 5 - WHEN (sum(vg.sig_type[OFFSET(0)].count)=0 AND sum(vg.sig_type[OFFSET(1)].count)=0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN - 4 - WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)=0) THEN - 3 - WHEN (sum(vg.sig_type[OFFSET(0)].count)=0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)=0) THEN - 2 - WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)=0 AND sum(vg.sig_type[OFFSET(2)].count)=0) THEN - 1 - ELSE - 0 - END as agg_sig_type, - max(vg.rank) as max_rank + WHEN 
(sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN + 7 + WHEN (sum(vg.sig_type[OFFSET(0)].count)=0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN + 6 + WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)=0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN + 5 + WHEN (sum(vg.sig_type[OFFSET(0)].count)=0 AND sum(vg.sig_type[OFFSET(1)].count)=0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN + 4 + WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)=0) THEN + 3 + WHEN (sum(vg.sig_type[OFFSET(0)].count)=0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)=0) THEN + 2 + WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)=0 AND sum(vg.sig_type[OFFSET(2)].count)=0) THEN + 1 + ELSE + 0 + END as agg_sig_type, + MAX(vg.rank) as max_rank FROM `variation_tracker.%s_variation` v JOIN `clinvar_ingest.voi_group` vg ON - v.variation_id = vg.variation_id and - v.rpt_stmt_type = vg.rpt_stmt_type and - v.rank = vg.rank and - v.report_release_date between vg.start_release_date and vg.end_release_date - WHERE NOT v.report_submitter_variation - AND v.rpt_stmt_type = 'path' + v.variation_id = vg.variation_id + AND + v.rpt_stmt_type = vg.rpt_stmt_type + AND + v.rank = vg.rank + AND + v.report_release_date BETWEEN vg.start_release_date AND vg.end_release_date + WHERE + NOT v.report_submitter_variation + AND + v.rpt_stmt_type = 'path' GROUP BY v.variation_id, v.rpt_stmt_type, @@ -334,14 +372,18 @@ BEGIN select IF(x.max_rank = 0 and x.agg_sig_type >= 4, 'No criteria PLP', NULL)),','),',') as priority_type FROM x WHERE ( - (x.agg_sig_type = 2 AND x.unc_sig_cnt > 2) OR - (x.agg_sig_type IN ( 3, 7 )) OR - (x.agg_sig_type > 4) OR + (x.agg_sig_type = 2 AND x.unc_sig_cnt > 2) + OR + (x.agg_sig_type IN ( 3, 7 )) + OR + (x.agg_sig_type > 4) + OR 
(x.max_rank = 0 and x.agg_sig_type >= 4)) """, rec.tname, rec.tname); EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `variation_tracker.%s_scv_priorities` AS + CREATE OR REPLACE TABLE `variation_tracker.%s_scv_priorities` + AS SELECT vp.report_release_date, vp.variation_id, @@ -366,23 +408,30 @@ BEGIN CROSS JOIN UNNEST(vp.priority_type) as p_type JOIN `variation_tracker.%s_scv` scv ON - vp.variation_id = scv.variation_id and + vp.variation_id = scv.variation_id + AND vp.report_release_date = scv.report_release_date JOIN `clinvar_ingest.voi_scv_group` sgrp ON - scv.id = sgrp.id and - scv.version = sgrp.version and - scv.rpt_stmt_type = sgrp.rpt_stmt_type and - scv.rank = sgrp.rank and - scv.report_release_date between sgrp.start_release_date and sgrp.end_release_date + scv.id = sgrp.id + AND + scv.version = sgrp.version + AND + scv.rpt_stmt_type = sgrp.rpt_stmt_type + AND + scv.rank = sgrp.rank + AND + scv.report_release_date BETWEEN sgrp.start_release_date AND sgrp.end_release_date JOIN `clinvar_ingest.voi` v ON - vp.variation_id = v.variation_id and - vp.report_release_date between v.start_release_date and v.end_release_date + vp.variation_id = v.variation_id + AND + vp.report_release_date BETWEEN v.start_release_date AND v.end_release_date LEFT JOIN `clinvar_ingest.voi_vcv` vv ON - vp.variation_id =vv.variation_id and - vp.report_release_date between vv.start_release_date and vv.end_release_date + vp.variation_id =vv.variation_id + AND + vp.report_release_date BETWEEN vv.start_release_date AND vv.end_release_date JOIN ( select diff --git a/scripts/tracker-report-update/02-tracker-reports-rebuild-v2-proc.sql b/scripts/tracker-report-update/02-tracker-reports-rebuild-v2-proc.sql new file mode 100644 index 0000000..7766020 --- /dev/null +++ b/scripts/tracker-report-update/02-tracker-reports-rebuild-v2-proc.sql @@ -0,0 +1,499 @@ +CREATE OR REPLACE PROCEDURE `variation_tracker.tracker_reports_rebuild_v2`() +BEGIN + DECLARE disable_out_of_date_alerts BOOLEAN 
DEFAULT FALSE; + FOR rec IN + ( + SELECT + r.id, + r.name, + r.abbrev, + lower(format("%s_%s", r.id, r.abbrev)) as tname, + ARRAY_AGG( STRUCT(ro.name, ro.value) ) as opts + FROM `variation_tracker.report` r + JOIN `variation_tracker.report_submitter` rs + ON + rs.report_id = r.id and rs.active + LEFT JOIN `variation_tracker.report_option` ro + ON + ro.report_id = r.id + GROUP BY + r.id, + r.name, + r.abbrev + ) + DO + SET disable_out_of_date_alerts = ( + SELECT + CAST( + IFNULL( + ( + SELECT + opt.value + FROM UNNEST(rec.opts) as opt + WHERE opt.name = "DISABLE_OUT_OF_DATE_ALERTS" + ), + "FALSE" + ) AS BOOL + ) + ); + + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TABLE `variation_tracker.%s_variation` + AS + SELECT + rv.report_id, + cv.release_date as report_release_date, + rv.variation_id, + vg.statement_type, + vg.gks_proposition_type, + vg.clinical_impact_assertion_type, + vg.clinical_impact_clinical_significance, + vg.rank, + FALSE as report_submitter_variation + FROM `variation_tracker.report_variation` rv + JOIN `clinvar_ingest.voi_group` vg + ON + vg.variation_id = rv.variation_id + JOIN `clinvar_ingest.all_schemas`() cv + ON + cv.release_date BETWEEN vg.start_release_date AND vg.end_release_date + WHERE + rv.report_id = "%s" + """, rec.tname, rec.id); + + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TABLE `variation_tracker.%s_scv` + AS + SELECT + rv.report_id, + rv.report_release_date, + rv.variation_id, + rv.statement_type, + rv.gks_proposition_type, + rv.clinical_impact_assertion_type, + rv.clinical_impact_clinical_significance, + rv.rank, + vsg.id, + vsg.version, + DATE_DIFF(rv.report_release_date, vs.last_evaluated, DAY) as last_eval_age, + DATE_DIFF(rv.report_release_date, vs.start_release_date, DAY) as released_age, + DATE_DIFF(rv.report_release_date, vs.submission_date, DAY) as submission_age, + (rs.submitter_id is not NULL) as report_submitter_submission, + FROM `variation_tracker.%s_variation` rv + JOIN `clinvar_ingest.voi_scv_group` vsg 
+ ON + vsg.variation_id = rv.variation_id + AND + vsg.statement_type IS NOT DISTINCT FROM rv.statement_type + AND + vsg.gks_proposition_type IS NOT DISTINCT FROM rv.gks_proposition_type + AND + vsg.clinical_impact_assertion_type IS NOT DISTINCT FROM rv.clinical_impact_assertion_type + AND + vsg.clinical_impact_clinical_significance IS NOT DISTINCT FROM rv.clinical_impact_clinical_significance + AND + vsg.rank IS NOT DISTINCT FROM rv.rank + AND + rv.report_release_date BETWEEN vsg.start_release_date AND vsg.end_release_date + JOIN `clinvar_ingest.voi_scv` vs + ON + vs.variation_id = vsg.variation_id + AND + vs.id = vsg.id + AND + vs.version = vsg.version + AND + vs.statement_type IS NOT DISTINCT FROM vsg.statement_type + AND + vs.gks_proposition_type IS NOT DISTINCT FROM vsg.gks_proposition_type + AND + vs.clinical_impact_assertion_type IS NOT DISTINCT FROM vsg.clinical_impact_assertion_type + AND + vs.clinical_impact_clinical_significance IS NOT DISTINCT FROM vsg.clinical_impact_clinical_significance + AND + vs.rank IS NOT DISTINCT FROM vsg.rank + AND + rv.report_release_date BETWEEN vs.start_release_date AND vs.end_release_date + LEFT JOIN `variation_tracker.report_submitter` rs + ON + rs.report_id = rv.report_id + AND + vs.submitter_id = rs.submitter_id + """, rec.tname, rec.tname); + + -- add convenience control attribute to represent variations that the report_submitter has submitted on at a given point in time + EXECUTE IMMEDIATE FORMAT(""" + UPDATE `variation_tracker.%s_variation` v + SET report_submitter_variation = TRUE + WHERE EXISTS ( + SELECT scv.variation_id + FROM `variation_tracker.%s_scv` scv + WHERE + scv.report_submitter_submission + AND + scv.report_release_date IS NOT DISTINCT FROM v.report_release_date + AND + v.variation_id = scv.variation_id + ) + """, rec.tname, rec.tname); + + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TABLE `variation_tracker.%s_alerts` + AS + WITH x AS + ( + SELECT + v.symbol as gene_symbol, + v.name, + 
scv.variation_id, + vv.id||'.'||vv.version as full_vcv_id, + scv.statement_type, + scv.gks_proposition_type, + scv.clinical_impact_assertion_type, + scv.clinical_impact_clinical_significance, + scv.rank, + scv.report_release_date, + scv.id, scv.version, + vs.full_scv_id, + revstat.label as review_status, + vs.submitter_id, + vs.submission_date, + scv.submission_age, + vs.last_evaluated, + scv.last_eval_age, + vs.start_release_date as released_date, + scv.released_age, + vs.clinsig_type, + vs.classif_type, + vs.classification_abbrev, + vs.submitter_name, + vs.submitter_abbrev, + scv.report_submitter_submission + FROM `variation_tracker.%s_scv` scv + JOIN `variation_tracker.%s_variation` var + ON + scv.variation_id = var.variation_id + AND + scv.report_release_date IS NOT DISTINCT FROM var.report_release_date + AND + scv.statement_type IS NOT DISTINCT FROM var.statement_type + AND + scv.gks_proposition_type IS NOT DISTINCT FROM var.gks_proposition_type + AND + scv.clinical_impact_assertion_type IS NOT DISTINCT FROM var.clinical_impact_assertion_type + AND + scv.clinical_impact_clinical_significance IS NOT DISTINCT FROM var.clinical_impact_clinical_significance + AND + scv.rank IS NOT DISTINCT FROM var.rank + JOIN `clinvar_ingest.clinvar_status` revstat + ON + revstat.rank = scv.rank and revstat.scv + JOIN `clinvar_ingest.voi` v + ON + v.variation_id = scv.variation_id + AND + scv.report_release_date BETWEEN v.start_release_date AND v.end_release_date + LEFT JOIN `clinvar_ingest.voi_vcv` vv + ON + scv.variation_id = vv.variation_id + AND + scv.report_release_date BETWEEN vv.start_release_date AND vv.end_release_date + JOIN `clinvar_ingest.voi_scv` vs + ON + vs.variation_id = scv.variation_id + AND + vs.id = scv.id + AND + vs.version = scv.version + AND + scv.report_release_date BETWEEN vs.start_release_date AND vs.end_release_date + WHERE + var.report_submitter_variation + ) + SELECT + vcep.gene_symbol, + vcep.name, + vcep.variation_id, + vcep.full_vcv_id, + 
vcep.report_release_date, + vcep.statement_type, + vcep.gks_proposition_type, + vcep.clinical_impact_assertion_type, + vcep.clinical_impact_clinical_significance, + vcep.id as submitted_scv_id, + vcep.version as submitted_scv_version, + vcep.full_scv_id as submitted_full_scv_id, + vcep.rank as submitted_rank, + vcep.review_status as submitted_review_status, + vcep.clinsig_type as submitted_clinsig_type, + vcep.classif_type as submitted_classif_type, + vcep.submitter_abbrev as submitted_submitter_abbrev, + vcep.submitter_name as submitted_submitter_name, + vcep.classification_abbrev as submitted_classif_abbrev, + vcep.submission_date as submitted_submission_date, + vcep.submission_age as submitted_submission_age, + vcep.last_evaluated as submitted_last_evaluated, + vcep.last_eval_age as submitted_last_eval_age, + vcep.released_date as submitted_released_date, + vcep.released_age as submitted_released_age, + CASE + WHEN (vcep.clinsig_type = 2 AND other.clinsig_type <> 2) THEN + "P/LP vs Newer VUS/B/LB" + WHEN (vcep.clinsig_type = 1 AND other.clinsig_type = 2) THEN + "VUS vs Newer P/LP" + WHEN (vcep.clinsig_type = 1 AND other.clinsig_type = 0) THEN + "VUS vs Newer B/LB" + WHEN (vcep.clinsig_type = 0 AND other.clinsig_type = 1) THEN + "B/LB vs Newer VUS" + WHEN (vcep.clinsig_type = 0 AND other.clinsig_type = 2) THEN + "B/LB vs Newer P/LP" + END as alert_type, + other.id as other_scv_id, + other.version as other_scv_version, + other.full_scv_id as other_full_scv_id, + other.rank as other_rank, + other.review_status as other_review_status, + other.clinsig_type as other_clinsig_type, + other.classif_type as other_classif_type, + other.submitter_abbrev as other_submitter_abbrev, + other.submitter_name as other_submitter_name, + other.classification_abbrev as other_classif_abbrev, + other.submission_date as other_submission_date, + other.submission_age as other_submission_age, + other.last_evaluated as other_last_evaluated, + other.last_eval_age as other_last_eval_age, + 
other.released_date as other_released_date,
+        other.released_age as other_released_age,
+        (vcep.submission_age - other.submission_age) as newer_submission_age,
+        (vcep.last_eval_age - other.last_eval_age) as newer_last_eval_age,
+        (vcep.released_age - other.released_age) as newer_released_age
+      FROM x as vcep
+      JOIN x as other
+      ON
+        other.variation_id = vcep.variation_id
+        AND
+        other.statement_type IS NOT DISTINCT FROM vcep.statement_type
+        AND
+        other.gks_proposition_type IS NOT DISTINCT FROM vcep.gks_proposition_type
+        AND
+        other.clinical_impact_assertion_type IS NOT DISTINCT FROM vcep.clinical_impact_assertion_type
+        AND
+        other.clinical_impact_clinical_significance IS NOT DISTINCT FROM vcep.clinical_impact_clinical_significance
+        AND
+        NOT other.report_submitter_submission
+        AND
+        other.report_release_date IS NOT DISTINCT FROM vcep.report_release_date
+        AND
+        other.clinsig_type IS DISTINCT FROM vcep.clinsig_type
+      -- keep only pairs where the other submission's last evaluation is the same as or
+      -- newer than the report submitter's (difference of the last_eval_age day counts is >= 0)
+      WHERE
+        vcep.report_submitter_submission
+        AND
+        (vcep.last_eval_age - other.last_eval_age) >= 0
+      UNION ALL
+      -- second branch: must list the same columns, in the same order, as the branch above;
+      -- x exposes the four statement/proposition columns, not the v1 rpt_stmt_type
+      SELECT
+        vcep.gene_symbol,
+        vcep.name,
+        vcep.variation_id,
+        vcep.full_vcv_id,
+        vcep.report_release_date,
+        vcep.statement_type,
+        vcep.gks_proposition_type,
+        vcep.clinical_impact_assertion_type,
+        vcep.clinical_impact_clinical_significance,
+        vcep.id as submitted_scv_id,
+        vcep.version as submitted_scv_version,
+        vcep.full_scv_id as submitted_full_scv_id,
+        vcep.rank as submitted_rank,
+        vcep.review_status as submitted_review_status,
+        vcep.clinsig_type as submitted_clinsig_type,
+        vcep.classif_type as submitted_classif_type,
+        vcep.submitter_abbrev as submitted_submitter_abbrev,
+        vcep.submitter_name as submitted_submitter_name,
+        vcep.classification_abbrev as submitted_classif_abbrev,
+        vcep.submission_date as submitted_submission_date,
+        vcep.submission_age as submitted_submission_age,
+        vcep.last_evaluated as submitted_last_evaluated,
+        vcep.last_eval_age as submitted_last_eval_age,
+        vcep.released_date as
submitted_released_date,
+        vcep.released_age as submitted_released_age,
+        "Out of Date" as alert_type,
+        null as other_scv_id,
+        null as other_scv_version,
+        null as other_full_scv_id,
+        null as other_rank,
+        null as other_review_status,
+        null as other_clinsig_type,
+        null as other_classif_type,
+        null as other_submitter_abbrev,
+        null as other_submitter_name,
+        null as other_classif_abbrev,
+        null as other_submission_date,
+        null as other_submission_age,
+        null as other_last_evaluated,
+        null as other_last_eval_age,
+        null as other_released_date,
+        null as other_released_age,
+        null as newer_submission_age,
+        null as newer_last_eval_age,
+        null as newer_released_age
+      FROM x as vcep
+      WHERE
+        NOT %t
+        AND
+        vcep.report_submitter_submission
+        AND
+        vcep.last_eval_age >= 730
+        AND
+        vcep.classif_type NOT IN ('p','lb','b')
+    """, rec.tname, rec.tname, rec.tname, disable_out_of_date_alerts);
+
+    -- Priority flags per variation the report submitter has not submitted on.
+    -- The <tname>_variation table built earlier in this procedure has no rpt_stmt_type column,
+    -- so rows are matched and grouped on the four statement/proposition columns instead.
+    -- NOTE(review): assumes the v1 filter rpt_stmt_type = 'path' corresponds to
+    -- gks_proposition_type = 'path'; confirm against the proposition type codes.
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE OR REPLACE TABLE `variation_tracker.%s_var_priorities`
+      AS
+      WITH x AS
+      (
+        SELECT
+          v.variation_id,
+          v.statement_type,
+          v.gks_proposition_type,
+          v.clinical_impact_assertion_type,
+          v.clinical_impact_clinical_significance,
+          v.report_release_date,
+          sum(vg.sig_type[OFFSET(0)].count) as no_sig_cnt,
+          sum(vg.sig_type[OFFSET(1)].count) as unc_sig_cnt,
+          sum(vg.sig_type[OFFSET(2)].count) as sig_cnt,
+          CASE
+            WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN
+              7
+            WHEN (sum(vg.sig_type[OFFSET(0)].count)=0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN
+              6
+            WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)=0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN
+              5
+            WHEN (sum(vg.sig_type[OFFSET(0)].count)=0 AND sum(vg.sig_type[OFFSET(1)].count)=0 AND sum(vg.sig_type[OFFSET(2)].count)>0) THEN
+              4
+            WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)=0) THEN
+              3
+            WHEN (sum(vg.sig_type[OFFSET(0)].count)=0 AND sum(vg.sig_type[OFFSET(1)].count)>0 AND sum(vg.sig_type[OFFSET(2)].count)=0) THEN
+              2
+            WHEN (sum(vg.sig_type[OFFSET(0)].count)>0 AND sum(vg.sig_type[OFFSET(1)].count)=0 AND sum(vg.sig_type[OFFSET(2)].count)=0) THEN
+              1
+            ELSE
+              0
+          END as agg_sig_type,
+          MAX(vg.rank) as max_rank
+        FROM `variation_tracker.%s_variation` v
+        JOIN `clinvar_ingest.voi_group` vg
+        ON
+          v.variation_id = vg.variation_id
+          AND
+          v.statement_type IS NOT DISTINCT FROM vg.statement_type
+          AND
+          v.gks_proposition_type IS NOT DISTINCT FROM vg.gks_proposition_type
+          AND
+          v.clinical_impact_assertion_type IS NOT DISTINCT FROM vg.clinical_impact_assertion_type
+          AND
+          v.clinical_impact_clinical_significance IS NOT DISTINCT FROM vg.clinical_impact_clinical_significance
+          AND
+          v.rank = vg.rank
+          AND
+          v.report_release_date BETWEEN vg.start_release_date AND vg.end_release_date
+        WHERE
+          NOT v.report_submitter_variation
+          AND
+          v.gks_proposition_type = 'path'
+        GROUP BY
+          v.variation_id,
+          v.statement_type,
+          v.gks_proposition_type,
+          v.clinical_impact_assertion_type,
+          v.clinical_impact_clinical_significance,
+          v.report_release_date
+      )
+      SELECT
+        x.variation_id,
+        x.statement_type,
+        x.gks_proposition_type,
+        x.clinical_impact_assertion_type,
+        x.clinical_impact_clinical_significance,
+        x.report_release_date,
+        x.agg_sig_type,
+        x.no_sig_cnt, x.unc_sig_cnt, x.sig_cnt,
+        x.max_rank,
+        SPLIT(ARRAY_TO_STRING(ARRAY(
+          select IF(x.agg_sig_type = 2 AND x.unc_sig_cnt > 2,'VUS priority',NULL) UNION ALL
+          select IF(x.agg_sig_type IN ( 3, 7 ), 'VUS vs LBB', NULL) UNION ALL
+          select IF(x.agg_sig_type > 4, 'PLP vs VUSLBB', NULL) UNION ALL
+          select IF(x.max_rank = 0 and x.agg_sig_type >= 4, 'No criteria PLP', NULL)),','),',') as priority_type
+      FROM x
+      WHERE (
+        (x.agg_sig_type = 2 AND x.unc_sig_cnt > 2)
+        OR
+        (x.agg_sig_type IN ( 3, 7 ))
+        OR
+        (x.agg_sig_type > 4)
+        OR
+        (x.max_rank = 0 and x.agg_sig_type >= 4))
+    """, rec.tname, rec.tname);
+
+    -- One row per prioritized variation, priority type and SCV. The SCV-to-group join
+    -- uses the same four statement/proposition columns as the <tname>_scv table.
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE OR REPLACE TABLE `variation_tracker.%s_scv_priorities`
+      AS
+      SELECT
+        vp.report_release_date,
+        vp.variation_id,
+        vp.statement_type,
+        vp.gks_proposition_type,
+        vp.clinical_impact_assertion_type,
+        vp.clinical_impact_clinical_significance,
+        vp.max_rank,
+        p_type,
+        scv.rank as scv_rank,
+        scv.id as scv_id,
+        scv.version as scv_ver,
+        sgrp.outlier_pct,
+        sgrp.scv_group_type,
+        sgrp.scv_label,
+        v.name,
+        v.symbol as gene_symbol,
+        v.mane_select,
+        vv.id as vcv_id,
+        vv.version as vcv_ver,
+        vv.rank as vcv_rank,
+        vv.agg_classification as vcv_classification,
+        rel.next_release_date
+      FROM `variation_tracker.%s_var_priorities` vp
+      CROSS JOIN UNNEST(vp.priority_type) as p_type
+      JOIN `variation_tracker.%s_scv` scv
+      ON
+        vp.variation_id = scv.variation_id
+        AND
+        vp.report_release_date = scv.report_release_date
+      JOIN `clinvar_ingest.voi_scv_group` sgrp
+      ON
+        scv.id = sgrp.id
+        AND
+        scv.version = sgrp.version
+        AND
+        scv.statement_type IS NOT DISTINCT FROM sgrp.statement_type
+        AND
+        scv.gks_proposition_type IS NOT DISTINCT FROM sgrp.gks_proposition_type
+        AND
+        scv.clinical_impact_assertion_type IS NOT DISTINCT FROM sgrp.clinical_impact_assertion_type
+        AND
+        scv.clinical_impact_clinical_significance IS NOT DISTINCT FROM sgrp.clinical_impact_clinical_significance
+        AND
+        scv.rank IS NOT DISTINCT FROM sgrp.rank
+        AND
+        scv.report_release_date BETWEEN sgrp.start_release_date AND sgrp.end_release_date
+      JOIN `clinvar_ingest.voi` v
+      ON
+        vp.variation_id = v.variation_id
+        AND
+        vp.report_release_date BETWEEN v.start_release_date AND v.end_release_date
+      LEFT JOIN `clinvar_ingest.voi_vcv` vv
+      ON
+        vp.variation_id =vv.variation_id
+        AND
+        vp.report_release_date BETWEEN vv.start_release_date AND vv.end_release_date
+      JOIN
+      (
+        select
+          release_date,
+          IF(next_release_date = DATE'9999-12-31', CURRENT_DATE(), next_release_date) next_release_date
+        FROM `clinvar_ingest.schemas_on_or_after`(clinvar_ingest.cvc_project_start_date())
+      ) rel
+      ON
+        vp.report_release_date = rel.release_date
+    """, rec.tname, rec.tname, rec.tname);
+
+  END FOR;
+
+END;
+
+
+
+-- https://stackoverflow.com/questions/325933/determine-whether-two-date-ranges-overlap
+-- (s1 <= eX) AND (e1 >= sX)
+
+-- A s1--------------e1
+-- |----|----|----|----|----|----|----|
+-- B s2------e2 s1 <= e2 AND e1 >= s2. TRUE
+-- C s3------------------------e3 s1 <= e3 AND e1 >= s3. TRUE
+-- D s4------e4 s1 <= e4 AND e1 >= s4. TRUE
+-- E s5----------e5 s1 <= e5 AND e1 >= s5. TRUE
+-- F s6--e6 s1 <= e6 AND e1 >= s6. FALSE
+-- G. s7--e7 s1 <= e7 AND e1 >= s7.
FALSE \ No newline at end of file diff --git a/scripts/tracker-report-update/03-gc-tracker-report-proc.sql b/scripts/tracker-report-update/03-gc-tracker-report-proc.sql index 7d45ec5..e0c5a43 100644 --- a/scripts/tracker-report-update/03-gc-tracker-report-proc.sql +++ b/scripts/tracker-report-update/03-gc-tracker-report-proc.sql @@ -1,14 +1,21 @@ -CREATE OR REPLACE PROCEDURE `variation_tracker.gc_tracker_report_rebuild`( - schema_name STRING, - release_date DATE -) +CREATE OR REPLACE PROCEDURE `variation_tracker.gc_tracker_report_rebuild`() BEGIN + DECLARE release_date DATE; + DECLARE + SET release_date = ( + SELECT max(end_release_date) + FROM `clinvar_ingest.voi_scv_group` + ); + SET schema_name = ( + SELECT schema_name + FROM `clinvar_ingest.schema_on`(release_date) + ); -- vceps for current release EXECUTE IMMEDIATE FORMAT(""" CREATE OR REPLACE TEMP TABLE vcep AS - select + SELECT scv.variation_id, scv.submitter_id, FORMAT("%%s.%%i", scv.id, scv.version) as scv_acxn, @@ -16,166 +23,172 @@ BEGIN scv.classif_type, scv.submitted_classification, scv.last_evaluated - from `variation_tracker.report_submitter` rs - join `%s.scv_summary` scv - on + FROM `variation_tracker.report_submitter` rs + JOIN `%s.scv_summary` scv + ON scv.submitter_id = rs.submitter_id - where + WHERE rs.type = "VCEP" - and - rs.submitter_id is not null + AND + rs.submitter_id IS NOT NULL """, schema_name); -- gc scv info for current release EXECUTE IMMEDIATE FORMAT(""" CREATE OR REPLACE TEMP TABLE gc_scv AS - select - scv.submitter_id, - scv.variation_id, - gscv.id, - FORMAT("%%s.%%i", gscv.id, gscv.version) as scv_acxn, - IF(scv.local_key IS NULL, NULL, SPLIT(scv.local_key, "|")[0]) as local_key, - scv.local_key as local_key_orig, - scv.date_created as first_in_clinvar, - scv.classification_comment, - COUNT(IFNULL(gscv.lab_id,gscv.lab_name)) as case_count - from `%s.gc_scv` gscv - join `%s.scv_summary` scv - on - scv.id = gscv.id - where - -- these are the dupe gc submissions that are older - 
gscv.id not in ( - "SCV000607136","SCV000986740", - "SCV000986708","SCV000986786", - "SCV000986705","SCV000986788", - "SCV000986813","SCV000607109" - ) - group by - scv.submitter_id, - scv.variation_id, - gscv.id, - gscv.version, - scv.local_key, - scv.date_created, - scv.classification_comment + SELECT + scv.submitter_id, + scv.variation_id, + gscv.id, + FORMAT("%%s.%%i", gscv.id, gscv.version) as scv_acxn, + IF(scv.local_key IS NULL, NULL, SPLIT(scv.local_key, "|")[0]) as local_key, + scv.local_key as local_key_orig, + scv.date_created as first_in_clinvar, + scv.classification_comment, + COUNT(IFNULL(gscv.lab_id,gscv.lab_name)) as case_count + FROM `%s.gc_scv` gscv + JOIN `%s.scv_summary` scv + ON + scv.id = gscv.id + WHERE + -- these are the dupe gc submissions that are older + gscv.id NOT IN ( + "SCV000607136","SCV000986740", + "SCV000986708","SCV000986786", + "SCV000986705","SCV000986788", + "SCV000986813","SCV000607109" + ) + GROUP BY + scv.submitter_id, + scv.variation_id, + gscv.id, + gscv.version, + scv.local_key, + scv.date_created, + scv.classification_comment """, schema_name, schema_name); -- gc scv w/ agg info for current release EXECUTE IMMEDIATE FORMAT(""" CREATE OR REPLACE TEMP TABLE gc AS - select - gc_scv.submitter_id, - gc_scv.variation_id, - g.hgnc_id, - g.symbol, - v.name, - cvcv.agg_classification, - cvcv.rank, - gc_scv.scv_acxn, - gc_scv.local_key, - gc_scv.case_count, - gc_scv.first_in_clinvar, - gc_scv.classification_comment - from gc_scv - join `%s.variation` v - on - v.id = gc_scv.variation_id - left join `%s.single_gene_variation` sgv - on - sgv.variation_id = gc_scv.variation_id - left join `%s.gene` g - on - sgv.gene_id = g.id - join `clinvar_ingest.clinvar_vcvs` cvcv - on - cvcv.variation_id = gc_scv.variation_id - and - %T between cvcv.start_release_date and cvcv.end_release_date - """, schema_name, schema_name, schema_name, release_date); + SELECT + gc_scv.submitter_id, + gc_scv.variation_id, + g.hgnc_id, + g.symbol, + v.name, + 
vcv.interp_description as agg_classification, + cvs1.rank, + gc_scv.scv_acxn, + gc_scv.local_key, + gc_scv.case_count, + gc_scv.first_in_clinvar, + gc_scv.classification_comment + FROM gc_scv + JOIN `%s.variation` v + ON + v.id = gc_scv.variation_id + LEFT JOIN `%s.single_gene_variation` sgv + ON + sgv.variation_id = gc_scv.variation_id + LEFT JOIN `%s.gene` g + ON + sgv.gene_id = g.id + JOIN `%s.variation_archive` vcv + ON + vcv.variation_id = gc_scv.variation_id + LEFT JOIN `clinvar_ingest.clinvar_status` cvs1 + ON + cvs1.label = vcv.review_status + """, schema_name, schema_name, schema_name, schema_name); -- gc case info for current release EXECUTE IMMEDIATE FORMAT(""" CREATE OR REPLACE TEMP TABLE gc_case AS - select - gscv.variation_id, - gc_scv.scv_acxn, - gc_scv.local_key, - gc_scv.local_key_orig, - gscv.lab_name, - gscv.lab_id, - gscv.lab_classification, - gscv.lab_classif_type, - gscv.lab_date_reported, - gscv.sample_id, - IF(gscv.sample_id IS NULL, gc_scv.local_key, CONCAT(gc_scv.local_key, "|", gscv.sample_id)) as case_report_key - from gc_scv - join `%s.gc_scv` gscv - on - gc_scv.id = gscv.id + SELECT + gscv.variation_id, + gc_scv.scv_acxn, + gc_scv.local_key, + gc_scv.local_key_orig, + gscv.lab_name, + gscv.lab_id, + gscv.lab_classification, + gscv.lab_classif_type, + gscv.lab_date_reported, + gscv.sample_id, + IF( + gscv.sample_id IS NULL, + gc_scv.local_key, + CONCAT(gc_scv.local_key, "|", gscv.sample_id) + ) as case_report_key + FROM gc_scv + JOIN `%s.gc_scv` gscv + ON + gc_scv.id = gscv.id """, schema_name); -- gc case related lab info fo current release EXECUTE IMMEDIATE FORMAT(""" CREATE OR REPLACE TEMP TABLE lab_case AS - select - gc_case.variation_id, - gc_case.lab_id as submitter_id, - gc_case.case_report_key, - STRING_AGG(DISTINCT FORMAT("%%s.%%i", lab_scv.id, lab_scv.version)) as acxn, - STRING_AGG(DISTINCT lab_scv.classif_type ORDER BY lab_scv.classif_type) as classif_type, - STRING_AGG(DISTINCT lab_scv.submitted_classification ORDER BY
lab_scv.submitted_classification) as classification, - MIN(lab_scv.last_evaluated) as last_evaluated, - MIN(lab_scv.date_created) as first_in_clinvar, - COUNT(DISTINCT lab_scv.id) as scv_count - from gc_case - left join `%s.scv_summary` lab_scv - on - lab_scv.submitter_id = gc_case.lab_id and - lab_scv.variation_id = gc_case.variation_id - group by - gc_case.lab_id, - gc_case.variation_id, - gc_case.case_report_key + SELECT + gc_case.variation_id, + gc_case.lab_id as submitter_id, + gc_case.case_report_key, + STRING_AGG(DISTINCT FORMAT("%%s.%%i", lab_scv.id, lab_scv.version)) as acxn, + STRING_AGG(DISTINCT lab_scv.classif_type ORDER BY lab_scv.classif_type) as classif_type, + STRING_AGG(DISTINCT lab_scv.submitted_classification ORDER BY lab_scv.submitted_classification) as classification, + MIN(lab_scv.last_evaluated) as last_evaluated, + MIN(lab_scv.date_created) as first_in_clinvar, + COUNT(DISTINCT lab_scv.id) as scv_count + FROM gc_case + LEFT JOIN `%s.scv_summary` lab_scv + ON + lab_scv.submitter_id = gc_case.lab_id and + lab_scv.variation_id = gc_case.variation_id + GROUP BY + gc_case.lab_id, + gc_case.variation_id, + gc_case.case_report_key """, schema_name); -- gc var report EXECUTE IMMEDIATE FORMAT(""" CREATE OR REPLACE TEMP TABLE var AS - WITH v AS - ( - select - gc_scv.variation_id, - COUNT(gc_scv.id) as gc_scv_count, - MIN(vcv.date_created) as first_in_clinvar - from gc_scv - join `%s.variation_archive` vcv - on - vcv.variation_id = gc_scv.variation_id - group by - gc_scv.variation_id - ) - -- variation data related to single GC submitter's submissions - select - v.variation_id, - v.first_in_clinvar, - COUNT(distinct sgrp.id) as scv_count, - v.gc_scv_count, - STRING_AGG(split( sgrp.scv_label, "%%")[0]||"%%", "\\n" ORDER BY sgrp.rank desc, sgrp.scv_group_type, sgrp.scv_label) as all_scvs - from v - join `clinvar_ingest.voi_scv_group` sgrp - on - sgrp.variation_id = v.variation_id and - %T between sgrp.start_release_date and sgrp.end_release_date - group 
by - v.variation_id, - v.first_in_clinvar, - v.gc_scv_count + WITH v AS + ( + SELECT + gc_scv.variation_id, + COUNT(gc_scv.id) as gc_scv_count, + MIN(vcv.date_created) as first_in_clinvar + FROM gc_scv + JOIN `%s.variation_archive` vcv + ON + vcv.variation_id = gc_scv.variation_id + GROUP BY + gc_scv.variation_id + ) + -- variation data related to single GC submitter's submissions + SELECT + v.variation_id, + v.first_in_clinvar, + COUNT(distinct sgrp.id) as scv_count, + v.gc_scv_count, + STRING_AGG(split( sgrp.scv_label, "%%")[0]||"%%", "\\n" ORDER BY sgrp.rank desc, sgrp.scv_group_type, sgrp.scv_label) as all_scvs + FROM v + JOIN `clinvar_ingest.voi_scv_group` sgrp + ON + sgrp.variation_id = v.variation_id + AND + %T between sgrp.start_release_date and sgrp.end_release_date + GROUP BY + v.variation_id, + v.first_in_clinvar, + v.gc_scv_count """, schema_name, release_date); -- gc variation report (1 of 2) - first remove all gc_variation records for the release_date being processed @@ -211,7 +224,7 @@ BEGIN only_other_gc_submitters ) -- variant-centric output for single GC submitter - select + SELECT %T as report_date, gc.submitter_id, gc.variation_id, @@ -233,14 +246,13 @@ BEGIN IF((var.first_in_clinvar = gc.first_in_clinvar), "Yes", "No") as novel_at_first_gc_submission, IF((var.scv_count = 1), "Yes", "No") as novel_as_of_report_run_date, IF((var.scv_count > 1 AND var.scv_count = var.gc_scv_count), "Yes", "No") as only_other_gc_submitters - from gc - left join vcep - on + FROM gc + LEFT JOIN vcep + ON vcep.variation_id = gc.variation_id - left join var - on + LEFT JOIN var + ON var.variation_id = gc.variation_id - -- ORDER BY 1, CAST(gc.variation_id as INT) """, release_date); -- gc case report (1 of 2) - first remove all gc_case records for the release_date being processed @@ -253,16 +265,36 @@ BEGIN EXECUTE IMMEDIATE FORMAT(""" INSERT INTO `variation_tracker.gc_case` ( - report_date,submitter_id,variation_id,gene_id,gene_symbol,variant_name, - 
ep_name,ep_classification, ep_classif_type,ep_last_evaluated_date, - case_report_lab_name,case_report_lab_id,case_report_lab_classification, - case_report_lab_classif_type,case_report_lab_date_reported, - gc_scv_acxn,gc_scv_first_in_clinvar,gc_scv_local_key,case_report_sample_id, - lab_scv_classification,lab_scv_classif_type,lab_scv_last_evaluated, - lab_scv_first_in_clinvar,lab_scv_before_gc_scv,lab_scv_in_clinvar_as_of_release, - ep_diff_alert,lab_diff_alert,classification_comment + report_date, + submitter_id, + variation_id, + gene_id, + gene_symbol, + variant_name, + ep_name, + ep_classification, + ep_classif_type, + ep_last_evaluated_date, + case_report_lab_name, + case_report_lab_id, + case_report_lab_classification, + case_report_lab_classif_type, + case_report_lab_date_reported, + gc_scv_acxn, + gc_scv_first_in_clinvar, + gc_scv_local_key, + case_report_sample_id, + lab_scv_classification, + lab_scv_classif_type, + lab_scv_last_evaluated, + lab_scv_first_in_clinvar, + lab_scv_before_gc_scv, + lab_scv_in_clinvar_as_of_release, + ep_diff_alert, + lab_diff_alert, + classification_comment ) - select + SELECT %T as report_date, gc.submitter_id, gc.variation_id, @@ -292,62 +324,63 @@ BEGIN IF(lab_case.scv_count=1,lab_case.first_in_clinvar, null) as lab_scv_first_in_clinvar, -- show error if more than 1 scv exists on variant for case report submitter CASE lab_case.scv_count - WHEN 0 THEN - null - WHEN 1 THEN - IF(gc.first_in_clinvar <= lab_case.first_in_clinvar, "No", "Yes") - ELSE - "Error: multiple lab scvs." - END as lab_scv_before_gc_scv, + WHEN 0 THEN + null + WHEN 1 THEN + IF(gc.first_in_clinvar <= lab_case.first_in_clinvar, "No", "Yes") + ELSE + "Error: multiple lab scvs." + END as lab_scv_before_gc_scv, -- is lab_case.scv_count = 1 then the lab scv is submitted at time of clinvar release, error if more than one scv from lab in release CASE lab_case.scv_count - WHEN 0 THEN - null - WHEN 1 THEN - "Yes" - ELSE - "Error: multiple lab scvs." 
- END as lab_scv_in_clinvar_as_of_release, + WHEN 0 THEN + null + WHEN 1 THEN + "Yes" + ELSE + "Error: multiple lab scvs." + END as lab_scv_in_clinvar_as_of_release, -- alert for VCEP diff, show null if no vcep scv or if VCEP classification exactly matches GC CASE report classification CASE - WHEN vcep.classif_type IS NULL THEN - null - WHEN (IFNULL(gc_case.lab_classif_type,"n/a") <> vcep.classif_type) THEN - FORMAT("%%s vs %%s (%%s)", - UPPER(IFNULL(gc_case.lab_classif_type,"n/a")), - UPPER(vcep.classif_type), - IF(IFNULL(gc_case.lab_date_reported,vcep.last_evaluated) is NULL, "?",IF(gc_case.lab_date_reported > vcep.last_evaluated, "<",">")) - ) - ELSE - null - END as ep_diff_alert, + WHEN vcep.classif_type IS NULL THEN + null + WHEN (IFNULL(gc_case.lab_classif_type,"n/a") <> vcep.classif_type) THEN + FORMAT("%%s vs %%s (%%s)", + UPPER(IFNULL(gc_case.lab_classif_type,"n/a")), + UPPER(vcep.classif_type), + IF(IFNULL(gc_case.lab_date_reported,vcep.last_evaluated) is NULL, "?",IF(gc_case.lab_date_reported > vcep.last_evaluated, "<",">")) + ) + ELSE + null + END as ep_diff_alert, -- alert for LAB diff, show null if no vcep scv or if LAB classification exactly matches GC CASE report classification -- show error if more than 1 scv exists on variant for case report submitter CASE - WHEN lab_case.scv_count=1 AND (IFNULL(gc_case.lab_classif_type,"n/a") <> lab_case.classif_type) THEN - FORMAT("%%s vs %%s (%%s)", - UPPER(IFNULL(gc_case.lab_classif_type,"n/a")), - UPPER(lab_case.classif_type), - IF(IFNULL(gc_case.lab_date_reported,vcep.last_evaluated) is NULL, "?",IF(gc_case.lab_date_reported > lab_case.last_evaluated, "<",">")) - ) - WHEN lab_case.scv_count > 1 THEN - -- error - "Error: multiple lab scvs." 
- ELSE - -- lab_case count = 0 OR gc_case and lab_case classifications match so do nothing - null - END as lab_diff_alert, + WHEN lab_case.scv_count=1 AND (IFNULL(gc_case.lab_classif_type,"n/a") <> lab_case.classif_type) THEN + FORMAT("%%s vs %%s (%%s)", + UPPER(IFNULL(gc_case.lab_classif_type,"n/a")), + UPPER(lab_case.classif_type), + IF(IFNULL(gc_case.lab_date_reported,lab_case.last_evaluated) is NULL, "?",IF(gc_case.lab_date_reported > lab_case.last_evaluated, "<",">")) + ) + WHEN lab_case.scv_count > 1 THEN + -- error + "Error: multiple lab scvs." + ELSE + -- lab_case count = 0 OR gc_case and lab_case classifications match so do nothing + null + END as lab_diff_alert, gc.classification_comment - from gc - left join vcep - on + FROM gc + LEFT JOIN vcep + ON vcep.variation_id = gc.variation_id - left join gc_case - on + LEFT JOIN gc_case + ON gc.scv_acxn = gc_case.scv_acxn - left join lab_case - on - lab_case.variation_id = gc_case.variation_id and + LEFT JOIN lab_case + ON + lab_case.variation_id = gc_case.variation_id + AND lab_case.case_report_key = gc_case.case_report_key """, release_date); diff --git a/scripts/tracker-report-update/initialize-tracker-tables.sql b/scripts/tracker-report-update/initialize-tracker-tables.sql index d4ca7b4..402d6cc 100644 --- a/scripts/tracker-report-update/initialize-tracker-tables.sql +++ b/scripts/tracker-report-update/initialize-tracker-tables.sql @@ -58,7 +58,8 @@ CREATE OR REPLACE TABLE `variation_tracker.gc_case` ) ; -CREATE OR REPLACE TABLE `variation_tracker.alert_type` AS +CREATE OR REPLACE TABLE `variation_tracker.alert_type` +AS SELECT * FROM UNNEST([ STRUCT(NULL AS sort_order, '' AS label), (0, 'Out of Date'), diff --git a/scripts/tracker-report-update/tracker-report-update-proc.sql b/scripts/tracker-report-update/tracker-report-update-proc.sql index 6130f4a..aac8e77 100644 --- a/scripts/tracker-report-update/tracker-report-update-proc.sql +++ b/scripts/tracker-report-update/tracker-report-update-proc.sql @@ -1,18 +1,6
@@ -CREATE OR REPLACE PROCEDURE `clinvar_ingest.tracker_report_update`( - on_date DATE -) +CREATE OR REPLACE PROCEDURE `clinvar_ingest.tracker_report_update`() BEGIN - FOR rec IN ( - select - s.schema_name, - s.release_date, - s.prev_release_date, - s.next_release_date - FROM clinvar_ingest.schema_on(on_date) as s - ) - DO - CALL `variation_tracker.report_variation_proc`(); - CALL `variation_tracker.tracker_reports_rebuild`(); - CALL `variation_tracker.gc_tracker_report_rebuild`(rec.scheman_name, rec.release_date); - END FOR; + CALL `variation_tracker.report_variation_proc`(); + CALL `variation_tracker.tracker_reports_rebuild`(); + CALL `variation_tracker.gc_tracker_report_rebuild`(); END; \ No newline at end of file diff --git a/scripts/tracker-report-update/tracker-report-update-v2-proc.sql b/scripts/tracker-report-update/tracker-report-update-v2-proc.sql new file mode 100644 index 0000000..9187fc7 --- /dev/null +++ b/scripts/tracker-report-update/tracker-report-update-v2-proc.sql @@ -0,0 +1,6 @@ +CREATE OR REPLACE PROCEDURE `clinvar_ingest.tracker_report_update_v2`() +BEGIN + CALL `variation_tracker.report_variation_proc`(); + CALL `variation_tracker.tracker_reports_rebuild_v2`(); + CALL `variation_tracker.gc_tracker_report_rebuild`(); +END; \ No newline at end of file