From f9ad2a9e1e5c1c3bd4da8b97a8b4e576b32bfcdf Mon Sep 17 00:00:00 2001 From: Lawrence Babb Date: Mon, 9 Dec 2024 09:10:16 -0500 Subject: [PATCH] refactor clinvar-ingest dataset prep procs with versioning --- .../00-setup-translation-tables.sql | 174 +++++++++ .../01-scv-summary-proc.sql | 162 -------- .../01-validate-ds-terms-v1-proc.sql | 51 +++ .../02-normalize-ds-v2-proc.sql | 192 +++++++++ .../02-single-gene-variation-proc.sql | 348 ----------------- .../03-scv-summary-v1-proc.sql | 158 ++++++++ .../03-scv-summary-v2-proc.sql | 134 +++++++ .../04-single-gene-variation-v1-proc.sql | 363 ++++++++++++++++++ ...-gc-scv-proc.sql => 05-gc-scv-v1-proc.sql} | 16 +- .../dataset-preparation-v1-proc.sql | 20 + .../dataset-preparation-v2-proc.sql | 21 + .../general/schemas-on-or-after-v2-func.sql | 2 +- .../general/tables-columns-exists-proc.sql | 30 ++ 13 files changed, 1152 insertions(+), 519 deletions(-) create mode 100644 scripts/ds-staging-table-procs/00-setup-translation-tables.sql delete mode 100644 scripts/ds-staging-table-procs/01-scv-summary-proc.sql create mode 100644 scripts/ds-staging-table-procs/01-validate-ds-terms-v1-proc.sql create mode 100644 scripts/ds-staging-table-procs/02-normalize-ds-v2-proc.sql delete mode 100644 scripts/ds-staging-table-procs/02-single-gene-variation-proc.sql create mode 100644 scripts/ds-staging-table-procs/03-scv-summary-v1-proc.sql create mode 100644 scripts/ds-staging-table-procs/03-scv-summary-v2-proc.sql create mode 100644 scripts/ds-staging-table-procs/04-single-gene-variation-v1-proc.sql rename scripts/ds-staging-table-procs/{03-gc-scv-proc.sql => 05-gc-scv-v1-proc.sql} (85%) create mode 100644 scripts/ds-staging-table-procs/dataset-preparation-v1-proc.sql create mode 100644 scripts/ds-staging-table-procs/dataset-preparation-v2-proc.sql create mode 100644 scripts/general/tables-columns-exists-proc.sql diff --git a/scripts/ds-staging-table-procs/00-setup-translation-tables.sql b/scripts/ds-staging-table-procs/00-setup-translation-tables.sql new file mode 100644 index 0000000..4405c64 --- /dev/null +++ b/scripts/ds-staging-table-procs/00-setup-translation-tables.sql @@ -0,0 +1,174 @@ +-- CREATE OR REPLACE TABLE `clinvar_ingest.cvc_context_types` (code STRING, label STRING, display_order INT64); + +-- INSERT INTO `clinvar_ingest.cvc_context_types` (code, label, display_order) +-- VALUES +-- ('gd','Germline Disease', 10), +-- ('sc','Somatic Cancer', 20), +-- ('pg','Pharmacogenomic', 30), +-- ('ot','Other', 40); + +CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_clinsig_types` ( + code STRING, label STRING, significance INT64, + clinvar_prop_type STRING, clinvar_code_order INT64, + clinvar_desc_order INT64, cvc_prop_type STRING, + cvc_code_order INT64, cvc_desc_order INT64, + direction STRING, strength_code STRING, strength_label STRING, + classification_code STRING, penetrance_level STRING +); +INSERT INTO `clinvar_ingest.clinvar_clinsig_types` ( + code, + label, + significance, + clinvar_prop_type, + clinvar_code_order, + clinvar_desc_order, + cvc_prop_type, + cvc_code_order, + cvc_desc_order, + direction, + strength_code, + strength_label, + classification_code, + penetrance_level +) +VALUES + -- Pathogenic statements + ('b', 'Benign', 0, '?', 30, 30, '?', 30, 30, 'refutes', 'cg000101', 'definitive', 'cg000001', null), + ('lb', 'Likely benign', 0, '?', 31, 31, '?', 31, 31, 'refutes', 'cg000102', 'likely', 'cg000002', null), + ('b/lb', 'Benign/Likely benign', 0, '?', 32, 32, '?', 32, 32, 'refutes', 'cg000102', 'likely', 'cg000003', null), + ('vus', 'Uncertain significance', 1, '?', 20, 20, '?', 20, 20, 'none', 'cg000103', 'inconclusive', 'cg000004', null), + ('ura', 'Uncertain risk allele', 1, '?', 21, 21, '?', 21, 21, 'none', 'cg000103', 'inconclusive', 'cg000005', 'risk allele'), + ('p', 'Pathogenic', 2, '?', 10, 10, '?', 10, 10, 'supports', 'cg000101', 'definitive', 'cg000006', null), + ('lp', 'Likely pathogenic', 2, '?', 11, 11, '?', 11, 11, 'supports', 'cg000102', 'likely', 'cg000007', null), + ('p/lp', 'Pathogenic/Likely pathogenic', 2, '?', 12, 12, '?', 12, 12, 'supports', 'cg000102', 'likely', 'cg000008', null), + ('p-lp', 'Pathogenic, low penetrance', 2, '?', 13, 13, '?', 13, 13, 'supports', 'cg000101', 'definitive', 'cg000009', 'low'), + ('lp-lp', 'Likely pathogenic, low penetrance', 2, '?', 14, 14, '?', 14, 14, 'supports', 'cg000102', 'likely', 'cg000010', 'low'), + ('era', 'Established risk allele', 2, '?', 15, 15, '?', 15, 15, 'supports', 'cg000101', 'definitive', 'cg000011', 'risk allele'), + ('lra', 'Likely risk allele', 2, '?', 16, 16, '?', 16, 16, 'supports', 'cg000102', 'likely', 'cg000012', 'risk allele'), + ('cdfs', 'conflicting data from submitters', 1, '?', 40, 40, '?', 40, 40, 'none', 'cg000103', 'inconclusive', 'cg000013', null), + -- ClinVarDrugResponse statements + ('dr', 'drug response', 2, 'dr', 130, 130, 'dr', 130, 130, 'supports', 'cg000100', 'not specified', 'cg000014', null), + -- ClinVarNonAssertion statements + ('np', 'not provided', 0, '?', 140, 140, 'oth', 140, 140, 'none', 'cg000100', 'not specified', 'cg000015', null), + -- ClinVarOther statements + ('rf', 'risk factor', 2, 'rf', 170, 170, 'oth', 170, 170, 'none', 'cg000100', 'not specified', 'cg000016', null), + ('aff', 'Affects', 2, 'aff', 100, 100, 'oth', 100, 100, 'none', 'cg000100', 'not specified', 'cg000017', null), + ('assoc', 'association', 2, 'assoc', 110, 110, 'oth', 110, 110, 'none', 'cg000100', 'not specified', 'cg000018', null), + ('assocnf', 'association not found', 0, 'assoc', 111, 111, 'oth', 111, 111, 'none', 'cg000100', 'not specified', 'cg000019', null), + ('cs', 'confers sensitivity', 2, 'cs', 120, 120, 'oth', 120, 120, 'none', 'cg000100', 'not specified', 'cg000020', null), + ('oth', 'other', 0, 'oth', 150, 150, 'oth', 150, 150, 'none', 'cg000100', 'not specified', 'cg000021', null), + ('protect', 'protective', 0, 'protect', 160, 160, 'oth', 160, 160, 'none', 'cg000100', 'not specified', 'cg000022', null), + -- SomaticImpact + ('t1', 'Tier I - Strong', 2, 'somatic', 10, 10, 'somatic', 10, 10, 'supports', 'cg000100', 'definitive', 'cg000023', null), + ('t2', 'Tier II - Potential', 2, 'somatic', 11, 11, 'somatic', 11, 11, 'supports', 'cg000102', 'likely', 'cg000024', null), + ('t3', 'Tier III - Unknown', 1, 'somatic', 20, 20, 'somatic', 20, 20, 'none', 'cg000103', 'inconclusive', 'cg000025', null), + ('t4', 'Tier IV - Benign/Likely benign', 0, 'somatic', 32, 32, 'somatic', 32, 32, 'refutes', 'cg000102', 'likely', 'cg000026', null), + -- Oncogenic + ('o', 'Oncogenic', 2, '?', 10, 10, 'onco', 10, 10, 'supports', 'cg000101', 'definitive', 'cg000027', null), + ('lo', 'Likely oncogenic', 2, '?', 11, 11, 'onco', 11, 11, 'supports', 'cg000102', 'likely', 'cg000028', null); + + +CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_proposition_types` (code STRING, label STRING, display_order INT64); +INSERT INTO `clinvar_ingest.clinvar_proposition_types` (code, label, display_order) +VALUES + ('path', 'Pathogenicity', 10), + ('dr', 'DrugResponse', 11), + ('oth', 'Other', 12), + ('somatic', 'SomaticClinicalImpact', 20), + ('onco', 'Oncogenicity', 30); + + -- ('rf', 'RiskFactor',30), + -- ('np', 'NotProvided',60), + -- put affects, associated, confresSensitivity and protect in other category + -- ('aff', 'Affects',20), + -- ('assoc','Associated',30), + -- ('cs', 'ConfersSensitivity',40), + -- ('protect','Protective',80), + + +CREATE OR REPLACE TABLE `clinvar_ingest.scv_clinsig_map` (scv_term STRING, cv_clinsig_type STRING); +INSERT INTO `clinvar_ingest.scv_clinsig_map` (scv_term, cv_clinsig_type) VALUES + ('affects', 'aff'), + ('associated with leiomyomas', 'np'), + ('association', 'assoc'), + ('association not found', 'assocnf'), + ('benign', 'b'), + ('benign/likely benign', 'b/lb'), + ('cancer', 'oth'), + ('confers sensitivity', 'cs'), + ('conflicting data from submitters', 'cdfs'), + ('drug response', 'dr'), + ('drug-response', 'dr'), + ('established risk allele', 'era'), + ('likely benign', 'lb'), + ('likely oncogenic', 'lo'), + ('likely pathogenic', 'lp'), + ('likely pathogenic - adrenal bilateral pheochromocy', 'lp'), + ('likely pathogenic - adrenal pheochromocytoma', 'lp'), + ('likely pathogenic, low penetrance', 'lp-lp'), + ('likely risk allele', 'lra'), + ('moderate', 'p'), + ('mut', 'p'), + ('mutation', 'p'), + ('no known pathogenicity', 'b'), + ('non-pathogenic', 'b'), + ('not provided', 'np'), + ('oncogenic', 'o'), + ('other', 'oth'), + ('pathogenic', 'p'), + ('pathogenic, low penetrance', 'p-lp'), + ('pathogenic variant for bardet-biedl syndrome', 'p'), + ('pathogenic/likely pathogenic', 'p/lp'), + ('pathologic', 'p'), + ('poly', 'b'), + ('probable-non-pathogenic', 'lb'), + ('probable-pathogenic', 'lp'), + ('probably not pathogenic', 'lb'), + ('probably pathogenic', 'lp'), + ('protective', 'protect'), + ('risk factor', 'rf'), + ('suspected benign', 'lb'), + ('suspected pathogenic', 'lp'), + ('tier i - strong', 't1'), + ('tier ii - potential', 't2'), + ('tier iii - unknown', 't3'), + ('tier iv - benign/likely benign', 't4'), + ('uncertain', 'vus'), + ('uncertain risk allele', 'ura'), + ('uncertain significance', 'vus'), + ('unknown', 'vus'), + ('unknown significance', 'vus'), + ('untested', 'np'), + ('variant of unknown significance', 'vus'), + ('vsb', 'lb'), + ('vlb', 'lb'), + ('vous', 'vus'), + ('vus', 'vus'), + ('vlm', 'lp'), + ('uncertain significance: likely benign', 'vus'), + ('uncertain significance: likely pathogenic','vus'), + ('na','oth'); + +CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_status` (rank INT64, label STRING, scv BOOL); +INSERT INTO `clinvar_ingest.clinvar_status` (rank, label, scv) +VALUES + (-3, 'no classifications from unflagged records', FALSE), + (-3, 'flagged submission', TRUE), + (-2, 'no interpretation for the single variant', FALSE), + (-2, 'no classification for the single variant', FALSE), + (-1, 'no assertion provided', TRUE), + (-1, 'no classification provided', TRUE), + (-1, 'not classified by submitter', FALSE), + (0, 'no assertion criteria provided', TRUE), + (1, 'criteria provided, single submitter', TRUE), + (1, 'classified by single submitter', FALSE), + (1, 'criteria provided, conflicting interpretations', FALSE), + (1, 'criteria provided, conflicting classifications', FALSE), + (2, 'criteria provided, multiple submitters, no conflicts', FALSE), + (2, 'criteria provided, multiple submitters', FALSE), + (2, 'classified by multiple submitters', FALSE), + (3, 'reviewed by expert panel', TRUE), + (4, 'practice guideline', TRUE), + (4, 'reviewed by professional society', FALSE) ; + + \ No newline at end of file diff --git a/scripts/ds-staging-table-procs/01-scv-summary-proc.sql b/scripts/ds-staging-table-procs/01-scv-summary-proc.sql deleted file mode 100644 index 7daefe0..0000000 --- a/scripts/ds-staging-table-procs/01-scv-summary-proc.sql +++ /dev/null @@ -1,162 +0,0 @@ -CREATE OR REPLACE PROCEDURE `clinvar_ingest.scv_summary_proc`(start_with DATE) -BEGIN - - FOR rec IN (select s.schema_name, s.release_date, s.prev_release_date, s.next_release_date FROM clinvar_ingest.schemas_on_or_after(start_with) as s) - DO - EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `%s.scv_summary` AS - WITH obs_sample AS ( - SELECT - REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, - `clinvar_ingest.parseSample`(obs.content) as s - FROM - `%s.clinical_assertion_observation` obs - ), - obs_method AS ( - SELECT - REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, - m.description as method_desc, - m.method_type - FROM - `%s.clinical_assertion_observation` obs, - UNNEST(`clinvar_ingest.parseMethods`(obs.content)) as m - - ), - assertion_method AS ( - SELECT - ca.id, - a.attribute.type, - a.attribute.value, - STRING_AGG(DISTINCT c.url,';') as url - FROM `%s.clinical_assertion` ca, - UNNEST(`clinvar_ingest.parseAttributeSet`(ca.content)) as a, - UNNEST(a.citation) as c - WHERE - a.attribute.type = "AssertionMethod" - and - c.url is not null - GROUP BY - ca.id, - a.attribute.type, - a.attribute.value - ), - obs AS ( - SELECT - ca.id, - STRING_AGG(DISTINCT os.s.origin, ", " ORDER BY os.s.origin) as origin, - STRING_AGG(DISTINCT os.s.affected_status, ", " ORDER BY os.s.affected_status) as affected_status, - STRING_AGG(DISTINCT om.method_desc, ", " ORDER BY om.method_desc) as method_desc, - STRING_AGG(DISTINCT om.method_type, ", " ORDER BY om.method_type) as method_type - FROM - `%s.clinical_assertion` ca - LEFT JOIN obs_sample os ON os.id = ca.id - LEFT JOIN obs_method om ON om.id = ca.id - GROUP BY - ca.id - ), - scv_classification_comment AS ( - SELECT - id, - STRING_AGG(JSON_EXTRACT_SCALAR(c, r'$.text'), '\\n') as text - FROM - `%s.clinical_assertion` ca - LEFT JOIN UNNEST(ca.interpretation_comments) as c - WHERE ARRAY_LENGTH(ca.interpretation_comments) > 0 - GROUP BY - id - ), - scv AS ( - SELECT - ca.release_date as release_date, - ca.id, - ca.version, - ca.variation_id, - ca.local_key, - ca.interpretation_date_last_evaluated as last_evaluated, - cvs.rank, - ca.review_status, - cst.clinvar_prop_type as clinvar_stmt_type, - cst.cvc_prop_type as cvc_stmt_type, - ca.interpretation_description as submitted_classification, - IFNULL(map.cv_clinsig_type, '-') as classif_type, - cst.significance, - ca.submitter_id, - ca.submission_id, - ca.clinical_assertion_observation_ids, - `clinvar_ingest.parseComments`(ca.content) as comments, - scc.text as classification_comment, - ca.date_created, - ca.date_last_updated - FROM - `%s.clinical_assertion` ca - LEFT JOIN scv_classification_comment scc - ON - scc.id = ca.id - LEFT JOIN `clinvar_ingest.scv_clinsig_map` map - ON - map.scv_term = lower(IF(ca.id IS NULL, NULL, IFNULL(ca.interpretation_description,'not provided'))) - LEFT JOIN `clinvar_ingest.clinvar_clinsig_types` cst - ON - cst.code = map.cv_clinsig_type - LEFT JOIN `clinvar_ingest.clinvar_status` cvs - ON - cvs.label = ca.review_status - ) - SELECT - scv.release_date, - scv.id, - scv.version, - scv.variation_id, - scv.local_key, - scv.last_evaluated, - scv.rank, - scv.review_status, - scv.clinvar_stmt_type, - scv.cvc_stmt_type, - scv.submitted_classification, - scv.classif_type, - scv.significance, - scv.comments, - scv.classification_comment, - scv.date_created, - scv.date_last_updated, - scv.clinical_assertion_observation_ids, - scv.submitter_id, - subm.submission_date, - obs.origin, - obs.affected_status, - obs.method_desc, - obs.method_type, - am.value as assertion_method, - am.url as assertion_method_url - FROM scv - LEFT JOIN obs - ON - obs.id = scv.id - LEFT JOIN assertion_method am - ON - am.id = scv.id - LEFT JOIN `%s.submission` subm - ON - subm.id = scv.submission_id - """, rec.schema_name, rec.schema_name, rec.schema_name, rec.schema_name, rec.schema_name, rec.schema_name, rec.schema_name, rec.schema_name); - - END FOR; - -END; - - --- repair any null review_status values --- set any NULL review_status either 'no assertion provided' (if interp_desc is null or not provided) or 'no assertion criteria provided' (otherwise) --- --- UPDATE `%s.clinical_assertion` ca --- SET --- ca.review_status = --- IF( --- ca.interpretation_description is null OR --- ca.interpretation_description = 'not provided', --- 'no assertion provided', --- 'no assertion criteria provided' --- ) --- WHERE ca.review_status IS NULL --- ; \ No newline at end of file diff --git a/scripts/ds-staging-table-procs/01-validate-ds-terms-v1-proc.sql b/scripts/ds-staging-table-procs/01-validate-ds-terms-v1-proc.sql new file mode 100644 index 0000000..36d3c72 --- /dev/null +++ b/scripts/ds-staging-table-procs/01-validate-ds-terms-v1-proc.sql @@ -0,0 +1,51 @@ +CREATE OR REPLACE PROCEDURE `clinvar_ingest.validate_dataset_terms_v1`( + schema_name STRING +) +BEGIN + -- Declare variables to hold results and error messages + DECLARE scv_classification_terms ARRAY; + DECLARE scv_review_status_terms ARRAY; + DECLARE combined_issues STRING; + + -- Check for new interpretation_descriptions in clinical_assertion + EXECUTE IMMEDIATE FORMAT(""" + SELECT ARRAY_AGG(DISTINCT ca.interpretation_description) + FROM `%s.clinical_assertion` ca + LEFT JOIN `clinvar_ingest.scv_clinsig_map` map + ON map.scv_term = LOWER(IFNULL(ca.interpretation_description,'not provided')) + WHERE map.scv_term IS NULL + """, schema_name) INTO scv_classification_terms; + + -- Check for new review_status terms in clinical_assertion + EXECUTE IMMEDIATE FORMAT(""" + SELECT ARRAY_AGG(DISTINCT ca.review_status) + FROM `%s.clinical_assertion` ca + LEFT JOIN `clinvar_ingest.clinvar_status` cs + ON cs.label = LOWER(ca.review_status) + WHERE cs.label IS NULL + """, schema_name) INTO scv_review_status_terms; + + -- Construct a combined error message if there are issues + SET combined_issues = ''; + + IF scv_classification_terms IS NOT NULL AND ARRAY_LENGTH(scv_classification_terms) > 0 THEN + SET combined_issues = FORMAT(""" + New SCV classification terms found: [%s]. + NOTE: Add scv_clinsig_map records to the '00-setup-translation-tables.sql' script and update, then rerun this script. + """, ARRAY_TO_STRING(scv_classification_terms, ', ')); + END IF; + + IF scv_review_status_terms IS NOT NULL AND ARRAY_LENGTH(scv_review_status_terms) > 0 THEN + SET combined_issues = FORMAT(""" + %s + New SCV review status terms found: [%s]. + NOTE: Add clinvar_status records to the '00-setup-translation-tables.sql' script and update, then rerun this script. + """, combined_issues, ARRAY_TO_STRING(scv_review_status_terms, ', ')); + END IF; + + -- Raise a single exception if there are any issues + IF combined_issues != '' THEN + RAISE USING message = combined_issues; + END IF; + +END; diff --git a/scripts/ds-staging-table-procs/02-normalize-ds-v2-proc.sql b/scripts/ds-staging-table-procs/02-normalize-ds-v2-proc.sql new file mode 100644 index 0000000..6da57dd --- /dev/null +++ b/scripts/ds-staging-table-procs/02-normalize-ds-v2-proc.sql @@ -0,0 +1,192 @@ + +CREATE OR REPLACE PROCEDURE `clinvar_ingest.normalize_dataset_v2`( + schema_name STRING -- Name of schema/dataset +) +BEGIN + DECLARE column_exists BOOL; + DECLARE table_exists BOOL; + + -- TABLE 1. Clinical Assertion + -- check for clinical_assertion.statement_type column as THE indicator that the dataset has been normalized to v2 + CALL `clinvar_ingest.check_column_exists`(schema_name, 'clinical_assertion', 'statement_type', column_exists); + + -- if the column does not exist, add it with the default value + IF NOT column_exists THEN + -- backup the original clinical_assertion table + EXECUTE IMMEDIATE FORMAT(""" + CREATE TABLE `%s.backup_clinical_assertion` AS + SELECT * FROM `%s.clinical_assertion` + """, schema_name, schema_name); + + + -- create or replace the clinical_assertion table from the backup + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TABLE `%s.clinical_assertion` AS + SELECT + *, + 'GermlineClassification' as statement_type, + CAST(NULL as STRING) as clinical_impact_assertion_type, + CAST(NULL as STRING) as clinical_impact_clinical_significance + FROM `%s.backup_clinical_assertion` + """, schema_name, schema_name); + END IF; + + -- TABLE 2. RCV Accession & RCV Accession Classification (with corrections for v2 rcv_accession_classification.agg_classification column) + -- check that the rcv_accession_classification table exists as THE indicator that the dataset has been normalized to v2 + CALL `clinvar_ingest.check_table_exists`(schema_name, 'rcv_accession_classification', table_exists); + + -- if the table does not exist, create it + IF NOT table_exists THEN + -- backup the original rcv_accession table + EXECUTE IMMEDIATE FORMAT(""" + CREATE TABLE `%s.backup_rcv_accession` AS + SELECT * FROM `%s.rcv_accession` + """, schema_name, schema_name); + + -- create the rcv_accession_classification table from the backup + EXECUTE IMMEDIATE FORMAT(""" + CREATE TABLE %s.rcv_accession_classification AS + SELECT + release_date, + id as rcv_id, + 'GermlineClassification' AS statement_type, + review_status, + [ + STRUCT( + submission_count as num_submissions, + date_last_evaluated, + interpretation as interp_description, + CAST(NULL as STRING) as clinical_impact_assertion_type, + CAST(NULL as STRING) as clinical_impact_clinical_significance + ) + ] as agg_classification + FROM `%s.backup_rcv_accession` + """, schema_name, schema_name); + + -- create or replace the rcv_accession_classification table from the backup + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TABLE `%s.rcv_accession` AS + SELECT + release_date, + id, + variation_id, + independent_observations, + variation_archive_id, + version, + title, + trait_set_id, + content + FROM `%s.backup_rcv_accession` + """, schema_name, schema_name); + + ELSE + -- if the table exists, check if the agg_classification column exists + CALL `clinvar_ingest.check_column_exists`(schema_name, 'rcv_accession_classification', 'agg_classification', column_exists); + + -- if the column does not exist, convert the v2 table to the final format + IF NOT column_exists THEN + -- backup the original rcv_accession_classification table + EXECUTE IMMEDIATE FORMAT(""" + CREATE TABLE `%s.backup_rcv_accession_classification` AS + SELECT * FROM `%s.rcv_accession_classification` + """, schema_name, schema_name); + + -- create or replace the rcv_accession_classification table from the backup + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TABLE `%s.rcv_accession_classification` AS + SELECT + release_date, + rcv_id, + statement_type, + review_status, + `clinvar_ingest.parseAggDescription`(content).description as agg_classification, + IF( + REGEXP_CONTAINS(content, r'^{\\s*"Description"\\s*\\:\\s*"[^"]+"\\s*}'), + NULL, + REGEXP_REPLACE(content, r'"Description"\\s*\\:\\s*"[^"]+"\\s*,*\\s*', "") + ) as content + FROM `%s.rcv_accession_classification` + WHERE content is not null + UNION ALL + SELECT + release_date, + rcv_id, + statement_type, + review_status, + [ + STRUCT( + clinical_impact_assertion_type, + clinical_impact_clinical_significance, + date_last_evaluated, + num_submissions, + interp_description + ) + ] as agg_classification, + content + FROM `%s.rcv_accession_classification` + WHERE content is null + """, schema_name, schema_name, schema_name); + END IF; + END IF; + + -- TABLE 3. Variation Archive & Variation Archive Classification + -- check that the variation_archive_classification table exists as THE indicator that the dataset has been normalized to v2 + CALL `clinvar_ingest.check_table_exists`(schema_name, 'variation_archive_classification', table_exists); + + -- if the table does not exist, create it + IF NOT table_exists THEN + -- backup the original variation_archive table + EXECUTE IMMEDIATE FORMAT(""" + CREATE TABLE `%s.backup_variation_archive` AS + SELECT * FROM `%s.variation_archive` + """, schema_name, schema_name); + + -- create the variation_archive_classification table from the backup + EXECUTE IMMEDIATE FORMAT(""" + CREATE TABLE %s.variation_archive_classification AS + SELECT + id as vcv_id, + 'GermlineClassification' AS statement_type, + review_status, + num_submitters, + num_submissions, + date_created, + interp_date_last_evaluated, + interp_description, + interp_explanation, + CAST(JSON_VALUE(REPLACE(content, '@MostRecentSubmission', 'MostRecentSubmission'), '$.MostRecentSubmission') AS DATE) AS most_recent_submission, + interp_content as content, + CAST(NULL as STRING) as clinical_impact_assertion_type, + CAST(NULL as STRING) as clinical_impact_clinical_significance + FROM `%s.backup_variation_archive` + """, schema_name, schema_name); + + -- create or replace the variation_archive table from the backup + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TABLE `%s.variation_archive` AS + SELECT + date_created, + record_status, + variation_id, + release_date, + IF( + REGEXP_CONTAINS(content, r'^{\\s*"@MostRecentSubmission"\\s*\\:\\s*"[^"]+"\\s*}'), + NULL, + REGEXP_REPLACE(content, r'"@MostRecentSubmission"\\s*\\:\\s*"[^"]+"\\s*,*\\s*', "") + ) as content, + species, + id, + version, + num_submitters, + date_last_updated, + num_submissions + FROM `%s.backup_variation_archive` + """, schema_name, schema_name); + END IF; + +END; + +-- tested on older set and it added the col and updated the values to the default. +-- CALL `clinvar_ingest.normalize_dataset`('clinvar_2023_01_07_v1_6_57') +-- tested on newer set and no col was added or updated +-- CALL `clinvar_ingest.normalize_dataset`('clinvar_2024_11_26_v2_0_1_alpha') \ No newline at end of file diff --git a/scripts/ds-staging-table-procs/02-single-gene-variation-proc.sql b/scripts/ds-staging-table-procs/02-single-gene-variation-proc.sql deleted file mode 100644 index 68b82a4..0000000 --- a/scripts/ds-staging-table-procs/02-single-gene-variation-proc.sql +++ /dev/null @@ -1,348 +0,0 @@ --- see below for helper / setup scripts -CREATE OR REPLACE PROCEDURE `clinvar_ingest.single_gene_variation_proc`(start_with DATE) -BEGIN - - FOR rec IN (select s.schema_name, s.release_date, s.prev_release_date, s.next_release_date FROM clinvar_ingest.schemas_on_or_after(start_with) as s) - DO - - -- single gene variation plan - --- step 1. create a table with columns variation_id, gene_id, somatic_flag - --- where the variation_id is the pk. - EXECUTE IMMEDIATE FORMAT(""" - CREATE OR REPLACE TABLE `%s.single_gene_variation` - ( release_date DATE NOT NULL, - variation_id STRING NOT NULL, - gene_id STRING NOT NULL, - relationship_type STRING, - source STRING, - mane_select BOOL DEFAULT FALSE, - somatic BOOL DEFAULT FALSE) - """, rec.schema_name); - - -- create a temp table that is the list of remaining variations so as to reduce the query cost of analyzing against the variation table. - EXECUTE IMMEDIATE FORMAT(""" - CREATE TEMP TABLE _SESSION.temp_variation - AS - SELECT - v.id, - v.name, - v.descendant_ids, - v.subclass_type - FROM `%s.variation` v - """, rec.schema_name); - - - --- step 2. (Resolvable Gene Symbol in variation name takes precedence as "single gene for variant") - --- initialize the set with the extracted variation name gene symbols and the associated relationship info if available - - -- prioritize mane select transcripts in title of variant - EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source, mane_select ) - SELECT DISTINCT - %T as release_date, - v.id as variation_id, - mane.gene_id, - IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, - IFNULL(ga.source,'cvc calculated') as source, - TRUE as mane_select - FROM `clinvar_ingest.mane_select_gene_transcript` mane - JOIN _SESSION.temp_variation v on STARTS_WITH(v.name, mane.transcript_id) - JOIN `clinvar_ingest.entrez_gene` g on g.gene_id = mane.gene_id - LEFT JOIN `%s.gene_association` ga on ga.gene_id = g.gene_id and ga.variation_id = v.id - """, rec.schema_name, rec.release_date, rec.schema_name); - - -- create a temp table that is the list of gene associations - -- for variations not yet in the single gene var table and - -- don't have the gene symbol ending in an -AS# suffix - EXECUTE IMMEDIATE FORMAT(""" - CREATE TEMP TABLE _SESSION.temp_gene_assoc - AS - SELECT DISTINCT - ga.variation_id, - ga.gene_id, - ga.relationship_type, - ga.source - FROM `%s.gene_association` ga - JOIN `clinvar_ingest.entrez_gene` g on ga.gene_id = g.gene_id - WHERE - NOT REGEXP_CONTAINS(g.symbol_from_authority, r'\\-AS\\d$') AND - NOT EXISTS ( - SELECT sgv.variation_id - FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = ga.variation_id - ) - """, rec.schema_name, rec.schema_name); - - -- NOTE: clinvar has a handful of duplicate gene records that can change over time - -- the plan will be to test the results of loading the variation_single_gene - -- table to verify that any variants with multiple genes are simply duplicates and - -- either one can be removed without an issue. - - -- clinvar perferred label hgvs-style format NM_0000.0(GENE):c.234... (not mane select but still in name) - EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source) - WITH x AS - ( - SELECT - v.id, - v.name, - REGEXP_EXTRACT(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') as symbol - FROM _SESSION.temp_variation v - WHERE REGEXP_CONTAINS(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') AND - NOT EXISTS ( - SELECT sgv.variation_id - FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id - ) - ) - SELECT - %T as release_date, - x.id as variation_id, - g.gene_id, - IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, - IFNULL(ga.source,'cvc calculated') as source - FROM x - JOIN `clinvar_ingest.entrez_gene` g on UPPER(g.symbol_from_authority) = UPPER(x.symbol) AND NOT REGEXP_CONTAINS(x.symbol, r'\\-AS\\d$') - LEFT JOIN _SESSION.temp_gene_assoc ga on ga.variation_id = x.id and ga.gene_id = g.gene_id - """, rec.schema_name, rec.schema_name, rec.release_date); - - - -- star allele format, CYP2C19*10. - EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source) - WITH x AS - ( - SELECT - v.id, - v.name, - REGEXP_EXTRACT(v.name, r'^([A-Za-z0-9\\-]+)[\\*\\,]') as symbol - FROM _SESSION.temp_variation v - WHERE REGEXP_CONTAINS(v.name, r'^([A-Za-z0-9\\-]+)[\\*\\,]') AND - NOT EXISTS ( - SELECT sgv.variation_id - FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id - ) - ) - SELECT - %T as release_date, - x.id as variation_id, - g.gene_id, - IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, - IFNULL(ga.source,'cvc calculated') as source - FROM x - JOIN `clinvar_ingest.entrez_gene` g ON UPPER(x.symbol) = UPPER(g.symbol_from_authority) - LEFT JOIN _SESSION.temp_gene_assoc ga ON x.id = ga.variation_id AND g.gene_id = ga.gene_id - """, rec.schema_name, rec.schema_name, rec.release_date); - - --- step 3. for any variations remaining... load all variations with any - --- genes that are mapped one-to-one from the gene association table - EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source) - WITH x AS - ( - SELECT - v.id - FROM _SESSION.temp_variation v - WHERE - NOT EXISTS ( - SELECT sgv.variation_id - FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id - ) - ) - SELECT - %T as release_date, - ga.variation_id, - STRING_AGG(ga.gene_id), - STRING_AGG(ga.relationship_type), - STRING_AGG(ga.source) - FROM x - JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id - group by ga.variation_id - having count(distinct ga.gene_id) = 1 - """, rec.schema_name, rec.schema_name, rec.release_date); - - --- step 4. for any variations remaining... load any variant with one submitted gene that - --- is not either "genes overlapped by variant" or "asserted, but not computed" - EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source ) - WITH x AS - ( - SELECT - v.id - FROM _SESSION.temp_variation v - WHERE - NOT EXISTS ( - SELECT sgv.variation_id - FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id - ) - ) - SELECT - %T as release_date, - ga.variation_id, - STRING_AGG(ga.gene_id), - STRING_AGG(ga.relationship_type), - STRING_AGG(ga.source) - FROM x - JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id - WHERE - ga.relationship_type not in ('genes overlapped by variant' , 'asserted, but not computed') AND - ga.source = 'submitted' - GROUP BY ga.variation_id - HAVING count(distinct ga.gene_id) = 1 - """, rec.schema_name, rec.schema_name, rec.release_date); - - --- step 5. for any variations remaining... load any variations with a "within single gene" - --- as long as it is associated to only one gene for that variant - EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source) - WITH x AS - ( - SELECT - v.id - FROM _SESSION.temp_variation v - WHERE - NOT EXISTS ( - SELECT sgv.variation_id - FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id - ) - ) - SELECT - %T as release_date, - ga.variation_id, - STRING_AGG(ga.gene_id), - STRING_AGG(ga.relationship_type), - STRING_AGG(ga.source) - FROM x - JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id - WHERE (ga.relationship_type = 'within single gene') - GROUP BY ga.variation_id - having count(ga.gene_id) = 1 - """, rec.schema_name, rec.schema_name, rec.release_date); - - --- last step. for any variations remaining... load any haplotype or genotype variations only if all the children have the same gene_id - EXECUTE IMMEDIATE FORMAT(""" - INSERT INTO `%s.single_gene_variation` - (release_date, variation_id, gene_id, relationship_type, source ) - WITH x AS - ( - SELECT - v.id - FROM _SESSION.temp_variation v - WHERE - NOT EXISTS ( - SELECT sgv.variation_id - FROM `%s.single_gene_variation` sgv - WHERE sgv.variation_id = v.id - ) - ) - SELECT - %T as release_date, - v.id as variation_id, - STRING_AGG(ga.gene_id) as gene_id, - 'association not provided by clinvar' as relationship_type, - 'cvc calculated' as source - FROM x - JOIN _SESSION.temp_variation v ON x.id = v.id - CROSS JOIN UNNEST(v.descendant_ids) AS descendant_id - JOIN _SESSION.temp_variation d ON - d.id = descendant_id AND - d.subclass_type='SimpleAllele' - LEFT JOIN `%s.gene_association` ga on ga.variation_id = d.id - WHERE ARRAY_LENGTH(v.descendant_ids) > 0 - GROUP BY v.id - HAVING COUNT(ga.gene_id) = 1 - """, rec.schema_name, rec.schema_name, rec.release_date, rec.schema_name); - - --- Finally, update somatic flags based on current onco-gene list (should this be a - --- 'change over time' capture of the onco-gene list's state?) - EXECUTE IMMEDIATE FORMAT(""" - UPDATE `%s.single_gene_variation` sgv - SET sgv.somatic = TRUE - WHERE EXISTS ( - SELECT cg.hgnc_id from `clinvar_ingest.cancer_genes` cg - join `clinvar_ingest.entrez_gene` g on g.hgnc_id = cg.hgnc_id - WHERE g.gene_id = sgv.gene_id - ) - """, rec.schema_name); - - DROP TABLE _SESSION.temp_variation; - DROP TABLE _SESSION.temp_gene_assoc; - - END FOR; - -END; - - - --- -- PRE 2019-07-01 data setup one-off --- -- remove duplicates from contrived gene_association table in pre-2019-07-01 data --- CREATE OR REPLACE TABLE `clinvar_2019_06_01_v0.gene_association` --- AS --- SELECT x.release_date, x.gene_id, x.variation_id, x.relationship_type, x.source --- FROM ( --- SELECT --- release_date, gene_id, variation_id, relationship_type, source, --- ROW_NUMBER() --- OVER (PARTITION BY release_date, gene_id, variation_id) --- row_number --- FROM `clinvar_2019_06_01_v0.gene_association` --- ) x --- WHERE x.row_number = 1 --- ; - --- -- contrive a single_gene_variation table for the pre-2019-07-01 data --- CREATE OR REPLACE TABLE `clinvar_2019_06_01_v0.single_gene_variation` --- AS --- WITH x AS --- ( --- select release_date, variation_id --- from `clinvar_2019_06_01_v0.gene_association` --- group by release_date, variation_id --- having count(*) = 1 --- ) --- SELECT --- x.release_date, --- ga.gene_id, --- x.variation_id, --- ga.relationship_type, --- ga.source, --- (cg.hgnc_id IS NOT NULL) as somatic, --- FALSE as mane_select --- FROM x --- join `clinvar_2019_06_01_v0.gene_association` ga on x.release_date = ga.release_date and x.variation_id = ga.variation_id --- left join `clinvar_2019_06_01_v0.gene` g on g.id = ga.gene_id and x.release_date = g.release_date --- left join `clinvar_ingest.cancer_genes` cg on g.hgnc_id = cg.hgnc_id --- ; - - - -- -- validate between steps?... - -- select vsg.* - -- from `clinvar_2022_05_17_v1_6_46.single_gene_variation` vsg - -- join ( select vsg2.variation_id - -- from `clinvar_2022_05_17_v1_6_46.single_gene_variation` vsg2 - -- group by vsg2.variation_id having count(distinct vsg2.gene_id) > 1) vsg2 on vsg.variation_id = vsg2.variation_id - -- order by 1,3; - - - -- -- Helper: To Find duplicate gene ids in release - -- select count(distinct ga.variation_id), ga.gene_id, g.symbol, g.hgnc_id - -- from `clinvar_2022_05_17_v1_6_46.gene_association` ga, - -- `clinvar_2022_05_17_v1_6_46.gene` g, - -- (SELECT ARRAY_AGG(id) as gene_ids - -- from `clinvar_2022_05_17_v1_6_46.gene` g - -- group by hgnc_id, symbol, release_date, full_name - -- having count(distinct id)>1 ) as gx, - -- UNNEST(gx.gene_ids) as gid - -- WHERE gid = ga.gene_id and g.id = ga.gene_id - -- group by ga.gene_id, g.symbol, g.hgnc_id - -- order by 3,1; \ No newline at end of file diff --git a/scripts/ds-staging-table-procs/03-scv-summary-v1-proc.sql b/scripts/ds-staging-table-procs/03-scv-summary-v1-proc.sql new file mode 100644 index 0000000..33c290c --- /dev/null +++ b/scripts/ds-staging-table-procs/03-scv-summary-v1-proc.sql @@ -0,0 +1,158 @@ +CREATE OR REPLACE PROCEDURE `clinvar_ingest.scv_summary_v1_proc`( + schema_name STRING +) +BEGIN + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TABLE `%s.scv_summary` AS + WITH obs_sample AS ( + SELECT + REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, + `clinvar_ingest.parseSample`(obs.content) as s + FROM + `%s.clinical_assertion_observation` obs + ), + obs_method AS ( + SELECT + REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, + m.description as method_desc, + m.method_type + FROM + `%s.clinical_assertion_observation` obs, + UNNEST(`clinvar_ingest.parseMethods`(obs.content)) as m + + ), + assertion_method AS ( + SELECT + ca.id, + a.attribute.type, + a.attribute.value, + STRING_AGG(DISTINCT c.url,';') as url + FROM `%s.clinical_assertion` ca, + UNNEST(`clinvar_ingest.parseAttributeSet`(ca.content)) as a, + UNNEST(a.citation) as c + WHERE + a.attribute.type = "AssertionMethod" + and + c.url is not null + GROUP BY + ca.id, + a.attribute.type, + a.attribute.value + ), + obs AS ( + SELECT + ca.id, + STRING_AGG(DISTINCT os.s.origin, ", " ORDER BY os.s.origin) as origin, + STRING_AGG(DISTINCT os.s.affected_status, ", " ORDER BY os.s.affected_status) as affected_status, + STRING_AGG(DISTINCT om.method_desc, ", " ORDER BY om.method_desc) as method_desc, + STRING_AGG(DISTINCT om.method_type, ", " ORDER BY om.method_type) as method_type + FROM + `%s.clinical_assertion` ca + LEFT JOIN obs_sample os ON os.id = ca.id + LEFT JOIN obs_method om ON om.id = ca.id + GROUP BY + ca.id + ), + scv_classification_comment AS ( + SELECT + id, + STRING_AGG(JSON_EXTRACT_SCALAR(c, r'$.text'), '\\n') as text + FROM + `%s.clinical_assertion` ca + LEFT JOIN UNNEST(ca.interpretation_comments) as c + WHERE ARRAY_LENGTH(ca.interpretation_comments) > 0 + GROUP BY + id + ), + scv AS ( + SELECT + ca.release_date as release_date, + ca.id, + ca.version, + ca.variation_id, + ca.local_key, + ca.interpretation_date_last_evaluated as last_evaluated, + cvs.rank, + ca.review_status, + cst.clinvar_prop_type as clinvar_stmt_type, + cst.cvc_prop_type as cvc_stmt_type, + ca.interpretation_description as submitted_classification, + IFNULL(map.cv_clinsig_type, '-') as classif_type, + cst.significance, + ca.submitter_id, + ca.submission_id, + ca.clinical_assertion_observation_ids, + `clinvar_ingest.parseComments`(ca.content) as comments, + scc.text as classification_comment, + ca.date_created, + ca.date_last_updated + FROM + `%s.clinical_assertion` ca + LEFT JOIN scv_classification_comment scc + ON + scc.id = ca.id + LEFT JOIN `clinvar_ingest.scv_clinsig_map` map + ON + map.scv_term = lower(IF(ca.id IS NULL, NULL, IFNULL(ca.interpretation_description,'not provided'))) + LEFT JOIN `clinvar_ingest.clinvar_clinsig_types` cst + ON + cst.code = map.cv_clinsig_type + LEFT JOIN `clinvar_ingest.clinvar_status` cvs + ON + cvs.label = ca.review_status + ) + SELECT + scv.release_date, + scv.id, + scv.version, + scv.variation_id, + scv.local_key, + scv.last_evaluated, + scv.rank, + scv.review_status, + scv.clinvar_stmt_type, + scv.cvc_stmt_type, + scv.submitted_classification, + scv.classif_type, + scv.significance, + scv.comments, + scv.classification_comment, + scv.date_created, + scv.date_last_updated, + scv.clinical_assertion_observation_ids, + scv.submitter_id, + subm.submission_date, + obs.origin, + obs.affected_status, + obs.method_desc, + obs.method_type, + am.value as assertion_method, + am.url as assertion_method_url + FROM scv + LEFT JOIN obs + ON + obs.id = scv.id + LEFT JOIN assertion_method am + ON + am.id = scv.id + LEFT JOIN `%s.submission` subm + ON + subm.id = scv.submission_id + """, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name); +END; + + +-- repair any null review_status values +-- set any NULL review_status either 'no assertion provided' (if interp_desc is null or not provided) or 'no assertion criteria provided' (otherwise) +-- +-- UPDATE `%s.clinical_assertion` ca +-- SET +-- ca.review_status = +-- IF( +-- ca.interpretation_description is null OR +-- ca.interpretation_description = 'not provided', +-- 'no assertion provided', +-- 'no assertion criteria provided' +-- ) +-- WHERE ca.review_status IS NULL +-- ; \ No newline at end of file diff --git a/scripts/ds-staging-table-procs/03-scv-summary-v2-proc.sql b/scripts/ds-staging-table-procs/03-scv-summary-v2-proc.sql new file mode 100644 index 0000000..381d0a5 --- /dev/null +++ b/scripts/ds-staging-table-procs/03-scv-summary-v2-proc.sql @@ -0,0 +1,134 @@ +CREATE OR REPLACE PROCEDURE `clinvar_ingest.scv_summary_v2_proc`( + schema_name STRING +) +BEGIN + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TABLE `%s.scv_summary` AS + WITH obs_sample AS ( + SELECT + REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, + `clinvar_ingest.parseSample`(obs.content) as s + FROM + `%s.clinical_assertion_observation` obs + ), + obs_method AS ( + SELECT + REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, + m.description as method_desc, + m.method_type + FROM + `%s.clinical_assertion_observation` obs, + UNNEST(`clinvar_ingest.parseMethods`(obs.content)) as m + ), + assertion_method AS ( + SELECT + ca.id, + a.attribute.type, + a.attribute.value, + STRING_AGG(DISTINCT c.url,';') as url + FROM `%s.clinical_assertion` ca, + UNNEST(`clinvar_ingest.parseAttributeSet`(ca.content)) as a, + UNNEST(a.citation) as c + WHERE + a.attribute.type = "AssertionMethod" + and + c.url is not null + GROUP BY + ca.id, + a.attribute.type, + a.attribute.value + ), + obs AS ( + SELECT + ca.id, + STRING_AGG(DISTINCT os.s.origin, ", " ORDER BY os.s.origin) as origin, + STRING_AGG(DISTINCT os.s.affected_status, ", " ORDER BY os.s.affected_status) as affected_status, + STRING_AGG(DISTINCT om.method_desc, ", " ORDER BY om.method_desc) as method_desc, + STRING_AGG(DISTINCT om.method_type, ", " ORDER BY om.method_type) as method_type + FROM + `%s.clinical_assertion` ca + LEFT JOIN obs_sample os ON os.id = ca.id + LEFT JOIN obs_method om ON om.id = ca.id + GROUP BY + ca.id + ), + scv_classification_comment AS ( + SELECT + id, + STRING_AGG(JSON_EXTRACT_SCALAR(c, r'$.text'), '\\n') as text + FROM + `%s.clinical_assertion` ca + LEFT JOIN UNNEST(ca.interpretation_comments) as c + WHERE ARRAY_LENGTH(ca.interpretation_comments) > 0 + GROUP BY + id + ) + SELECT + ca.release_date as release_date, + ca.id, + ca.version, + ca.variation_id, + ca.local_key, + ca.interpretation_date_last_evaluated as last_evaluated, + ca.statement_type, + cvs.rank, + ca.review_status, + cst.clinvar_prop_type as clinvar_stmt_type, + cst.cvc_prop_type as cvc_stmt_type, + ca.interpretation_description as submitted_classification, + IFNULL(map.cv_clinsig_type, '-') as classif_type, + cst.significance, + `clinvar_ingest.parseComments`(ca.content) as comments, + scc.text as classification_comment, + ca.date_created, + ca.date_last_updated, + ca.clinical_assertion_observation_ids, + ca.submitter_id, + subm.submission_date, + obs.origin, + obs.affected_status, + obs.method_desc, + obs.method_type, + am.value as assertion_method, + am.url as assertion_method_url + FROM + `%s.clinical_assertion` ca + LEFT JOIN scv_classification_comment scc + ON + scc.id = ca.id + LEFT JOIN `clinvar_ingest.scv_clinsig_map` map + ON + map.scv_term = lower(IFNULL(ca.interpretation_description,'not provided')) + LEFT JOIN `clinvar_ingest.clinvar_clinsig_types` cst + ON + cst.code = map.cv_clinsig_type + LEFT JOIN `clinvar_ingest.clinvar_status` cvs + ON + cvs.label = ca.review_status + LEFT JOIN obs + ON + obs.id = ca.id + LEFT JOIN assertion_method am + ON + am.id = ca.id + LEFT JOIN `%s.submission` subm + ON + subm.id = ca.submission_id + """, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name); +END; + + +-- repair any null review_status values +-- set any NULL review_status either 'no assertion provided' (if interp_desc is null or not provided) or 'no assertion criteria provided' (otherwise) +-- +-- UPDATE `%s.clinical_assertion` ca +-- SET +-- ca.review_status = +-- IF( +-- ca.interpretation_description is null OR +-- ca.interpretation_description = 'not provided', +-- 'no assertion provided', +-- 'no assertion criteria provided' +-- ) +-- WHERE ca.review_status IS NULL +-- ; \ No newline at end of file diff --git a/scripts/ds-staging-table-procs/04-single-gene-variation-v1-proc.sql b/scripts/ds-staging-table-procs/04-single-gene-variation-v1-proc.sql new file mode 100644 index 0000000..02a87bd --- /dev/null +++ b/scripts/ds-staging-table-procs/04-single-gene-variation-v1-proc.sql @@ -0,0 +1,363 @@ +-- see below for helper / setup scripts +CREATE OR REPLACE PROCEDURE `clinvar_ingest.single_gene_variation_v1`( + schema_name STRING, + release_date DATE +) +BEGIN + -- single gene variation plan + --- step 1. create a table with columns variation_id, gene_id, somatic_flag + --- where the variation_id is the pk. + EXECUTE IMMEDIATE FORMAT(""" + CREATE OR REPLACE TABLE `%s.single_gene_variation` + ( release_date DATE NOT NULL, + variation_id STRING NOT NULL, + gene_id STRING NOT NULL, + relationship_type STRING, + source STRING, + mane_select BOOL DEFAULT FALSE, + somatic BOOL DEFAULT FALSE) + """, schema_name); + + -- create a temp table that is the list of remaining variations so as to reduce the query cost of analyzing against the variation table. + EXECUTE IMMEDIATE FORMAT(""" + CREATE TEMP TABLE _SESSION.temp_variation + AS + SELECT + v.id, + v.name, + v.descendant_ids, + v.subclass_type + FROM `%s.variation` v + """, schema_name); + + --- step 2. (Resolvable Gene Symbol in variation name takes precedence as "single gene for variant") + --- initialize the set with the extracted variation name gene symbols and the associated relationship info if available + + -- prioritize mane select transcripts in title of variant + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO `%s.single_gene_variation` ( + release_date, + variation_id, + gene_id, + relationship_type, + source, + mane_select + ) + SELECT DISTINCT + %T as release_date, + v.id as variation_id, + mane.gene_id, + IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, + IFNULL(ga.source,'cvc calculated') as source, + TRUE as mane_select + FROM `clinvar_ingest.mane_select_gene_transcript` mane + JOIN _SESSION.temp_variation v + ON + STARTS_WITH(v.name, mane.transcript_id) + JOIN `clinvar_ingest.entrez_gene` g + ON + g.gene_id = mane.gene_id + LEFT JOIN `%s.gene_association` ga + ON + ga.gene_id = g.gene_id + and + ga.variation_id = v.id + """, schema_name, release_date, schema_name); + + -- create a temp table that is the list of gene associations + -- for variations not yet in the single gene var table and + -- don't have the gene symbol ending in an -AS# suffix + EXECUTE IMMEDIATE FORMAT(""" + CREATE TEMP TABLE _SESSION.temp_gene_assoc + AS + SELECT DISTINCT + ga.variation_id, + ga.gene_id, + ga.relationship_type, + ga.source + FROM `%s.gene_association` ga + JOIN `clinvar_ingest.entrez_gene` g + ON + ga.gene_id = g.gene_id + WHERE + NOT REGEXP_CONTAINS(g.symbol_from_authority, r'\\-AS\\d$') + AND + NOT EXISTS ( + SELECT + sgv.variation_id + FROM `%s.single_gene_variation` sgv + WHERE + sgv.variation_id = ga.variation_id + ) + """, schema_name, schema_name); + + -- NOTE: clinvar has a handful of duplicate gene records that can change over time + -- the plan will be to test the results of loading the variation_single_gene + -- table to verify that any variants with multiple genes are simply duplicates and + -- either one can be removed without an issue. + + -- clinvar perferred label hgvs-style format NM_0000.0(GENE):c.234... (not mane select but still in name) + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO `%s.single_gene_variation` + (release_date, variation_id, gene_id, relationship_type, source) + WITH x AS + ( + SELECT + v.id, + v.name, + REGEXP_EXTRACT(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') as symbol + FROM _SESSION.temp_variation v + WHERE REGEXP_CONTAINS(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') AND + NOT EXISTS ( + SELECT sgv.variation_id + FROM `%s.single_gene_variation` sgv + WHERE sgv.variation_id = v.id + ) + ) + SELECT + %T as release_date, + x.id as variation_id, + g.gene_id, + IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, + IFNULL(ga.source,'cvc calculated') as source + FROM x + JOIN `clinvar_ingest.entrez_gene` g on UPPER(g.symbol_from_authority) = UPPER(x.symbol) AND NOT REGEXP_CONTAINS(x.symbol, r'\\-AS\\d$') + LEFT JOIN _SESSION.temp_gene_assoc ga on ga.variation_id = x.id and ga.gene_id = g.gene_id + """, schema_name, schema_name, release_date); + + + -- star allele format, CYP2C19*10. + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO `%s.single_gene_variation` + (release_date, variation_id, gene_id, relationship_type, source) + WITH x AS + ( + SELECT + v.id, + v.name, + REGEXP_EXTRACT(v.name, r'^([A-Za-z0-9\\-]+)[\\*\\,]') as symbol + FROM _SESSION.temp_variation v + WHERE REGEXP_CONTAINS(v.name, r'^([A-Za-z0-9\\-]+)[\\*\\,]') AND + NOT EXISTS ( + SELECT sgv.variation_id + FROM `%s.single_gene_variation` sgv + WHERE sgv.variation_id = v.id + ) + ) + SELECT + %T as release_date, + x.id as variation_id, + g.gene_id, + IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, + IFNULL(ga.source,'cvc calculated') as source + FROM x + JOIN `clinvar_ingest.entrez_gene` g ON UPPER(x.symbol) = UPPER(g.symbol_from_authority) + LEFT JOIN _SESSION.temp_gene_assoc ga ON x.id = ga.variation_id AND g.gene_id = ga.gene_id + """, schema_name, schema_name, release_date); + + --- step 3. for any variations remaining... load all variations with any + --- genes that are mapped one-to-one from the gene association table + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO `%s.single_gene_variation` + (release_date, variation_id, gene_id, relationship_type, source) + WITH x AS + ( + SELECT + v.id + FROM _SESSION.temp_variation v + WHERE + NOT EXISTS ( + SELECT sgv.variation_id + FROM `%s.single_gene_variation` sgv + WHERE sgv.variation_id = v.id + ) + ) + SELECT + %T as release_date, + ga.variation_id, + STRING_AGG(ga.gene_id), + STRING_AGG(ga.relationship_type), + STRING_AGG(ga.source) + FROM x + JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id + group by ga.variation_id + having count(distinct ga.gene_id) = 1 + """, schema_name, schema_name, release_date); + + --- step 4. for any variations remaining... load any variant with one submitted gene that + --- is not either "genes overlapped by variant" or "asserted, but not computed" + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO `%s.single_gene_variation` + (release_date, variation_id, gene_id, relationship_type, source ) + WITH x AS + ( + SELECT + v.id + FROM _SESSION.temp_variation v + WHERE + NOT EXISTS ( + SELECT sgv.variation_id + FROM `%s.single_gene_variation` sgv + WHERE sgv.variation_id = v.id + ) + ) + SELECT + %T as release_date, + ga.variation_id, + STRING_AGG(ga.gene_id), + STRING_AGG(ga.relationship_type), + STRING_AGG(ga.source) + FROM x + JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id + WHERE + ga.relationship_type not in ('genes overlapped by variant' , 'asserted, but not computed') AND + ga.source = 'submitted' + GROUP BY ga.variation_id + HAVING count(distinct ga.gene_id) = 1 + """, schema_name, schema_name, release_date); + + --- step 5. for any variations remaining... load any variations with a "within single gene" + --- as long as it is associated to only one gene for that variant + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO `%s.single_gene_variation` + (release_date, variation_id, gene_id, relationship_type, source) + WITH x AS + ( + SELECT + v.id + FROM _SESSION.temp_variation v + WHERE + NOT EXISTS ( + SELECT sgv.variation_id + FROM `%s.single_gene_variation` sgv + WHERE sgv.variation_id = v.id + ) + ) + SELECT + %T as release_date, + ga.variation_id, + STRING_AGG(ga.gene_id), + STRING_AGG(ga.relationship_type), + STRING_AGG(ga.source) + FROM x + JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id + WHERE (ga.relationship_type = 'within single gene') + GROUP BY ga.variation_id + having count(ga.gene_id) = 1 + """, schema_name, schema_name, release_date); + + --- last step. for any variations remaining... load any haplotype or genotype variations only if all the children have the same gene_id + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO `%s.single_gene_variation` + (release_date, variation_id, gene_id, relationship_type, source ) + WITH x AS + ( + SELECT + v.id + FROM _SESSION.temp_variation v + WHERE + NOT EXISTS ( + SELECT sgv.variation_id + FROM `%s.single_gene_variation` sgv + WHERE sgv.variation_id = v.id + ) + ) + SELECT + %T as release_date, + v.id as variation_id, + STRING_AGG(ga.gene_id) as gene_id, + 'association not provided by clinvar' as relationship_type, + 'cvc calculated' as source + FROM x + JOIN _SESSION.temp_variation v ON x.id = v.id + CROSS JOIN UNNEST(v.descendant_ids) AS descendant_id + JOIN _SESSION.temp_variation d ON + d.id = descendant_id AND + d.subclass_type='SimpleAllele' + LEFT JOIN `%s.gene_association` ga on ga.variation_id = d.id + WHERE ARRAY_LENGTH(v.descendant_ids) > 0 + GROUP BY v.id + HAVING COUNT(ga.gene_id) = 1 + """, schema_name, schema_name, release_date, schema_name); + + --- Finally, update somatic flags based on current onco-gene list (should this be a + --- 'change over time' capture of the onco-gene list's state?) + EXECUTE IMMEDIATE FORMAT(""" + UPDATE `%s.single_gene_variation` sgv + SET sgv.somatic = TRUE + WHERE EXISTS ( + SELECT cg.hgnc_id from `clinvar_ingest.cancer_genes` cg + join `clinvar_ingest.entrez_gene` g on g.hgnc_id = cg.hgnc_id + WHERE g.gene_id = sgv.gene_id + ) + """, schema_name); + + DROP TABLE _SESSION.temp_variation; + DROP TABLE _SESSION.temp_gene_assoc; + +END; + + + +-- -- PRE 2019-07-01 data setup one-off +-- -- remove duplicates from contrived gene_association table in pre-2019-07-01 data +-- CREATE OR REPLACE TABLE `clinvar_2019_06_01_v0.gene_association` +-- AS +-- SELECT x.release_date, x.gene_id, x.variation_id, x.relationship_type, x.source +-- FROM ( +-- SELECT +-- release_date, gene_id, variation_id, relationship_type, source, +-- ROW_NUMBER() +-- OVER (PARTITION BY release_date, gene_id, variation_id) +-- row_number +-- FROM `clinvar_2019_06_01_v0.gene_association` +-- ) x +-- WHERE x.row_number = 1 +-- ; + +-- -- contrive a single_gene_variation table for the pre-2019-07-01 data +-- CREATE OR REPLACE TABLE `clinvar_2019_06_01_v0.single_gene_variation` +-- AS +-- WITH x AS +-- ( +-- select release_date, variation_id +-- from `clinvar_2019_06_01_v0.gene_association` +-- group by release_date, variation_id +-- having count(*) = 1 +-- ) +-- SELECT +-- x.release_date, +-- ga.gene_id, +-- x.variation_id, +-- ga.relationship_type, +-- ga.source, +-- (cg.hgnc_id IS NOT NULL) as somatic, +-- FALSE as mane_select +-- FROM x +-- join `clinvar_2019_06_01_v0.gene_association` ga on x.release_date = ga.release_date and x.variation_id = ga.variation_id +-- left join `clinvar_2019_06_01_v0.gene` g on g.id = ga.gene_id and x.release_date = g.release_date +-- left join `clinvar_ingest.cancer_genes` cg on g.hgnc_id = cg.hgnc_id +-- ; + + + -- -- validate between steps?... + -- select vsg.* + -- from `clinvar_2022_05_17_v1_6_46.single_gene_variation` vsg + -- join ( select vsg2.variation_id + -- from `clinvar_2022_05_17_v1_6_46.single_gene_variation` vsg2 + -- group by vsg2.variation_id having count(distinct vsg2.gene_id) > 1) vsg2 on vsg.variation_id = vsg2.variation_id + -- order by 1,3; + + + -- -- Helper: To Find duplicate gene ids in release + -- select count(distinct ga.variation_id), ga.gene_id, g.symbol, g.hgnc_id + -- from `clinvar_2022_05_17_v1_6_46.gene_association` ga, + -- `clinvar_2022_05_17_v1_6_46.gene` g, + -- (SELECT ARRAY_AGG(id) as gene_ids + -- from `clinvar_2022_05_17_v1_6_46.gene` g + -- group by hgnc_id, symbol, release_date, full_name + -- having count(distinct id)>1 ) as gx, + -- UNNEST(gx.gene_ids) as gid + -- WHERE gid = ga.gene_id and g.id = ga.gene_id + -- group by ga.gene_id, g.symbol, g.hgnc_id + -- order by 3,1; \ No newline at end of file diff --git a/scripts/ds-staging-table-procs/03-gc-scv-proc.sql b/scripts/ds-staging-table-procs/05-gc-scv-v1-proc.sql similarity index 85% rename from scripts/ds-staging-table-procs/03-gc-scv-proc.sql rename to scripts/ds-staging-table-procs/05-gc-scv-v1-proc.sql index c00ebd5..8f0c1b5 100644 --- a/scripts/ds-staging-table-procs/03-gc-scv-proc.sql +++ b/scripts/ds-staging-table-procs/05-gc-scv-v1-proc.sql @@ -1,8 +1,7 @@ -CREATE OR REPLACE PROCEDURE - `clinvar_ingest.gc_scv_proc`(start_with DATE) +CREATE OR REPLACE PROCEDURE `clinvar_ingest.gc_scv_v1`( + schema_name STRING +) BEGIN - FOR rec IN (SELECT s.schema_name, s.release_date, s.prev_release_date, s.next_release_date FROM clinvar_ingest.schemas_on_or_after(start_with) AS s) - DO EXECUTE IMMEDIATE FORMAT(""" CREATE OR REPLACE TABLE `%s.gc_scv` AS @@ -29,13 +28,15 @@ BEGIN SELECT od.attribute.value FROM UNNEST(`clinvar_ingest.parseObservedData`(cao.content)) od - WHERE od.attribute.type = 'SampleLocalID' + WHERE + od.attribute.type = 'SampleLocalID' ) as sample_id, ( SELECT od.attribute.value FROM UNNEST(`clinvar_ingest.parseObservedData`(cao.content)) od - WHERE od.attribute.type = 'SampleVariantID' + WHERE + od.attribute.type = 'SampleVariantID' ) as sample_variant_id, cao.id as scv_obs_id FROM `variation_tracker.report_submitter` rs @@ -73,6 +74,5 @@ BEGIN lower(cct.label) = lower(tc.lab.classification) WHERE IFNULL(tc.lab.name, IFNULL(tc.lab.id,tc.lab.classification)) IS NOT NULL - """, rec.schema_name, rec.schema_name, rec.schema_name); - END FOR; + """, schema_name, schema_name, schema_name); END; \ No newline at end of file diff --git a/scripts/ds-staging-table-procs/dataset-preparation-v1-proc.sql b/scripts/ds-staging-table-procs/dataset-preparation-v1-proc.sql new file mode 100644 index 0000000..0183aa8 --- /dev/null +++ b/scripts/ds-staging-table-procs/dataset-preparation-v1-proc.sql @@ -0,0 +1,20 @@ +CREATE OR REPLACE PROCEDURE `clinvar_ingest.dataset_preparation_v1`( + on_date DATE +) +BEGIN + FOR rec IN ( + select + s.schema_name, + s.release_date, + s.prev_release_date, + s.next_release_date + FROM clinvar_ingest.schema_on_v2(on_date) as s + ) + DO + CALL `clinvar_ingest.validate_dataset_terms_v1`(rec.schema_name); + CALL `clinvar_ingest.scv_summary_v1`(rec.schema_name); + CALL `clinvar_ingest.single_gene_variation_v1`(rec.schema_name, rec.release_date); + CALL `clinvar_ingest.gc_scv_v1`(rec.schema_name, rec.release_date); + END FOR; + +END; \ No newline at end of file diff --git a/scripts/ds-staging-table-procs/dataset-preparation-v2-proc.sql b/scripts/ds-staging-table-procs/dataset-preparation-v2-proc.sql new file mode 100644 index 0000000..7b5ae47 --- /dev/null +++ b/scripts/ds-staging-table-procs/dataset-preparation-v2-proc.sql @@ -0,0 +1,21 @@ +CREATE OR REPLACE PROCEDURE `clinvar_ingest.dataset_preparation_v2`( + on_date DATE +) +BEGIN + FOR rec IN ( + select + s.schema_name, + s.release_date, + s.prev_release_date, + s.next_release_date + FROM clinvar_ingest.schema_on_v2(on_date) as s + ) + DO + CALL `clinvar_ingest.validate_dataset_v1`(rec.schema_name); + CALL `clinvar_ingest.normalize_dataset_v2`(rec.schema_name); + CALL `clinvar_ingest.scv_summary_v2`(rec.schema_name); + CALL `clinvar_ingest.single_gene_variation_v1`(rec.schema_name, rec.release_date); + CALL `clinvar_ingest.gc_scv_v1`(rec.schema_name); + END FOR; + +END; \ No newline at end of file diff --git a/scripts/general/schemas-on-or-after-v2-func.sql b/scripts/general/schemas-on-or-after-v2-func.sql index a73b1ba..905b77f 100644 --- a/scripts/general/schemas-on-or-after-v2-func.sql +++ b/scripts/general/schemas-on-or-after-v2-func.sql @@ -14,4 +14,4 @@ AS ( ORDER BY 2 ); --- select * from `clinvar_ingest.schemas_on_or_after`(DATE('2020-06-01')); \ No newline at end of file +-- select * from `clinvar_ingest.schemas_on_or_after_v2`(DATE('2020-06-01')); \ No newline at end of file diff --git a/scripts/general/tables-columns-exists-proc.sql b/scripts/general/tables-columns-exists-proc.sql new file mode 100644 index 0000000..4e69d92 --- /dev/null +++ b/scripts/general/tables-columns-exists-proc.sql @@ -0,0 +1,30 @@ +CREATE OR REPLACE PROCEDURE `clinvar_ingest.check_table_exists`( + schema_name STRING, + table_name STRING, + OUT table_exists BOOL +) +BEGIN + EXECUTE IMMEDIATE FORMAT(""" + SELECT EXISTS( + SELECT 1 + FROM `%s.INFORMATION_SCHEMA.TABLES` + WHERE table_name = '%s' + ) + """, schema_name, table_name) INTO table_exists; +END; + +CREATE OR REPLACE PROCEDURE `clinvar_ingest.check_column_exists`( + schema_name STRING, + table_name STRING, + column_name STRING, + OUT column_exists BOOL +) +BEGIN + EXECUTE IMMEDIATE FORMAT(""" + SELECT EXISTS( + SELECT 1 + FROM `%s.INFORMATION_SCHEMA.COLUMNS` + WHERE table_name = '%s' AND column_name = '%s' + ) + """, schema_name, table_name, column_name) INTO column_exists; +END; \ No newline at end of file