Skip to content

Commit

Permalink
merge v2 and original XML processing into single procs that are proje…
Browse files Browse the repository at this point in the history
…ct aware
  • Loading branch information
larrybabb committed Dec 24, 2024
1 parent 7ed98d9 commit 21ee147
Show file tree
Hide file tree
Showing 35 changed files with 3,131 additions and 3,073 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ BEGIN
FROM `INFORMATION_SCHEMA.SCHEMATA`
WHERE schema_name = 'clinvar_ingest');

IF (project_id = 'clingen_stage') THEN
IF (project_id = 'clingen-stage') THEN
CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_clinsig_types`
AS
SELECT
Expand Down
37 changes: 37 additions & 0 deletions scripts/dataset-preparation/01-validate-ds-terms-proc.sql
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@

CREATE OR REPLACE PROCEDURE `clinvar_ingest.validate_dataset_terms`(
schema_name STRING
)
BEGIN
-- Declare variables to hold results and error messages
DECLARE scv_classification_terms ARRAY<STRING>;
DECLARE scv_classification_statement_combo_terms ARRAY<STRING>;
DECLARE scv_review_status_terms ARRAY<STRING>;
DECLARE combined_issues STRING;
DECLARE project_id STRING;

SET project_id = (
SELECT
catalog_name as paroject_id
FROM `INFORMATION_SCHEMA.SCHEMATA`
WHERE
schema_name = 'clinvar_ingest'
);

-- Check for new interpretation_descriptions in clinical_assertion
EXECUTE IMMEDIATE FORMAT("""
Expand All @@ -18,6 +29,24 @@ BEGIN
map.scv_term IS NULL
""", schema_name) INTO scv_classification_terms;

-- The statement_type-aware check only runs when the project is not 'clingen-stage'.
IF (project_id <> 'clingen-stage') THEN
  -- Check for interpretation_description + statement_type combos not available in clinvar_clinsig_types.
  -- The two LEFT JOINs map each clinical_assertion row to a clinsig type via scv_clinsig_map and
  -- then to clinvar_clinsig_types on (code, statement_type); rows with no match (cst.code IS NULL)
  -- are reported as distinct 'description + statement_type' strings.
  EXECUTE IMMEDIATE FORMAT("""
    SELECT
      ARRAY_AGG(DISTINCT IFNULL(ca.interpretation_description,'null') || ' + ' || ca.statement_type)
    FROM `%s.clinical_assertion` ca
    LEFT JOIN `clinvar_ingest.scv_clinsig_map` map
    ON
      map.scv_term = lower(IFNULL(ca.interpretation_description,'not provided'))
    LEFT JOIN `clinvar_ingest.clinvar_clinsig_types` cst
    ON
      cst.code = map.cv_clinsig_type AND
      cst.statement_type = ca.statement_type
    WHERE
      cst.code IS NULL
  """, schema_name) INTO scv_classification_statement_combo_terms;
-- END IF must be terminated with a semicolon in BigQuery scripting.
END IF;

-- Check for new review_status terms in clinical_assertion
EXECUTE IMMEDIATE FORMAT("""
SELECT ARRAY_AGG(DISTINCT IFNULL(ca.review_status,'null'))
Expand All @@ -39,6 +68,14 @@ BEGIN
""", ARRAY_TO_STRING(scv_classification_terms, ', '));
END IF;

IF scv_classification_statement_combo_terms IS NOT NULL AND ARRAY_LENGTH(scv_classification_statement_combo_terms) > 0 THEN
SET combined_issues = FORMAT("""
%s
New SCV classification+statement_type combos found: [%s].
NOTE: Add clinvar_clinsig_types records to the '00-setup-translation-tables.sql' script and update, then rerun this script.
""", combined_issues, ARRAY_TO_STRING(scv_classification_statement_combo_terms, ', '));
END IF;

IF scv_review_status_terms IS NOT NULL AND ARRAY_LENGTH(scv_review_status_terms) > 0 THEN
SET combined_issues = FORMAT("""
%s
Expand Down
80 changes: 0 additions & 80 deletions scripts/dataset-preparation/01-validate-ds-terms-v2-proc.sql

This file was deleted.

221 changes: 221 additions & 0 deletions scripts/dataset-preparation/02-normalize-ds-proc.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
-- Procedure: clinvar_ingest.normalize_dataset
--
-- Brings an ingested ClinVar dataset up to the "v2" table layout so that downstream
-- procedures can treat every dataset the same way. The work is skipped entirely when
-- the project that owns the `clinvar_ingest` dataset is 'clingen-stage'.
--
-- Parameters:
--   schema_name STRING  Name of the dataset (schema) to normalize. It is interpolated
--                       into dynamic SQL as an identifier, so it must be a trusted value.
--
-- Effects (each step runs only when its indicator column/table is missing):
--   1. clinical_assertion: adds statement_type and the clinical_impact_* columns.
--   2. rcv_accession / rcv_accession_classification: splits the classification columns
--      out of rcv_accession, or converts an existing rcv_accession_classification that
--      lacks agg_classification.
--   3. variation_archive / variation_archive_classification: splits the classification
--      columns out of variation_archive.
--   Before a table is rewritten, a `backup_<table>` copy is created with a plain
--   CREATE TABLE, so a re-run after a partial failure stops if that backup already exists.
CREATE OR REPLACE PROCEDURE `clinvar_ingest.normalize_dataset`(
  schema_name STRING -- Name of schema/dataset
)
BEGIN
  DECLARE table_exists BOOL;
  DECLARE column_exists BOOL;
  DECLARE project_id STRING;

  -- Look up the project (catalog) that hosts the clinvar_ingest dataset.
  -- The SCHEMATA column `schema_name` has the same name as this procedure's parameter,
  -- so it is qualified with the table alias to make the column reference unambiguous.
  SET project_id = (
    SELECT
      s.catalog_name as project_id
    FROM `INFORMATION_SCHEMA.SCHEMATA` s
    WHERE
      s.schema_name = 'clinvar_ingest'
  );

  -- NOTE(review): if project_id is NULL this comparison is NULL and the whole
  -- normalization is silently skipped -- confirm that is the intended behavior.
  IF (project_id <> 'clingen-stage') THEN
    -- TABLE 1. Clinical Assertion
    -- check for clinical_assertion.statement_type column as THE indicator that the dataset has been normalized to v2
    -- (check_column_exists is defined elsewhere; presumably it sets column_exists via its last argument)
    CALL `clinvar_ingest.check_column_exists`(schema_name, 'clinical_assertion', 'statement_type', column_exists);

    -- if the column does not exist, add it with the default value
    IF NOT column_exists THEN
      -- backup the original clinical_assertion table
      EXECUTE IMMEDIATE FORMAT("""
        CREATE TABLE `%s.backup_clinical_assertion`
        AS
        SELECT
          *
        FROM `%s.clinical_assertion`
      """, schema_name, schema_name);

      -- create or replace the clinical_assertion table from the backup,
      -- appending the v2 columns with their default values
      EXECUTE IMMEDIATE FORMAT("""
        CREATE OR REPLACE TABLE `%s.clinical_assertion`
        AS
        SELECT
          *,
          'GermlineClassification' as statement_type,
          CAST(NULL as STRING) as clinical_impact_assertion_type,
          CAST(NULL as STRING) as clinical_impact_clinical_significance
        FROM `%s.backup_clinical_assertion`
      """, schema_name, schema_name);
    END IF;

    -- TABLE 2. RCV Accession & RCV Accession Classification (with corrections for v2 rcv_accession_classification.agg_classification column)
    -- check that the rcv_accession_classification table exists as THE indicator that the dataset has been normalized to v2
    CALL `clinvar_ingest.check_table_exists`(schema_name, 'rcv_accession_classification', table_exists);

    -- if the table does not exist, create it
    IF NOT table_exists THEN
      -- backup the original rcv_accession table
      EXECUTE IMMEDIATE FORMAT("""
        CREATE TABLE `%s.backup_rcv_accession`
        AS
        SELECT
          *
        FROM `%s.rcv_accession`
      """, schema_name, schema_name);

      -- create the rcv_accession_classification table from the backup
      -- NOTE(review): the STRUCT field order here (num_submissions first) differs from the
      -- order used in the conversion branch below (clinical_impact_* first), and no
      -- `content` column is produced here -- confirm the intended final schema.
      EXECUTE IMMEDIATE FORMAT("""
        CREATE TABLE `%s.rcv_accession_classification`
        AS
        SELECT
          release_date,
          id as rcv_id,
          'GermlineClassification' AS statement_type,
          review_status,
          [
            STRUCT(
              submission_count as num_submissions,
              date_last_evaluated,
              interpretation as interp_description,
              CAST(NULL as STRING) as clinical_impact_assertion_type,
              CAST(NULL as STRING) as clinical_impact_clinical_significance
            )
          ] as agg_classification
        FROM `%s.backup_rcv_accession`
      """, schema_name, schema_name);

      -- create or replace the rcv_accession table from the backup,
      -- keeping only the columns listed below
      EXECUTE IMMEDIATE FORMAT("""
        CREATE OR REPLACE TABLE `%s.rcv_accession`
        AS
        SELECT
          release_date,
          id,
          variation_id,
          independent_observations,
          variation_archive_id,
          version,
          title,
          trait_set_id,
          content
        FROM `%s.backup_rcv_accession`
      """, schema_name, schema_name);
    ELSE
      -- if the table exists, check if the agg_classification column exists
      CALL `clinvar_ingest.check_column_exists`(schema_name, 'rcv_accession_classification', 'agg_classification', column_exists);

      -- if the column does not exist, convert the v2 table to the final format
      IF NOT column_exists THEN
        -- backup the original rcv_accession_classification table
        EXECUTE IMMEDIATE FORMAT("""
          CREATE TABLE `%s.backup_rcv_accession_classification`
          AS
          SELECT
            *
          FROM `%s.rcv_accession_classification`
        """, schema_name, schema_name);

        -- create or replace the rcv_accession_classification table.
        -- Rows with content: agg_classification comes from parseAggDescription(content).description;
        --   content becomes NULL when it holds only the "Description" key, otherwise the
        --   "Description" key/value pair is stripped from it.
        -- Rows without content: agg_classification is built from the existing scalar columns.
        -- NOTE(review): both SELECTs read the live table being replaced, not the backup -- confirm intended.
        EXECUTE IMMEDIATE FORMAT("""
          CREATE OR REPLACE TABLE `%s.rcv_accession_classification`
          AS
          SELECT
            release_date,
            rcv_id,
            statement_type,
            review_status,
            `clinvar_ingest.parseAggDescription`(content).description as agg_classification,
            IF(
              REGEXP_CONTAINS(content, r'^{\\s*"Description"\\s*\\:\\s*"[^"]+"\\s*}'),
              NULL,
              REGEXP_REPLACE(content, r'"Description"\\s*\\:\\s*"[^"]+"\\s*,*\\s*', "")
            ) as content
          FROM `%s.rcv_accession_classification`
          WHERE
            content is not null
          UNION ALL
          SELECT
            release_date,
            rcv_id,
            statement_type,
            review_status,
            [
              STRUCT(
                clinical_impact_assertion_type,
                clinical_impact_clinical_significance,
                date_last_evaluated,
                num_submissions,
                interp_description
              )
            ] as agg_classification,
            content
          FROM `%s.rcv_accession_classification`
          WHERE
            content is null
        """, schema_name, schema_name, schema_name);
      END IF;
    END IF;

    -- TABLE 3. Variation Archive & Variation Archive Classification
    -- check that the variation_archive_classification table exists as THE indicator that the dataset has been normalized to v2
    CALL `clinvar_ingest.check_table_exists`(schema_name, 'variation_archive_classification', table_exists);

    -- if the table does not exist, create it
    IF NOT table_exists THEN
      -- backup the original variation_archive table
      EXECUTE IMMEDIATE FORMAT("""
        CREATE TABLE `%s.backup_variation_archive`
        AS
        SELECT
          *
        FROM `%s.variation_archive`
      """, schema_name, schema_name);

      -- create the variation_archive_classification table from the backup.
      -- most_recent_submission is read from content after renaming the
      -- '@MostRecentSubmission' key to 'MostRecentSubmission' for the JSON path lookup.
      EXECUTE IMMEDIATE FORMAT("""
        CREATE TABLE `%s.variation_archive_classification`
        AS
        SELECT
          id as vcv_id,
          'GermlineClassification' AS statement_type,
          review_status,
          num_submitters,
          num_submissions,
          date_created,
          interp_date_last_evaluated,
          interp_description,
          interp_explanation,
          CAST(JSON_VALUE(REPLACE(content, '@MostRecentSubmission', 'MostRecentSubmission'), '$.MostRecentSubmission') AS DATE) AS most_recent_submission,
          interp_content as content,
          CAST(NULL as STRING) as clinical_impact_assertion_type,
          CAST(NULL as STRING) as clinical_impact_clinical_significance
        FROM `%s.backup_variation_archive`
      """, schema_name, schema_name);

      -- create or replace the variation_archive table from the backup.
      -- content becomes NULL when it holds only the "@MostRecentSubmission" key,
      -- otherwise that key/value pair is stripped from it.
      EXECUTE IMMEDIATE FORMAT("""
        CREATE OR REPLACE TABLE `%s.variation_archive`
        AS
        SELECT
          date_created,
          record_status,
          variation_id,
          release_date,
          IF(
            REGEXP_CONTAINS(content, r'^{\\s*"@MostRecentSubmission"\\s*\\:\\s*"[^"]+"\\s*}'),
            NULL,
            REGEXP_REPLACE(content, r'"@MostRecentSubmission"\\s*\\:\\s*"[^"]+"\\s*,*\\s*', "")
          ) as content,
          species,
          id,
          version,
          num_submitters,
          date_last_updated,
          num_submissions
        FROM `%s.backup_variation_archive`
      """, schema_name, schema_name);
    END IF;

  END IF;

END;

-- tested on older set and it added the col and updated the values to the default.
-- CALL `clinvar_ingest.normalize_dataset`('clinvar_2023_01_07_v1_6_57')
-- tested on newer set and no col was added or updated
-- CALL `clinvar_ingest.normalize_dataset`('clinvar_2024_11_26_v2_0_1_alpha')
Loading

0 comments on commit 21ee147

Please sign in to comment.