From f9ad2a9e1e5c1c3bd4da8b97a8b4e576b32bfcdf Mon Sep 17 00:00:00 2001
From: Lawrence Babb <lbabb@broadinstitute.org>
Date: Mon, 9 Dec 2024 09:10:16 -0500
Subject: [PATCH] refactor clinvar-ingest dataset prep procs with versioning

---
 .../00-setup-translation-tables.sql           | 174 +++++++++
 .../01-scv-summary-proc.sql                   | 162 --------
 .../01-validate-ds-terms-v1-proc.sql          |  51 +++
 .../02-normalize-ds-v2-proc.sql               | 192 +++++++++
 .../02-single-gene-variation-proc.sql         | 348 -----------------
 .../03-scv-summary-v1-proc.sql                | 158 ++++++++
 .../03-scv-summary-v2-proc.sql                | 134 +++++++
 .../04-single-gene-variation-v1-proc.sql      | 363 ++++++++++++++++++
 ...-gc-scv-proc.sql => 05-gc-scv-v1-proc.sql} |  16 +-
 .../dataset-preparation-v1-proc.sql           |  20 +
 .../dataset-preparation-v2-proc.sql           |  21 +
 .../general/schemas-on-or-after-v2-func.sql   |   2 +-
 .../general/tables-columns-exists-proc.sql    |  30 ++
 13 files changed, 1152 insertions(+), 519 deletions(-)
 create mode 100644 scripts/ds-staging-table-procs/00-setup-translation-tables.sql
 delete mode 100644 scripts/ds-staging-table-procs/01-scv-summary-proc.sql
 create mode 100644 scripts/ds-staging-table-procs/01-validate-ds-terms-v1-proc.sql
 create mode 100644 scripts/ds-staging-table-procs/02-normalize-ds-v2-proc.sql
 delete mode 100644 scripts/ds-staging-table-procs/02-single-gene-variation-proc.sql
 create mode 100644 scripts/ds-staging-table-procs/03-scv-summary-v1-proc.sql
 create mode 100644 scripts/ds-staging-table-procs/03-scv-summary-v2-proc.sql
 create mode 100644 scripts/ds-staging-table-procs/04-single-gene-variation-v1-proc.sql
 rename scripts/ds-staging-table-procs/{03-gc-scv-proc.sql => 05-gc-scv-v1-proc.sql} (85%)
 create mode 100644 scripts/ds-staging-table-procs/dataset-preparation-v1-proc.sql
 create mode 100644 scripts/ds-staging-table-procs/dataset-preparation-v2-proc.sql
 create mode 100644 scripts/general/tables-columns-exists-proc.sql

diff --git a/scripts/ds-staging-table-procs/00-setup-translation-tables.sql b/scripts/ds-staging-table-procs/00-setup-translation-tables.sql
new file mode 100644
index 0000000..4405c64
--- /dev/null
+++ b/scripts/ds-staging-table-procs/00-setup-translation-tables.sql
@@ -0,0 +1,174 @@
+-- CREATE OR REPLACE TABLE `clinvar_ingest.cvc_context_types` (code STRING, label STRING, display_order INT64);  
+
+-- INSERT INTO `clinvar_ingest.cvc_context_types` (code, label, display_order) 
+-- VALUES 
+--     ('gd','Germline Disease', 10),
+--     ('sc','Somatic Cancer',   20),
+--     ('pg','Pharmacogenomic',  30),
+--     ('ot','Other',            40);
+
+CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_clinsig_types` (
+    code STRING, label STRING, significance INT64, 
+    clinvar_prop_type STRING, clinvar_code_order INT64, 
+    clinvar_desc_order INT64, cvc_prop_type STRING, 
+    cvc_code_order INT64, cvc_desc_order INT64, 
+    direction STRING, strength_code STRING, strength_label STRING, 
+    classification_code STRING, penetrance_level STRING
+);  
+INSERT INTO `clinvar_ingest.clinvar_clinsig_types` (
+    code, 
+    label, 
+    significance, 
+    clinvar_prop_type, 
+    clinvar_code_order, 
+    clinvar_desc_order, 
+    cvc_prop_type, 
+    cvc_code_order, 
+    cvc_desc_order, 
+    direction, 
+    strength_code, 
+    strength_label, 
+    classification_code, 
+    penetrance_level
+) 
+VALUES 
+    -- Pathogenic statements
+    ('b',       'Benign',                            0, '?',    30,  30, '?',  30,  30,  'refutes',   'cg000101', 'definitive',    'cg000001', null),
+    ('lb',      'Likely benign',                     0, '?',    31,  31, '?',  31,  31,  'refutes',   'cg000102', 'likely',        'cg000002', null),
+    ('b/lb',    'Benign/Likely benign',              0, '?',    32,  32, '?',  32,  32,  'refutes',   'cg000102', 'likely',        'cg000003', null),
+    ('vus',     'Uncertain significance',            1, '?',    20,  20, '?',  20,  20,  'none',      'cg000103', 'inconclusive',  'cg000004', null),
+    ('ura',     'Uncertain risk allele',             1, '?',    21,  21, '?',  21,  21,  'none',      'cg000103', 'inconclusive',  'cg000005', 'risk allele'),
+    ('p',       'Pathogenic',                        2, '?',    10,  10, '?',  10,  10,  'supports',  'cg000101', 'definitive',    'cg000006', null),
+    ('lp',      'Likely pathogenic',                 2, '?',    11,  11, '?',  11,  11,  'supports',  'cg000102', 'likely',        'cg000007', null),
+    ('p/lp',    'Pathogenic/Likely pathogenic',      2, '?',    12,  12, '?',  12,  12,  'supports',  'cg000102', 'likely',        'cg000008', null),
+    ('p-lp',    'Pathogenic, low penetrance',        2, '?',    13,  13, '?',  13,  13,  'supports',  'cg000101', 'definitive',    'cg000009', 'low'),
+    ('lp-lp',   'Likely pathogenic, low penetrance', 2, '?',    14,  14, '?',  14,  14,  'supports',  'cg000102', 'likely',        'cg000010', 'low'),
+    ('era',     'Established risk allele',           2, '?',    15,  15, '?',  15,  15,  'supports',  'cg000101', 'definitive',    'cg000011', 'risk allele'),
+    ('lra',     'Likely risk allele',                2, '?',    16,  16, '?',  16,  16,  'supports',  'cg000102', 'likely',        'cg000012', 'risk allele'),
+    ('cdfs',    'conflicting data from submitters',  1, '?',    40,  40, '?',  40,  40,  'none',      'cg000103', 'inconclusive',  'cg000013', null),
+    -- ClinVarDrugResponse statements
+    ('dr',      'drug response',                     2, 'dr',      130, 130, 'dr',   130, 130, 'supports',  'cg000100', 'not specified', 'cg000014', null),
+    -- ClinVarNonAssertion statements
+    ('np',      'not provided',                      0, '?',      140, 140, 'oth',  140, 140, 'none',      'cg000100', 'not specified', 'cg000015', null),
+    -- ClinVarOther statements
+    ('rf',      'risk factor',                       2, 'rf',      170, 170, 'oth',  170, 170, 'none',      'cg000100', 'not specified', 'cg000016', null),
+    ('aff',     'Affects',                           2, 'aff',     100, 100, 'oth',  100, 100, 'none',      'cg000100', 'not specified', 'cg000017', null),
+    ('assoc',   'association',                       2, 'assoc',   110, 110, 'oth',  110, 110, 'none',      'cg000100', 'not specified', 'cg000018', null),
+    ('assocnf', 'association not found',             0, 'assoc',   111, 111, 'oth',  111, 111, 'none',      'cg000100', 'not specified', 'cg000019', null),
+    ('cs',      'confers sensitivity',               2, 'cs',      120, 120, 'oth',  120, 120, 'none',      'cg000100', 'not specified', 'cg000020', null),
+    ('oth',     'other',                             0, 'oth',     150, 150, 'oth',  150, 150, 'none',      'cg000100', 'not specified', 'cg000021', null),
+    ('protect', 'protective',                        0, 'protect', 160, 160, 'oth',  160, 160, 'none',      'cg000100', 'not specified', 'cg000022', null),
+    -- SomaticImpact
+    ('t1', 'Tier I - Strong',                        2, 'somatic', 10, 10, 'somatic', 10, 10,  'supports',  'cg000100', 'definitive',    'cg000023', null),
+    ('t2', 'Tier II - Potential',                    2, 'somatic', 11, 11, 'somatic', 11, 11,  'supports',  'cg000102', 'likely',        'cg000024', null),
+    ('t3', 'Tier III - Unknown',                     1, 'somatic', 20, 20, 'somatic', 20, 20,  'none',      'cg000103', 'inconclusive',  'cg000025', null),
+    ('t4', 'Tier IV - Benign/Likely benign',         0, 'somatic', 32, 32, 'somatic', 32, 32,  'refutes',   'cg000102', 'likely',        'cg000026', null),
+    -- Oncogenic
+    ('o',       'Oncogenic',                         2, '?',    10,  10, 'onco',   10, 10,  'supports',  'cg000101', 'definitive',    'cg000027', null),
+    ('lo',      'Likely oncogenic',                  2, '?',    11,  11, 'onco',   11, 11,  'supports',  'cg000102', 'likely',        'cg000028', null);
+
+
+CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_proposition_types` (code STRING, label STRING, display_order INT64);  
+INSERT INTO `clinvar_ingest.clinvar_proposition_types` (code, label, display_order) 
+VALUES 
+    ('path', 'Pathogenicity', 10),
+    ('dr',   'DrugResponse', 11),  
+    ('oth',  'Other', 12),
+    ('somatic', 'SomaticClinicalImpact', 20),
+    ('onco', 'Oncogenicity', 30);
+
+    -- ('rf',   'RiskFactor',30),
+    -- ('np',   'NotProvided',60),
+    -- put affects, associated, confresSensitivity and protect in other category
+    -- ('aff',  'Affects',20),
+    -- ('assoc','Associated',30),
+    -- ('cs',   'ConfersSensitivity',40),
+    -- ('protect','Protective',80),
+
+
+CREATE OR REPLACE TABLE `clinvar_ingest.scv_clinsig_map` (scv_term STRING, cv_clinsig_type STRING);  
+INSERT INTO `clinvar_ingest.scv_clinsig_map` (scv_term, cv_clinsig_type) VALUES 
+    ('affects', 'aff'),
+    ('associated with leiomyomas', 'np'),
+    ('association', 'assoc'),
+    ('association not found', 'assocnf'),
+    ('benign', 'b'),
+    ('benign/likely benign', 'b/lb'),
+    ('cancer', 'oth'),
+    ('confers sensitivity', 'cs'),
+    ('conflicting data from submitters', 'cdfs'),
+    ('drug response', 'dr'),
+    ('drug-response', 'dr'),
+    ('established risk allele', 'era'),
+    ('likely benign', 'lb'),
+    ('likely oncogenic', 'lo'),
+    ('likely pathogenic', 'lp'),
+    ('likely pathogenic - adrenal bilateral pheochromocy', 'lp'),
+    ('likely pathogenic - adrenal pheochromocytoma', 'lp'),
+    ('likely pathogenic, low penetrance', 'lp-lp'),
+    ('likely risk allele', 'lra'),
+    ('moderate', 'p'),
+    ('mut', 'p'),
+    ('mutation', 'p'),
+    ('no known pathogenicity', 'b'),
+    ('non-pathogenic', 'b'),
+    ('not provided', 'np'),
+    ('oncogenic', 'o'),
+    ('other', 'oth'),
+    ('pathogenic', 'p'),
+    ('pathogenic, low penetrance', 'p-lp'),
+    ('pathogenic variant for bardet-biedl syndrome', 'p'),
+    ('pathogenic/likely pathogenic', 'p/lp'),
+    ('pathologic', 'p'),
+    ('poly', 'b'),
+    ('probable-non-pathogenic', 'lb'), 
+    ('probable-pathogenic', 'lp'),
+    ('probably not pathogenic', 'lb'),
+    ('probably pathogenic', 'lp'),
+    ('protective', 'protect'),
+    ('risk factor', 'rf'),
+    ('suspected benign', 'lb'),
+    ('suspected pathogenic', 'lp'),
+    ('tier i - strong', 't1'),
+    ('tier ii - potential', 't2'),
+    ('tier iii - unknown', 't3'),
+    ('tier iv - benign/likely benign', 't4'),
+    ('uncertain', 'vus'),
+    ('uncertain risk allele', 'ura'),
+    ('uncertain significance', 'vus'),
+    ('unknown', 'vus'),
+    ('unknown significance', 'vus'),
+    ('untested', 'np'),
+    ('variant of unknown significance', 'vus'),
+    ('vsb', 'lb'),
+    ('vlb', 'lb'),
+    ('vous', 'vus'),
+    ('vus', 'vus'),
+    ('vlm', 'lp'),
+    ('uncertain significance: likely benign', 'vus'),
+    ('uncertain significance: likely pathogenic','vus'),
+    ('na','oth');
+
+CREATE OR REPLACE TABLE `clinvar_ingest.clinvar_status` (rank INT64, label STRING, scv BOOL);
+INSERT INTO `clinvar_ingest.clinvar_status` (rank, label, scv) 
+VALUES
+    (-3, 'no classifications from unflagged records', FALSE),
+    (-3, 'flagged submission', TRUE),
+    (-2, 'no interpretation for the single variant', FALSE),
+    (-2, 'no classification for the single variant', FALSE),
+    (-1, 'no assertion provided', TRUE),
+    (-1, 'no classification provided', TRUE),
+    (-1, 'not classified by submitter', FALSE),
+    (0,  'no assertion criteria provided', TRUE),
+    (1,  'criteria provided, single submitter', TRUE),
+    (1,  'classified by single submitter', FALSE),
+    (1,  'criteria provided, conflicting interpretations', FALSE),
+    (1,  'criteria provided, conflicting classifications', FALSE),
+    (2,  'criteria provided, multiple submitters, no conflicts', FALSE),
+    (2,  'criteria provided, multiple submitters', FALSE),
+    (2,  'classified by multiple submitters', FALSE),
+    (3,  'reviewed by expert panel', TRUE),
+    (4,  'practice guideline', TRUE),
+    (4,  'reviewed by professional society', FALSE) ;
+
+ 
\ No newline at end of file
diff --git a/scripts/ds-staging-table-procs/01-scv-summary-proc.sql b/scripts/ds-staging-table-procs/01-scv-summary-proc.sql
deleted file mode 100644
index 7daefe0..0000000
--- a/scripts/ds-staging-table-procs/01-scv-summary-proc.sql
+++ /dev/null
@@ -1,162 +0,0 @@
-CREATE OR REPLACE PROCEDURE `clinvar_ingest.scv_summary_proc`(start_with DATE)
-BEGIN
-
-  FOR rec IN (select s.schema_name, s.release_date, s.prev_release_date, s.next_release_date FROM clinvar_ingest.schemas_on_or_after(start_with) as s)
-  DO
-    EXECUTE IMMEDIATE FORMAT("""
-      CREATE OR REPLACE TABLE `%s.scv_summary` AS
-      WITH obs_sample AS (
-        SELECT
-          REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, 
-          `clinvar_ingest.parseSample`(obs.content) as s
-        FROM 
-          `%s.clinical_assertion_observation` obs
-      ),
-      obs_method AS (
-        SELECT
-          REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, 
-          m.description as method_desc,
-          m.method_type
-        FROM 
-          `%s.clinical_assertion_observation` obs,
-          UNNEST(`clinvar_ingest.parseMethods`(obs.content)) as m          
-        
-      ),
-      assertion_method AS (
-        SELECT 
-          ca.id, 
-          a.attribute.type,
-          a.attribute.value,
-          STRING_AGG(DISTINCT c.url,';') as url 
-        FROM `%s.clinical_assertion` ca,
-        UNNEST(`clinvar_ingest.parseAttributeSet`(ca.content)) as a,
-        UNNEST(a.citation) as c
-        WHERE 
-          a.attribute.type = "AssertionMethod" 
-          and 
-          c.url is not null
-        GROUP BY 
-          ca.id, 
-          a.attribute.type, 
-          a.attribute.value
-      ),
-      obs AS (
-        SELECT 
-          ca.id,
-          STRING_AGG(DISTINCT os.s.origin, ", " ORDER BY os.s.origin) as origin,
-          STRING_AGG(DISTINCT os.s.affected_status, ", " ORDER BY os.s.affected_status) as affected_status,
-          STRING_AGG(DISTINCT om.method_desc, ", " ORDER BY om.method_desc) as method_desc,
-          STRING_AGG(DISTINCT om.method_type, ", " ORDER BY om.method_type) as method_type
-        FROM 
-          `%s.clinical_assertion` ca
-        LEFT JOIN obs_sample os ON os.id = ca.id
-        LEFT JOIN obs_method om ON om.id = ca.id
-        GROUP BY
-          ca.id
-      ),
-      scv_classification_comment AS (
-        SELECT
-          id,
-          STRING_AGG(JSON_EXTRACT_SCALAR(c, r'$.text'), '\\n') as text
-        FROM
-          `%s.clinical_assertion` ca
-        LEFT JOIN UNNEST(ca.interpretation_comments) as c
-        WHERE ARRAY_LENGTH(ca.interpretation_comments) > 0
-        GROUP BY
-          id
-      ),
-      scv AS (
-        SELECT 
-          ca.release_date as release_date,
-          ca.id, 
-          ca.version, 
-          ca.variation_id,
-          ca.local_key,
-          ca.interpretation_date_last_evaluated as last_evaluated, 
-          cvs.rank,
-          ca.review_status, 
-          cst.clinvar_prop_type as clinvar_stmt_type,
-          cst.cvc_prop_type as cvc_stmt_type,
-          ca.interpretation_description as submitted_classification,
-          IFNULL(map.cv_clinsig_type, '-') as classif_type,
-          cst.significance,
-          ca.submitter_id,
-          ca.submission_id,
-          ca.clinical_assertion_observation_ids,
-          `clinvar_ingest.parseComments`(ca.content) as comments,
-          scc.text as classification_comment,
-          ca.date_created,
-          ca.date_last_updated
-        FROM
-          `%s.clinical_assertion` ca
-        LEFT JOIN scv_classification_comment scc 
-        ON
-          scc.id = ca.id
-        LEFT JOIN `clinvar_ingest.scv_clinsig_map` map
-        ON 
-          map.scv_term = lower(IF(ca.id IS NULL, NULL, IFNULL(ca.interpretation_description,'not provided')))
-        LEFT JOIN `clinvar_ingest.clinvar_clinsig_types` cst 
-        ON 
-          cst.code = map.cv_clinsig_type
-        LEFT JOIN `clinvar_ingest.clinvar_status` cvs
-        ON
-          cvs.label = ca.review_status
-      )
-      SELECT
-        scv.release_date,
-        scv.id, 
-        scv.version, 
-        scv.variation_id,
-        scv.local_key,
-        scv.last_evaluated, 
-        scv.rank,
-        scv.review_status, 
-        scv.clinvar_stmt_type,
-        scv.cvc_stmt_type,
-        scv.submitted_classification,
-        scv.classif_type,
-        scv.significance,
-        scv.comments,
-        scv.classification_comment,
-        scv.date_created,
-        scv.date_last_updated,
-        scv.clinical_assertion_observation_ids,
-        scv.submitter_id,
-        subm.submission_date, 
-        obs.origin,
-        obs.affected_status,
-        obs.method_desc,
-        obs.method_type,
-        am.value as assertion_method,
-        am.url as assertion_method_url
-      FROM scv
-      LEFT JOIN obs
-      ON
-        obs.id = scv.id
-      LEFT JOIN assertion_method am
-      ON
-        am.id = scv.id
-      LEFT JOIN `%s.submission` subm 
-      ON 
-        subm.id = scv.submission_id
-    """, rec.schema_name, rec.schema_name, rec.schema_name, rec.schema_name, rec.schema_name, rec.schema_name, rec.schema_name, rec.schema_name);
-
-  END FOR;
-
-END;
-
-
--- repair any null review_status values
---  set any NULL review_status either 'no assertion provided' (if interp_desc is null or not provided) or 'no assertion criteria provided' (otherwise)
---
--- UPDATE `%s.clinical_assertion` ca
--- SET 
---   ca.review_status = 
---     IF( 
---       ca.interpretation_description is null OR 
---       ca.interpretation_description = 'not provided', 
---       'no assertion provided', 
---       'no assertion criteria provided'
---     )
--- WHERE ca.review_status IS NULL
--- ;
\ No newline at end of file
diff --git a/scripts/ds-staging-table-procs/01-validate-ds-terms-v1-proc.sql b/scripts/ds-staging-table-procs/01-validate-ds-terms-v1-proc.sql
new file mode 100644
index 0000000..36d3c72
--- /dev/null
+++ b/scripts/ds-staging-table-procs/01-validate-ds-terms-v1-proc.sql
@@ -0,0 +1,51 @@
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.validate_dataset_terms_v1`(
+  schema_name STRING
+)
+BEGIN
+  -- Declare variables to hold results and error messages
+  DECLARE scv_classification_terms ARRAY<STRING>;
+  DECLARE scv_review_status_terms ARRAY<STRING>;
+  DECLARE combined_issues STRING;
+
+  -- Check for new interpretation_descriptions in clinical_assertion
+  EXECUTE IMMEDIATE FORMAT("""
+      SELECT ARRAY_AGG(DISTINCT ca.interpretation_description)
+      FROM `%s.clinical_assertion` ca
+      LEFT JOIN `clinvar_ingest.scv_clinsig_map` map
+      ON map.scv_term = LOWER(IFNULL(ca.interpretation_description,'not provided'))
+      WHERE map.scv_term IS NULL
+  """, schema_name) INTO scv_classification_terms;
+
+  -- Check for new review_status terms in clinical_assertion
+  EXECUTE IMMEDIATE FORMAT("""
+      SELECT ARRAY_AGG(DISTINCT ca.review_status)
+      FROM `%s.clinical_assertion` ca
+      LEFT JOIN `clinvar_ingest.clinvar_status` cs
+      ON cs.label = LOWER(ca.review_status)
+      WHERE cs.label IS NULL
+  """, schema_name) INTO scv_review_status_terms;
+
+  -- Construct a combined error message if there are issues
+  SET combined_issues = '';
+
+  IF scv_classification_terms IS NOT NULL AND ARRAY_LENGTH(scv_classification_terms) > 0 THEN
+    SET combined_issues = FORMAT("""
+      New SCV classification terms found: [%s].
+      NOTE: Add scv_clinsig_map records to the '00-setup-translation-tables.sql' script and update, then rerun this script.
+    """, ARRAY_TO_STRING(scv_classification_terms, ', '));
+  END IF;
+
+  IF scv_review_status_terms IS NOT NULL AND ARRAY_LENGTH(scv_review_status_terms) > 0 THEN
+    SET combined_issues = FORMAT("""
+      %s
+      New SCV review status terms found: [%s].
+      NOTE: Add clinvar_status records to the '00-setup-translation-tables.sql' script and update, then rerun this script.
+    """, combined_issues, ARRAY_TO_STRING(scv_review_status_terms, ', '));
+  END IF;
+
+  -- Raise a single exception if there are any issues
+  IF combined_issues != '' THEN
+    RAISE USING message = combined_issues;
+  END IF;
+
+END;
diff --git a/scripts/ds-staging-table-procs/02-normalize-ds-v2-proc.sql b/scripts/ds-staging-table-procs/02-normalize-ds-v2-proc.sql
new file mode 100644
index 0000000..6da57dd
--- /dev/null
+++ b/scripts/ds-staging-table-procs/02-normalize-ds-v2-proc.sql
@@ -0,0 +1,192 @@
+
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.normalize_dataset_v2`(
+  schema_name STRING   -- Name of schema/dataset
+)
+BEGIN
+  DECLARE column_exists BOOL;
+  DECLARE table_exists BOOL;
+
+  -- TABLE 1. Clinical Assertion
+  -- check for clinical_assertion.statement_type column as THE indicator that the dataset has been normalized to v2
+  CALL `clinvar_ingest.check_column_exists`(schema_name, 'clinical_assertion', 'statement_type', column_exists);
+
+  -- if the column does not exist, add it with the default value
+  IF NOT column_exists THEN
+    -- backup the original clinical_assertion table
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE TABLE `%s.backup_clinical_assertion` AS 
+      SELECT * FROM `%s.clinical_assertion`
+    """, schema_name, schema_name);
+
+    
+    -- create or replace the clinical_assertion table from the backup
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE OR REPLACE TABLE `%s.clinical_assertion` AS
+      SELECT 
+        *,
+        'GermlineClassification' as statement_type,
+        CAST(NULL as STRING) as clinical_impact_assertion_type,
+        CAST(NULL as STRING) as clinical_impact_clinical_significance
+      FROM `%s.backup_clinical_assertion`
+    """, schema_name, schema_name);  
+  END IF;
+
+  -- TABLE 2. RCV Accession & RCV Accession Classification (with corrections for v2 rcv_accession_classification.agg_classification column)
+  -- check that the rcv_accession_classification table exists as THE indicator that the dataset has been normalized to v2
+  CALL `clinvar_ingest.check_table_exists`(schema_name, 'rcv_accession_classification', table_exists);
+
+  -- if the table does not exist, create it
+  IF NOT table_exists THEN
+    -- backup the original rcv_accession table
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE TABLE `%s.backup_rcv_accession` AS 
+      SELECT * FROM `%s.rcv_accession`
+    """, schema_name, schema_name);
+
+    -- create the rcv_accession_classification table from the backup
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE TABLE %s.rcv_accession_classification AS
+      SELECT
+        release_date,
+        id as rcv_id,
+        'GermlineClassification' AS statement_type,
+        review_status,
+        [
+          STRUCT(
+            submission_count as num_submissions,
+            date_last_evaluated,
+            interpretation as interp_description,
+            CAST(NULL as STRING) as clinical_impact_assertion_type,
+            CAST(NULL as STRING) as clinical_impact_clinical_significance
+          )
+        ] as agg_classification        
+      FROM `%s.backup_rcv_accession`
+    """, schema_name, schema_name);
+
+    -- create or replace the rcv_accession_classification table from the backup
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE OR REPLACE TABLE `%s.rcv_accession` AS
+      SELECT 
+        release_date,
+        id,
+        variation_id,
+        independent_observations,
+        variation_archive_id,
+        version,
+        title,
+        trait_set_id,
+        content
+      FROM `%s.backup_rcv_accession`
+    """, schema_name, schema_name);
+
+  ELSE
+    -- if the table exists, check if the agg_classification column exists
+    CALL `clinvar_ingest.check_column_exists`(schema_name, 'rcv_accession_classification', 'agg_classification', column_exists);
+
+    -- if the column does not exist, convert the v2 table to the final format
+    IF NOT column_exists THEN
+      -- backup the original rcv_accession_classification table
+      EXECUTE IMMEDIATE FORMAT("""
+        CREATE TABLE `%s.backup_rcv_accession_classification` AS 
+        SELECT * FROM `%s.rcv_accession_classification`
+      """, schema_name, schema_name);
+
+      -- create or replace the rcv_accession_classification table from the backup
+      EXECUTE IMMEDIATE FORMAT("""
+        CREATE OR REPLACE TABLE `%s.rcv_accession_classification` AS
+        SELECT
+          release_date,
+          rcv_id,
+          statement_type,
+          review_status,
+          `clinvar_ingest.parseAggDescription`(content).description as agg_classification,
+        IF(
+          REGEXP_CONTAINS(content, r'^{\\s*"Description"\\s*\\:\\s*"[^"]+"\\s*}'),
+            NULL, 
+            REGEXP_REPLACE(content, r'"Description"\\s*\\:\\s*"[^"]+"\\s*,*\\s*', "")
+        ) as content
+        FROM `%s.rcv_accession_classification`
+        WHERE content is not null
+        UNION ALL
+        SELECT 
+        release_date,
+        rcv_id,
+        statement_type,
+        review_status,
+        [
+          STRUCT(
+            clinical_impact_assertion_type,
+            clinical_impact_clinical_significance,
+            date_last_evaluated,
+            num_submissions,
+            interp_description
+          )
+        ] as agg_classification,
+        content
+        FROM `%s.rcv_accession_classification`
+        WHERE content is null
+      """, schema_name, schema_name, schema_name);
+    END IF;    
+  END IF;
+
+  -- TABLE 3. Variation Archive & Variation Archive Classification
+  -- check that the variation_archive_classification table exists as THE indicator that the dataset has been normalized to v2
+  CALL `clinvar_ingest.check_table_exists`(schema_name, 'variation_archive_classification', table_exists);
+
+  -- if the table does not exist, create it
+  IF NOT table_exists THEN
+    -- backup the original variation_archive table
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE TABLE `%s.backup_variation_archive` AS 
+      SELECT * FROM `%s.variation_archive`
+    """, schema_name, schema_name);
+
+    -- create the variation_archive_classification table from the backup
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE TABLE %s.variation_archive_classification AS
+      SELECT
+        id as vcv_id,
+        'GermlineClassification' AS statement_type,
+        review_status,
+        num_submitters,
+        num_submissions,
+        date_created,
+        interp_date_last_evaluated,
+        interp_description,
+        interp_explanation,
+        CAST(JSON_VALUE(REPLACE(content, '@MostRecentSubmission', 'MostRecentSubmission'), '$.MostRecentSubmission') AS DATE) AS most_recent_submission,
+        interp_content as content,
+        CAST(NULL as STRING) as clinical_impact_assertion_type,
+        CAST(NULL as STRING) as clinical_impact_clinical_significance
+      FROM `%s.backup_variation_archive`
+    """, schema_name, schema_name);
+
+    -- create or replace the variation_archive table from the backup
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE OR REPLACE TABLE `%s.variation_archive` AS
+      SELECT 
+        date_created,
+        record_status,
+        variation_id,
+        release_date,
+        IF(
+          REGEXP_CONTAINS(content, r'^{\\s*"@MostRecentSubmission"\\s*\\:\\s*"[^"]+"\\s*}'),
+            NULL, 
+            REGEXP_REPLACE(content, r'"@MostRecentSubmission"\\s*\\:\\s*"[^"]+"\\s*,*\\s*', "")
+        ) as content,
+        species,
+        id,
+        version,
+        num_submitters,
+        date_last_updated,
+        num_submissions
+      FROM `%s.backup_variation_archive`
+    """, schema_name, schema_name);
+  END IF;
+
+END;
+
+-- tested on older set and it added the col and updated the values to the default.
+-- CALL `clinvar_ingest.normalize_dataset`('clinvar_2023_01_07_v1_6_57')
+-- tested on newer set and no col was added or updated
+-- CALL `clinvar_ingest.normalize_dataset`('clinvar_2024_11_26_v2_0_1_alpha')
\ No newline at end of file
diff --git a/scripts/ds-staging-table-procs/02-single-gene-variation-proc.sql b/scripts/ds-staging-table-procs/02-single-gene-variation-proc.sql
deleted file mode 100644
index 68b82a4..0000000
--- a/scripts/ds-staging-table-procs/02-single-gene-variation-proc.sql
+++ /dev/null
@@ -1,348 +0,0 @@
--- see below for helper / setup scripts
-CREATE OR REPLACE PROCEDURE `clinvar_ingest.single_gene_variation_proc`(start_with DATE)
-BEGIN
-
-  FOR rec IN (select s.schema_name, s.release_date, s.prev_release_date, s.next_release_date FROM clinvar_ingest.schemas_on_or_after(start_with) as s)
-  DO
-
-    -- single gene variation  plan
-    --- step 1. create a table with columns variation_id, gene_id, somatic_flag 
-    ---         where the variation_id is the pk. 
-    EXECUTE IMMEDIATE FORMAT("""
-      CREATE OR REPLACE TABLE `%s.single_gene_variation`
-          ( release_date DATE NOT NULL,
-            variation_id STRING NOT NULL,
-            gene_id STRING NOT NULL,
-            relationship_type STRING,
-            source STRING,
-            mane_select BOOL DEFAULT FALSE,
-            somatic BOOL DEFAULT FALSE)
-    """, rec.schema_name);
-
-    -- create a temp table that is the list of remaining variations so as to reduce the query cost of analyzing against the variation table.
-    EXECUTE IMMEDIATE FORMAT("""
-      CREATE TEMP TABLE _SESSION.temp_variation 
-      AS 
-      SELECT 
-        v.id, 
-        v.name,
-        v.descendant_ids,
-        v.subclass_type
-      FROM `%s.variation` v
-    """, rec.schema_name);
-
-
-      --- step 2. (Resolvable Gene Symbol in variation name takes precedence as "single gene for variant")
-      ---         initialize the set with the extracted variation name gene symbols and the associated relationship info if available
-
-    -- prioritize mane select transcripts in title of variant
-    EXECUTE IMMEDIATE FORMAT("""
-      INSERT INTO `%s.single_gene_variation` 
-        (release_date, variation_id, gene_id, relationship_type, source, mane_select )
-      SELECT DISTINCT
-        %T as release_date,
-        v.id as variation_id,
-        mane.gene_id,
-        IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, 
-        IFNULL(ga.source,'cvc calculated') as source, 
-        TRUE as mane_select
-      FROM `clinvar_ingest.mane_select_gene_transcript` mane
-      JOIN _SESSION.temp_variation v on STARTS_WITH(v.name, mane.transcript_id)
-      JOIN `clinvar_ingest.entrez_gene` g on g.gene_id = mane.gene_id
-      LEFT JOIN `%s.gene_association` ga on ga.gene_id = g.gene_id and ga.variation_id = v.id
-    """, rec.schema_name, rec.release_date, rec.schema_name);
-
-      -- create a temp table that is the list of gene associations 
-      --   for variations not yet in the single gene var table and 
-      --   don't have the gene symbol ending in an -AS# suffix
-    EXECUTE IMMEDIATE FORMAT("""
-      CREATE TEMP TABLE _SESSION.temp_gene_assoc 
-      AS 
-      SELECT DISTINCT
-        ga.variation_id, 
-        ga.gene_id, 
-        ga.relationship_type, 
-        ga.source
-      FROM `%s.gene_association` ga
-      JOIN `clinvar_ingest.entrez_gene` g on ga.gene_id = g.gene_id
-      WHERE 
-        NOT REGEXP_CONTAINS(g.symbol_from_authority, r'\\-AS\\d$') AND
-        NOT EXISTS (
-          SELECT sgv.variation_id
-          FROM `%s.single_gene_variation` sgv
-          WHERE sgv.variation_id = ga.variation_id 
-        )
-    """, rec.schema_name, rec.schema_name);
-
-      -- NOTE: clinvar has a handful of duplicate gene records that can change over time
-      --      the plan will be to test the results of loading the variation_single_gene 
-      --      table to verify that any variants with multiple genes are simply duplicates and 
-      --      either one can be removed without an issue.
-
-    -- clinvar perferred label hgvs-style format NM_0000.0(GENE):c.234... (not mane select but still in name)
-    EXECUTE IMMEDIATE FORMAT("""
-      INSERT INTO `%s.single_gene_variation` 
-        (release_date, variation_id, gene_id, relationship_type, source)
-      WITH x AS
-      (
-        SELECT 
-          v.id, 
-          v.name, 
-          REGEXP_EXTRACT(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') as symbol
-        FROM _SESSION.temp_variation v
-        WHERE REGEXP_CONTAINS(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') AND 
-          NOT EXISTS (
-            SELECT sgv.variation_id
-            FROM `%s.single_gene_variation` sgv
-            WHERE sgv.variation_id = v.id 
-          )
-      )
-      SELECT 
-        %T as release_date,
-        x.id as variation_id,
-        g.gene_id, 
-        IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, 
-        IFNULL(ga.source,'cvc calculated') as source
-      FROM x 
-      JOIN `clinvar_ingest.entrez_gene` g on UPPER(g.symbol_from_authority) = UPPER(x.symbol) AND NOT REGEXP_CONTAINS(x.symbol, r'\\-AS\\d$') 
-      LEFT JOIN _SESSION.temp_gene_assoc ga on ga.variation_id = x.id and ga.gene_id = g.gene_id
-    """, rec.schema_name, rec.schema_name, rec.release_date); 
-
-
-      -- star allele format, CYP2C19*10. 
-    EXECUTE IMMEDIATE FORMAT("""
-      INSERT INTO `%s.single_gene_variation` 
-        (release_date, variation_id, gene_id, relationship_type, source)
-      WITH x AS
-      (
-        SELECT 
-          v.id, 
-          v.name, 
-          REGEXP_EXTRACT(v.name,  r'^([A-Za-z0-9\\-]+)[\\*\\,]') as symbol
-        FROM _SESSION.temp_variation v
-        WHERE REGEXP_CONTAINS(v.name,  r'^([A-Za-z0-9\\-]+)[\\*\\,]') AND
-          NOT EXISTS (
-            SELECT sgv.variation_id
-            FROM `%s.single_gene_variation` sgv
-            WHERE sgv.variation_id = v.id 
-          )
-      )
-      SELECT 
-        %T as release_date,
-        x.id as variation_id, 
-        g.gene_id,
-        IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, 
-        IFNULL(ga.source,'cvc calculated') as source
-      FROM x 
-      JOIN `clinvar_ingest.entrez_gene` g ON UPPER(x.symbol) = UPPER(g.symbol_from_authority)
-      LEFT JOIN _SESSION.temp_gene_assoc ga ON x.id = ga.variation_id AND g.gene_id = ga.gene_id
-    """, rec.schema_name, rec.schema_name, rec.release_date);
-
-      --- step 3. for any variations remaining... load all variations with any 
-      ---         genes that are mapped one-to-one from the gene association table
-    EXECUTE IMMEDIATE FORMAT("""
-      INSERT INTO `%s.single_gene_variation` 
-        (release_date, variation_id, gene_id, relationship_type, source)
-      WITH x AS
-      (
-        SELECT 
-          v.id
-        FROM _SESSION.temp_variation v
-        WHERE 
-          NOT EXISTS (
-            SELECT sgv.variation_id
-            FROM `%s.single_gene_variation` sgv
-            WHERE sgv.variation_id = v.id 
-          )
-      )
-      SELECT 
-        %T as release_date,
-        ga.variation_id, 
-        STRING_AGG(ga.gene_id), 
-        STRING_AGG(ga.relationship_type), 
-        STRING_AGG(ga.source)
-      FROM x
-      JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id
-      group by ga.variation_id
-      having count(distinct ga.gene_id) = 1
-    """, rec.schema_name, rec.schema_name, rec.release_date);
-
-      --- step 4. for any variations remaining... load any variant with one submitted gene that 
-      ---          is not either "genes overlapped by variant" or "asserted, but not computed"
-    EXECUTE IMMEDIATE FORMAT("""
-      INSERT INTO `%s.single_gene_variation` 
-        (release_date, variation_id, gene_id, relationship_type, source )
-      WITH x AS
-      (
-        SELECT 
-          v.id
-        FROM _SESSION.temp_variation v
-        WHERE 
-          NOT EXISTS (
-            SELECT sgv.variation_id
-            FROM `%s.single_gene_variation` sgv
-            WHERE sgv.variation_id = v.id 
-          )
-      )
-      SELECT 
-        %T as release_date,
-        ga.variation_id, 
-        STRING_AGG(ga.gene_id), 
-        STRING_AGG(ga.relationship_type), 
-        STRING_AGG(ga.source)
-      FROM x
-      JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id
-      WHERE 
-        ga.relationship_type not in ('genes overlapped by variant' , 'asserted, but not computed') AND
-        ga.source = 'submitted'
-      GROUP BY ga.variation_id
-      HAVING count(distinct ga.gene_id) = 1
-    """, rec.schema_name, rec.schema_name, rec.release_date);
-
-      --- step 5. for any variations remaining... load any variations with a "within single gene" 
-      ---         as long as it is associated to only one gene for that variant
-    EXECUTE IMMEDIATE FORMAT("""
-      INSERT INTO `%s.single_gene_variation` 
-        (release_date, variation_id, gene_id, relationship_type, source)
-      WITH x AS
-      (
-        SELECT 
-          v.id
-        FROM _SESSION.temp_variation v
-        WHERE 
-          NOT EXISTS (
-            SELECT sgv.variation_id
-            FROM `%s.single_gene_variation` sgv
-            WHERE sgv.variation_id = v.id 
-          )
-      )
-      SELECT 
-        %T as release_date,
-        ga.variation_id, 
-        STRING_AGG(ga.gene_id), 
-        STRING_AGG(ga.relationship_type), 
-        STRING_AGG(ga.source)
-      FROM x
-      JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id
-      WHERE (ga.relationship_type = 'within single gene')
-      GROUP BY ga.variation_id
-      having count(ga.gene_id) = 1
-    """, rec.schema_name, rec.schema_name, rec.release_date);
-
-      --- last step. for any variations remaining... load any haplotype or genotype variations only if all the children have the same gene_id
-    EXECUTE IMMEDIATE FORMAT("""
-      INSERT INTO `%s.single_gene_variation` 
-        (release_date, variation_id, gene_id, relationship_type, source )
-      WITH x AS
-      (
-        SELECT 
-          v.id
-        FROM _SESSION.temp_variation v
-        WHERE 
-          NOT EXISTS (
-            SELECT sgv.variation_id
-            FROM `%s.single_gene_variation` sgv
-            WHERE sgv.variation_id = v.id 
-          )
-      )
-      SELECT 
-        %T as release_date, 
-        v.id as variation_id, 
-        STRING_AGG(ga.gene_id) as gene_id, 
-        'association not provided by clinvar' as relationship_type, 
-        'cvc calculated' as source
-      FROM x 
-      JOIN _SESSION.temp_variation v ON x.id = v.id
-      CROSS JOIN UNNEST(v.descendant_ids) AS descendant_id
-      JOIN _SESSION.temp_variation d ON 
-          d.id = descendant_id AND 
-          d.subclass_type='SimpleAllele'
-      LEFT JOIN `%s.gene_association` ga on ga.variation_id = d.id
-      WHERE ARRAY_LENGTH(v.descendant_ids) > 0 
-      GROUP BY v.id
-      HAVING COUNT(ga.gene_id) = 1
-    """, rec.schema_name, rec.schema_name, rec.release_date, rec.schema_name);
-
-      --- Finally, update somatic flags based on current onco-gene list (should this be a 
-      --- 'change over time' capture of the onco-gene list's state?)
-    EXECUTE IMMEDIATE FORMAT("""
-      UPDATE `%s.single_gene_variation` sgv
-      SET sgv.somatic = TRUE
-      WHERE EXISTS (
-        SELECT cg.hgnc_id from `clinvar_ingest.cancer_genes` cg
-        join `clinvar_ingest.entrez_gene` g on g.hgnc_id = cg.hgnc_id
-        WHERE g.gene_id = sgv.gene_id
-      )
-    """, rec.schema_name);
-
-    DROP TABLE _SESSION.temp_variation;
-    DROP TABLE _SESSION.temp_gene_assoc;
-
-  END FOR;
-
-END;
-
-
-
--- -- PRE 2019-07-01 data setup one-off
--- -- remove duplicates from contrived gene_association table in pre-2019-07-01 data
--- CREATE OR REPLACE TABLE `clinvar_2019_06_01_v0.gene_association`
--- AS
--- SELECT x.release_date, x.gene_id, x.variation_id, x.relationship_type, x.source
--- FROM (
---   SELECT
---       release_date, gene_id, variation_id, relationship_type, source,
---       ROW_NUMBER()
---           OVER (PARTITION BY release_date, gene_id, variation_id)
---           row_number
---   FROM `clinvar_2019_06_01_v0.gene_association`
--- ) x
--- WHERE x.row_number = 1
--- ;
-
--- -- contrive a single_gene_variation table for the pre-2019-07-01 data
--- CREATE OR REPLACE TABLE `clinvar_2019_06_01_v0.single_gene_variation`
--- AS
--- WITH x AS
--- (
---   select release_date, variation_id
---   from  `clinvar_2019_06_01_v0.gene_association`
---   group by release_date, variation_id
---   having count(*) = 1
--- )
--- SELECT 
---   x.release_date,
---   ga.gene_id,
---   x.variation_id, 
---   ga.relationship_type,
---   ga.source,
---   (cg.hgnc_id IS NOT NULL) as somatic,
---   FALSE as mane_select
--- FROM x
--- join `clinvar_2019_06_01_v0.gene_association` ga on x.release_date = ga.release_date and x.variation_id = ga.variation_id
--- left join `clinvar_2019_06_01_v0.gene` g on g.id = ga.gene_id and x.release_date = g.release_date
--- left join `clinvar_ingest.cancer_genes` cg on g.hgnc_id = cg.hgnc_id
--- ;
-
-
-  -- -- validate between steps?...
-  -- select vsg.* 
-  -- from `clinvar_2022_05_17_v1_6_46.single_gene_variation` vsg
-  -- join ( select vsg2.variation_id 
-  --        from `clinvar_2022_05_17_v1_6_46.single_gene_variation` vsg2 
-  --        group by vsg2.variation_id having count(distinct vsg2.gene_id) > 1) vsg2 on  vsg.variation_id = vsg2.variation_id
-  -- order by 1,3;
-
-
-    -- -- Helper: To Find duplicate gene ids in release
-    -- select count(distinct ga.variation_id), ga.gene_id, g.symbol, g.hgnc_id
-    -- from `clinvar_2022_05_17_v1_6_46.gene_association` ga, 
-    --      `clinvar_2022_05_17_v1_6_46.gene` g,
-    --      (SELECT ARRAY_AGG(id) as gene_ids 
-    --        from `clinvar_2022_05_17_v1_6_46.gene` g 
-    --         group by hgnc_id, symbol, release_date, full_name 
-    --         having count(distinct id)>1 ) as gx,
-    --      UNNEST(gx.gene_ids) as gid
-    -- WHERE gid = ga.gene_id and g.id = ga.gene_id
-    -- group by ga.gene_id, g.symbol, g.hgnc_id
-    -- order by 3,1;
\ No newline at end of file
diff --git a/scripts/ds-staging-table-procs/03-scv-summary-v1-proc.sql b/scripts/ds-staging-table-procs/03-scv-summary-v1-proc.sql
new file mode 100644
index 0000000..33c290c
--- /dev/null
+++ b/scripts/ds-staging-table-procs/03-scv-summary-v1-proc.sql
@@ -0,0 +1,158 @@
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.scv_summary_v1_proc`(
+  schema_name STRING
+)
+BEGIN
+  EXECUTE IMMEDIATE FORMAT("""
+    CREATE OR REPLACE TABLE `%s.scv_summary` AS
+    WITH obs_sample AS (
+      SELECT
+        REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, 
+        `clinvar_ingest.parseSample`(obs.content) as s
+      FROM 
+        `%s.clinical_assertion_observation` obs
+    ),
+    obs_method AS (
+      SELECT
+        REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, 
+        m.description as method_desc,
+        m.method_type
+      FROM 
+        `%s.clinical_assertion_observation` obs,
+        UNNEST(`clinvar_ingest.parseMethods`(obs.content)) as m          
+      
+    ),
+    assertion_method AS (
+      SELECT 
+        ca.id, 
+        a.attribute.type,
+        a.attribute.value,
+        STRING_AGG(DISTINCT c.url,';') as url 
+      FROM `%s.clinical_assertion` ca,
+      UNNEST(`clinvar_ingest.parseAttributeSet`(ca.content)) as a,
+      UNNEST(a.citation) as c
+      WHERE 
+        a.attribute.type = "AssertionMethod" 
+        and 
+        c.url is not null
+      GROUP BY 
+        ca.id, 
+        a.attribute.type, 
+        a.attribute.value
+    ),
+    obs AS (
+      SELECT 
+        ca.id,
+        STRING_AGG(DISTINCT os.s.origin, ", " ORDER BY os.s.origin) as origin,
+        STRING_AGG(DISTINCT os.s.affected_status, ", " ORDER BY os.s.affected_status) as affected_status,
+        STRING_AGG(DISTINCT om.method_desc, ", " ORDER BY om.method_desc) as method_desc,
+        STRING_AGG(DISTINCT om.method_type, ", " ORDER BY om.method_type) as method_type
+      FROM 
+        `%s.clinical_assertion` ca
+      LEFT JOIN obs_sample os ON os.id = ca.id
+      LEFT JOIN obs_method om ON om.id = ca.id
+      GROUP BY
+        ca.id
+    ),
+    scv_classification_comment AS (
+      SELECT
+        id,
+        STRING_AGG(JSON_EXTRACT_SCALAR(c, r'$.text'), '\\n') as text
+      FROM
+        `%s.clinical_assertion` ca
+      LEFT JOIN UNNEST(ca.interpretation_comments) as c
+      WHERE ARRAY_LENGTH(ca.interpretation_comments) > 0
+      GROUP BY
+        id
+    ),
+    scv AS (
+      SELECT 
+        ca.release_date as release_date,
+        ca.id, 
+        ca.version, 
+        ca.variation_id,
+        ca.local_key,
+        ca.interpretation_date_last_evaluated as last_evaluated, 
+        cvs.rank,
+        ca.review_status, 
+        cst.clinvar_prop_type as clinvar_stmt_type,
+        cst.cvc_prop_type as cvc_stmt_type,
+        ca.interpretation_description as submitted_classification,
+        IFNULL(map.cv_clinsig_type, '-') as classif_type,
+        cst.significance,
+        ca.submitter_id,
+        ca.submission_id,
+        ca.clinical_assertion_observation_ids,
+        `clinvar_ingest.parseComments`(ca.content) as comments,
+        scc.text as classification_comment,
+        ca.date_created,
+        ca.date_last_updated
+      FROM
+        `%s.clinical_assertion` ca
+      LEFT JOIN scv_classification_comment scc 
+      ON
+        scc.id = ca.id
+      LEFT JOIN `clinvar_ingest.scv_clinsig_map` map
+      ON 
+        map.scv_term = lower(IF(ca.id IS NULL, NULL, IFNULL(ca.interpretation_description,'not provided')))
+      LEFT JOIN `clinvar_ingest.clinvar_clinsig_types` cst 
+      ON 
+        cst.code = map.cv_clinsig_type
+      LEFT JOIN `clinvar_ingest.clinvar_status` cvs
+      ON
+        cvs.label = ca.review_status
+    )
+    SELECT
+      scv.release_date,
+      scv.id, 
+      scv.version, 
+      scv.variation_id,
+      scv.local_key,
+      scv.last_evaluated, 
+      scv.rank,
+      scv.review_status, 
+      scv.clinvar_stmt_type,
+      scv.cvc_stmt_type,
+      scv.submitted_classification,
+      scv.classif_type,
+      scv.significance,
+      scv.comments,
+      scv.classification_comment,
+      scv.date_created,
+      scv.date_last_updated,
+      scv.clinical_assertion_observation_ids,
+      scv.submitter_id,
+      subm.submission_date, 
+      obs.origin,
+      obs.affected_status,
+      obs.method_desc,
+      obs.method_type,
+      am.value as assertion_method,
+      am.url as assertion_method_url
+    FROM scv
+    LEFT JOIN obs
+    ON
+      obs.id = scv.id
+    LEFT JOIN assertion_method am
+    ON
+      am.id = scv.id
+    LEFT JOIN `%s.submission` subm 
+    ON 
+      subm.id = scv.submission_id
+  """, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name);
+END;
+
+
+-- repair any null review_status values
+--  set any NULL review_status either 'no assertion provided' (if interp_desc is null or not provided) or 'no assertion criteria provided' (otherwise)
+--
+-- UPDATE `%s.clinical_assertion` ca
+-- SET 
+--   ca.review_status = 
+--     IF( 
+--       ca.interpretation_description is null OR 
+--       ca.interpretation_description = 'not provided', 
+--       'no assertion provided', 
+--       'no assertion criteria provided'
+--     )
+-- WHERE ca.review_status IS NULL
+-- ;
\ No newline at end of file
diff --git a/scripts/ds-staging-table-procs/03-scv-summary-v2-proc.sql b/scripts/ds-staging-table-procs/03-scv-summary-v2-proc.sql
new file mode 100644
index 0000000..381d0a5
--- /dev/null
+++ b/scripts/ds-staging-table-procs/03-scv-summary-v2-proc.sql
@@ -0,0 +1,134 @@
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.scv_summary_v2_proc`(
+  schema_name STRING
+)
+BEGIN
+  EXECUTE IMMEDIATE FORMAT("""
+    CREATE OR REPLACE TABLE `%s.scv_summary` AS
+    WITH obs_sample AS (
+      SELECT
+        REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, 
+        `clinvar_ingest.parseSample`(obs.content) as s
+      FROM 
+        `%s.clinical_assertion_observation` obs
+    ),
+    obs_method AS (
+      SELECT
+        REGEXP_EXTRACT(id, r'^SCV[0-9]+') as id, 
+        m.description as method_desc,
+        m.method_type
+      FROM 
+        `%s.clinical_assertion_observation` obs,
+        UNNEST(`clinvar_ingest.parseMethods`(obs.content)) as m
+    ),
+    assertion_method AS (
+      SELECT 
+        ca.id, 
+        a.attribute.type,
+        a.attribute.value,
+        STRING_AGG(DISTINCT c.url,';') as url 
+      FROM `%s.clinical_assertion` ca,
+      UNNEST(`clinvar_ingest.parseAttributeSet`(ca.content)) as a,
+      UNNEST(a.citation) as c
+      WHERE 
+        a.attribute.type = "AssertionMethod" 
+        and 
+        c.url is not null
+      GROUP BY 
+        ca.id, 
+        a.attribute.type, 
+        a.attribute.value
+    ),
+    obs AS (
+      SELECT 
+        ca.id,
+        STRING_AGG(DISTINCT os.s.origin, ", " ORDER BY os.s.origin) as origin,
+        STRING_AGG(DISTINCT os.s.affected_status, ", " ORDER BY os.s.affected_status) as affected_status,
+        STRING_AGG(DISTINCT om.method_desc, ", " ORDER BY om.method_desc) as method_desc,
+        STRING_AGG(DISTINCT om.method_type, ", " ORDER BY om.method_type) as method_type
+      FROM 
+        `%s.clinical_assertion` ca
+      LEFT JOIN obs_sample os ON os.id = ca.id
+      LEFT JOIN obs_method om ON om.id = ca.id
+      GROUP BY
+        ca.id
+    ),
+    scv_classification_comment AS (
+      SELECT
+        id,
+        STRING_AGG(JSON_EXTRACT_SCALAR(c, r'$.text'), '\\n') as text
+      FROM
+        `%s.clinical_assertion` ca
+      LEFT JOIN UNNEST(ca.interpretation_comments) as c
+      WHERE ARRAY_LENGTH(ca.interpretation_comments) > 0
+      GROUP BY
+        id
+    )
+    SELECT 
+      ca.release_date as release_date,
+      ca.id, 
+      ca.version, 
+      ca.variation_id,
+      ca.local_key,
+      ca.interpretation_date_last_evaluated as last_evaluated, 
+      ca.statement_type,
+      cvs.rank,
+      ca.review_status, 
+      cst.clinvar_prop_type as clinvar_stmt_type,
+      cst.cvc_prop_type as cvc_stmt_type,
+      ca.interpretation_description as submitted_classification,
+      IFNULL(map.cv_clinsig_type, '-') as classif_type,
+      cst.significance,
+      `clinvar_ingest.parseComments`(ca.content) as comments,
+      scc.text as classification_comment,
+      ca.date_created,
+      ca.date_last_updated,
+      ca.clinical_assertion_observation_ids,
+      ca.submitter_id,
+      subm.submission_date, 
+      obs.origin,
+      obs.affected_status,
+      obs.method_desc,
+      obs.method_type,
+      am.value as assertion_method,
+      am.url as assertion_method_url
+    FROM
+      `%s.clinical_assertion` ca
+    LEFT JOIN scv_classification_comment scc 
+    ON
+      scc.id = ca.id
+    LEFT JOIN `clinvar_ingest.scv_clinsig_map` map
+    ON 
+      map.scv_term = lower(IFNULL(ca.interpretation_description,'not provided'))
+    LEFT JOIN `clinvar_ingest.clinvar_clinsig_types` cst 
+    ON 
+      cst.code = map.cv_clinsig_type
+    LEFT JOIN `clinvar_ingest.clinvar_status` cvs
+    ON
+      cvs.label = ca.review_status
+    LEFT JOIN obs
+    ON
+      obs.id = ca.id
+    LEFT JOIN assertion_method am
+    ON
+      am.id = ca.id
+    LEFT JOIN `%s.submission` subm 
+    ON 
+    subm.id = ca.submission_id
+  """, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name, schema_name);
+END;
+
+
+-- repair any null review_status values
+--  set any NULL review_status either 'no assertion provided' (if interp_desc is null or not provided) or 'no assertion criteria provided' (otherwise)
+--
+-- UPDATE `%s.clinical_assertion` ca
+-- SET 
+--   ca.review_status = 
+--     IF( 
+--       ca.interpretation_description is null OR 
+--       ca.interpretation_description = 'not provided', 
+--       'no assertion provided', 
+--       'no assertion criteria provided'
+--     )
+-- WHERE ca.review_status IS NULL
+-- ;
\ No newline at end of file
diff --git a/scripts/ds-staging-table-procs/04-single-gene-variation-v1-proc.sql b/scripts/ds-staging-table-procs/04-single-gene-variation-v1-proc.sql
new file mode 100644
index 0000000..02a87bd
--- /dev/null
+++ b/scripts/ds-staging-table-procs/04-single-gene-variation-v1-proc.sql
@@ -0,0 +1,363 @@
+-- see below for helper / setup scripts
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.single_gene_variation_v1`(
+  schema_name STRING,
+  release_date DATE
+)
+BEGIN
+  -- single gene variation  plan
+  --- step 1. create a table with columns variation_id, gene_id, somatic_flag 
+  ---         where the variation_id is the pk. 
+  EXECUTE IMMEDIATE FORMAT("""
+    CREATE OR REPLACE TABLE `%s.single_gene_variation`
+        ( release_date DATE NOT NULL,
+          variation_id STRING NOT NULL,
+          gene_id STRING NOT NULL,
+          relationship_type STRING,
+          source STRING,
+          mane_select BOOL DEFAULT FALSE,
+          somatic BOOL DEFAULT FALSE)
+  """, schema_name);
+
+  -- create a temp table that is the list of remaining variations so as to reduce the query cost of analyzing against the variation table.
+  EXECUTE IMMEDIATE FORMAT("""
+    CREATE TEMP TABLE _SESSION.temp_variation 
+    AS 
+    SELECT 
+      v.id, 
+      v.name,
+      v.descendant_ids,
+      v.subclass_type
+    FROM `%s.variation` v
+  """, schema_name);
+
+  --- step 2. (Resolvable Gene Symbol in variation name takes precedence as "single gene for variant")
+  ---         initialize the set with the extracted variation name gene symbols and the associated relationship info if available
+
+  -- prioritize mane select transcripts in title of variant
+  EXECUTE IMMEDIATE FORMAT("""
+    INSERT INTO `%s.single_gene_variation` (
+      release_date, 
+      variation_id, 
+      gene_id, 
+      relationship_type, 
+      source, 
+      mane_select 
+    )
+    SELECT DISTINCT
+      %T as release_date,
+      v.id as variation_id,
+      mane.gene_id,
+      IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, 
+      IFNULL(ga.source,'cvc calculated') as source, 
+      TRUE as mane_select
+    FROM `clinvar_ingest.mane_select_gene_transcript` mane
+    JOIN _SESSION.temp_variation v 
+    ON 
+      STARTS_WITH(v.name, mane.transcript_id)
+    JOIN `clinvar_ingest.entrez_gene` g 
+    ON 
+      g.gene_id = mane.gene_id
+    LEFT JOIN `%s.gene_association` ga 
+    ON 
+      ga.gene_id = g.gene_id 
+      and 
+      ga.variation_id = v.id
+  """, schema_name, release_date, schema_name);
+
+    -- create a temp table that is the list of gene associations 
+    --   for variations not yet in the single gene var table and 
+    --   don't have the gene symbol ending in an -AS# suffix
+  EXECUTE IMMEDIATE FORMAT("""
+    CREATE TEMP TABLE _SESSION.temp_gene_assoc 
+    AS 
+    SELECT DISTINCT
+      ga.variation_id, 
+      ga.gene_id, 
+      ga.relationship_type, 
+      ga.source
+    FROM `%s.gene_association` ga
+    JOIN `clinvar_ingest.entrez_gene` g 
+    ON 
+      ga.gene_id = g.gene_id
+    WHERE 
+      NOT REGEXP_CONTAINS(g.symbol_from_authority, r'\\-AS\\d$') 
+      AND
+      NOT EXISTS (
+        SELECT 
+          sgv.variation_id
+        FROM `%s.single_gene_variation` sgv
+        WHERE 
+          sgv.variation_id = ga.variation_id 
+      )
+  """, schema_name, schema_name);
+
+    -- NOTE: clinvar has a handful of duplicate gene records that can change over time
+    --      the plan will be to test the results of loading the variation_single_gene 
+    --      table to verify that any variants with multiple genes are simply duplicates and 
+    --      either one can be removed without an issue.
+
+  -- clinvar perferred label hgvs-style format NM_0000.0(GENE):c.234... (not mane select but still in name)
+  EXECUTE IMMEDIATE FORMAT("""
+    INSERT INTO `%s.single_gene_variation` 
+      (release_date, variation_id, gene_id, relationship_type, source)
+    WITH x AS
+    (
+      SELECT 
+        v.id, 
+        v.name, 
+        REGEXP_EXTRACT(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') as symbol
+      FROM _SESSION.temp_variation v
+      WHERE REGEXP_CONTAINS(v.name, r'^N[A-Z]_[0-9]+\\.[0-9]+\\(([A-Za-z0-9\\-]+)\\)') AND 
+        NOT EXISTS (
+          SELECT sgv.variation_id
+          FROM `%s.single_gene_variation` sgv
+          WHERE sgv.variation_id = v.id 
+        )
+    )
+    SELECT 
+      %T as release_date,
+      x.id as variation_id,
+      g.gene_id, 
+      IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, 
+      IFNULL(ga.source,'cvc calculated') as source
+    FROM x 
+    JOIN `clinvar_ingest.entrez_gene` g on UPPER(g.symbol_from_authority) = UPPER(x.symbol) AND NOT REGEXP_CONTAINS(x.symbol, r'\\-AS\\d$') 
+    LEFT JOIN _SESSION.temp_gene_assoc ga on ga.variation_id = x.id and ga.gene_id = g.gene_id
+  """, schema_name, schema_name, release_date); 
+
+
+    -- star allele format, CYP2C19*10. 
+  EXECUTE IMMEDIATE FORMAT("""
+    INSERT INTO `%s.single_gene_variation` 
+      (release_date, variation_id, gene_id, relationship_type, source)
+    WITH x AS
+    (
+      SELECT 
+        v.id, 
+        v.name, 
+        REGEXP_EXTRACT(v.name,  r'^([A-Za-z0-9\\-]+)[\\*\\,]') as symbol
+      FROM _SESSION.temp_variation v
+      WHERE REGEXP_CONTAINS(v.name,  r'^([A-Za-z0-9\\-]+)[\\*\\,]') AND
+        NOT EXISTS (
+          SELECT sgv.variation_id
+          FROM `%s.single_gene_variation` sgv
+          WHERE sgv.variation_id = v.id 
+        )
+    )
+    SELECT 
+      %T as release_date,
+      x.id as variation_id, 
+      g.gene_id,
+      IFNULL(ga.relationship_type,'named gene not associated') as relationship_type, 
+      IFNULL(ga.source,'cvc calculated') as source
+    FROM x 
+    JOIN `clinvar_ingest.entrez_gene` g ON UPPER(x.symbol) = UPPER(g.symbol_from_authority)
+    LEFT JOIN _SESSION.temp_gene_assoc ga ON x.id = ga.variation_id AND g.gene_id = ga.gene_id
+  """, schema_name, schema_name, release_date);
+
+    --- step 3. for any variations remaining... load all variations with any 
+    ---         genes that are mapped one-to-one from the gene association table
+  EXECUTE IMMEDIATE FORMAT("""
+    INSERT INTO `%s.single_gene_variation` 
+      (release_date, variation_id, gene_id, relationship_type, source)
+    WITH x AS
+    (
+      SELECT 
+        v.id
+      FROM _SESSION.temp_variation v
+      WHERE 
+        NOT EXISTS (
+          SELECT sgv.variation_id
+          FROM `%s.single_gene_variation` sgv
+          WHERE sgv.variation_id = v.id 
+        )
+    )
+    SELECT 
+      %T as release_date,
+      ga.variation_id, 
+      STRING_AGG(ga.gene_id), 
+      STRING_AGG(ga.relationship_type), 
+      STRING_AGG(ga.source)
+    FROM x
+    JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id
+    group by ga.variation_id
+    having count(distinct ga.gene_id) = 1
+  """, schema_name, schema_name, release_date);
+
+    --- step 4. for any variations remaining... load any variant with one submitted gene that 
+    ---          is not either "genes overlapped by variant" or "asserted, but not computed"
+  EXECUTE IMMEDIATE FORMAT("""
+    INSERT INTO `%s.single_gene_variation` 
+      (release_date, variation_id, gene_id, relationship_type, source )
+    WITH x AS
+    (
+      SELECT 
+        v.id
+      FROM _SESSION.temp_variation v
+      WHERE 
+        NOT EXISTS (
+          SELECT sgv.variation_id
+          FROM `%s.single_gene_variation` sgv
+          WHERE sgv.variation_id = v.id 
+        )
+    )
+    SELECT 
+      %T as release_date,
+      ga.variation_id, 
+      STRING_AGG(ga.gene_id), 
+      STRING_AGG(ga.relationship_type), 
+      STRING_AGG(ga.source)
+    FROM x
+    JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id
+    WHERE 
+      ga.relationship_type not in ('genes overlapped by variant' , 'asserted, but not computed') AND
+      ga.source = 'submitted'
+    GROUP BY ga.variation_id
+    HAVING count(distinct ga.gene_id) = 1
+  """, schema_name, schema_name, release_date);
+
+    --- step 5. for any variations remaining... load any variations with a "within single gene" 
+    ---         as long as it is associated to only one gene for that variant
+  EXECUTE IMMEDIATE FORMAT("""
+    INSERT INTO `%s.single_gene_variation` 
+      (release_date, variation_id, gene_id, relationship_type, source)
+    WITH x AS
+    (
+      SELECT 
+        v.id
+      FROM _SESSION.temp_variation v
+      WHERE 
+        NOT EXISTS (
+          SELECT sgv.variation_id
+          FROM `%s.single_gene_variation` sgv
+          WHERE sgv.variation_id = v.id 
+        )
+    )
+    SELECT 
+      %T as release_date,
+      ga.variation_id, 
+      STRING_AGG(ga.gene_id), 
+      STRING_AGG(ga.relationship_type), 
+      STRING_AGG(ga.source)
+    FROM x
+    JOIN _SESSION.temp_gene_assoc ga on x.id = ga.variation_id
+    WHERE (ga.relationship_type = 'within single gene')
+    GROUP BY ga.variation_id
+    having count(ga.gene_id) = 1
+  """, schema_name, schema_name, release_date);
+
+    --- last step. for any variations remaining... load any haplotype or genotype variations only if all the children have the same gene_id
+  EXECUTE IMMEDIATE FORMAT("""
+    INSERT INTO `%s.single_gene_variation` 
+      (release_date, variation_id, gene_id, relationship_type, source )
+    WITH x AS
+    (
+      SELECT 
+        v.id
+      FROM _SESSION.temp_variation v
+      WHERE 
+        NOT EXISTS (
+          SELECT sgv.variation_id
+          FROM `%s.single_gene_variation` sgv
+          WHERE sgv.variation_id = v.id 
+        )
+    )
+    SELECT 
+      %T as release_date, 
+      v.id as variation_id, 
+      STRING_AGG(ga.gene_id) as gene_id, 
+      'association not provided by clinvar' as relationship_type, 
+      'cvc calculated' as source
+    FROM x 
+    JOIN _SESSION.temp_variation v ON x.id = v.id
+    CROSS JOIN UNNEST(v.descendant_ids) AS descendant_id
+    JOIN _SESSION.temp_variation d ON 
+        d.id = descendant_id AND 
+        d.subclass_type='SimpleAllele'
+    LEFT JOIN `%s.gene_association` ga on ga.variation_id = d.id
+    WHERE ARRAY_LENGTH(v.descendant_ids) > 0 
+    GROUP BY v.id
+    HAVING COUNT(ga.gene_id) = 1
+  """, schema_name, schema_name, release_date, schema_name);
+
+    --- Finally, update somatic flags based on current onco-gene list (should this be a 
+    --- 'change over time' capture of the onco-gene list's state?)
+  EXECUTE IMMEDIATE FORMAT("""
+    UPDATE `%s.single_gene_variation` sgv
+    SET sgv.somatic = TRUE
+    WHERE EXISTS (
+      SELECT cg.hgnc_id from `clinvar_ingest.cancer_genes` cg
+      join `clinvar_ingest.entrez_gene` g on g.hgnc_id = cg.hgnc_id
+      WHERE g.gene_id = sgv.gene_id
+    )
+  """, schema_name);
+
+  DROP TABLE _SESSION.temp_variation;
+  DROP TABLE _SESSION.temp_gene_assoc;
+
+END;
+
+
+
+-- -- PRE 2019-07-01 data setup one-off
+-- -- remove duplicates from contrived gene_association table in pre-2019-07-01 data
+-- CREATE OR REPLACE TABLE `clinvar_2019_06_01_v0.gene_association`
+-- AS
+-- SELECT x.release_date, x.gene_id, x.variation_id, x.relationship_type, x.source
+-- FROM (
+--   SELECT
+--       release_date, gene_id, variation_id, relationship_type, source,
+--       ROW_NUMBER()
+--           OVER (PARTITION BY release_date, gene_id, variation_id)
+--           row_number
+--   FROM `clinvar_2019_06_01_v0.gene_association`
+-- ) x
+-- WHERE x.row_number = 1
+-- ;
+
+-- -- contrive a single_gene_variation table for the pre-2019-07-01 data
+-- CREATE OR REPLACE TABLE `clinvar_2019_06_01_v0.single_gene_variation`
+-- AS
+-- WITH x AS
+-- (
+--   select release_date, variation_id
+--   from  `clinvar_2019_06_01_v0.gene_association`
+--   group by release_date, variation_id
+--   having count(*) = 1
+-- )
+-- SELECT 
+--   x.release_date,
+--   ga.gene_id,
+--   x.variation_id, 
+--   ga.relationship_type,
+--   ga.source,
+--   (cg.hgnc_id IS NOT NULL) as somatic,
+--   FALSE as mane_select
+-- FROM x
+-- join `clinvar_2019_06_01_v0.gene_association` ga on x.release_date = ga.release_date and x.variation_id = ga.variation_id
+-- left join `clinvar_2019_06_01_v0.gene` g on g.id = ga.gene_id and x.release_date = g.release_date
+-- left join `clinvar_ingest.cancer_genes` cg on g.hgnc_id = cg.hgnc_id
+-- ;
+
+
+  -- -- validate between steps?...
+  -- select vsg.* 
+  -- from `clinvar_2022_05_17_v1_6_46.single_gene_variation` vsg
+  -- join ( select vsg2.variation_id 
+  --        from `clinvar_2022_05_17_v1_6_46.single_gene_variation` vsg2 
+  --        group by vsg2.variation_id having count(distinct vsg2.gene_id) > 1) vsg2 on  vsg.variation_id = vsg2.variation_id
+  -- order by 1,3;
+
+
+    -- -- Helper: To Find duplicate gene ids in release
+    -- select count(distinct ga.variation_id), ga.gene_id, g.symbol, g.hgnc_id
+    -- from `clinvar_2022_05_17_v1_6_46.gene_association` ga, 
+    --      `clinvar_2022_05_17_v1_6_46.gene` g,
+    --      (SELECT ARRAY_AGG(id) as gene_ids 
+    --        from `clinvar_2022_05_17_v1_6_46.gene` g 
+    --         group by hgnc_id, symbol, release_date, full_name 
+    --         having count(distinct id)>1 ) as gx,
+    --      UNNEST(gx.gene_ids) as gid
+    -- WHERE gid = ga.gene_id and g.id = ga.gene_id
+    -- group by ga.gene_id, g.symbol, g.hgnc_id
+    -- order by 3,1;
\ No newline at end of file
diff --git a/scripts/ds-staging-table-procs/03-gc-scv-proc.sql b/scripts/ds-staging-table-procs/05-gc-scv-v1-proc.sql
similarity index 85%
rename from scripts/ds-staging-table-procs/03-gc-scv-proc.sql
rename to scripts/ds-staging-table-procs/05-gc-scv-v1-proc.sql
index c00ebd5..8f0c1b5 100644
--- a/scripts/ds-staging-table-procs/03-gc-scv-proc.sql
+++ b/scripts/ds-staging-table-procs/05-gc-scv-v1-proc.sql
@@ -1,8 +1,7 @@
-CREATE OR REPLACE PROCEDURE
-  `clinvar_ingest.gc_scv_proc`(start_with DATE)
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.gc_scv_v1`(
+  schema_name STRING
+)
 BEGIN
-  FOR rec IN (SELECT s.schema_name, s.release_date, s.prev_release_date, s.next_release_date FROM clinvar_ingest.schemas_on_or_after(start_with) AS s)
-  DO
     EXECUTE IMMEDIATE FORMAT("""
       CREATE OR REPLACE TABLE `%s.gc_scv`
       AS
@@ -29,13 +28,15 @@ BEGIN
             SELECT 
               od.attribute.value
             FROM UNNEST(`clinvar_ingest.parseObservedData`(cao.content)) od 
-            WHERE od.attribute.type = 'SampleLocalID'
+            WHERE 
+              od.attribute.type = 'SampleLocalID'
           ) as sample_id,
           (
             SELECT 
               od.attribute.value
             FROM UNNEST(`clinvar_ingest.parseObservedData`(cao.content)) od 
-            WHERE od.attribute.type = 'SampleVariantID'
+            WHERE 
+              od.attribute.type = 'SampleVariantID'
           ) as sample_variant_id,
           cao.id as scv_obs_id
         FROM `variation_tracker.report_submitter` rs
@@ -73,6 +74,5 @@ BEGIN
         lower(cct.label) = lower(tc.lab.classification)
       WHERE 
         IFNULL(tc.lab.name, IFNULL(tc.lab.id,tc.lab.classification)) IS NOT NULL
-      """, rec.schema_name, rec.schema_name, rec.schema_name);
-  END FOR;
+      """, schema_name, schema_name, schema_name);
 END;
\ No newline at end of file
diff --git a/scripts/ds-staging-table-procs/dataset-preparation-v1-proc.sql b/scripts/ds-staging-table-procs/dataset-preparation-v1-proc.sql
new file mode 100644
index 0000000..0183aa8
--- /dev/null
+++ b/scripts/ds-staging-table-procs/dataset-preparation-v1-proc.sql
@@ -0,0 +1,20 @@
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.dataset_preparation_v1`(
+  on_date DATE
+)
+BEGIN
+  FOR rec IN (
+    select 
+      s.schema_name, 
+      s.release_date, 
+      s.prev_release_date, 
+      s.next_release_date 
+    FROM clinvar_ingest.schema_on_v2(on_date) as s
+  )
+  DO
+    CALL `clinvar_ingest.validate_dataset_terms_v1`(rec.schema_name);
+    CALL `clinvar_ingest.scv_summary_v1`(rec.schema_name);
+    CALL `clinvar_ingest.single_gene_variation_v1`(rec.schema_name, rec.release_date);
+    CALL `clinvar_ingest.gc_scv_v1`(rec.schema_name, rec.release_date);
+  END FOR;
+
+END;
\ No newline at end of file
diff --git a/scripts/ds-staging-table-procs/dataset-preparation-v2-proc.sql b/scripts/ds-staging-table-procs/dataset-preparation-v2-proc.sql
new file mode 100644
index 0000000..7b5ae47
--- /dev/null
+++ b/scripts/ds-staging-table-procs/dataset-preparation-v2-proc.sql
@@ -0,0 +1,21 @@
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.dataset_preparation_v2`(
+  on_date DATE
+)
+BEGIN
+  FOR rec IN (
+    select 
+      s.schema_name, 
+      s.release_date, 
+      s.prev_release_date, 
+      s.next_release_date 
+    FROM clinvar_ingest.schema_on_v2(on_date) as s
+  )
+  DO
+    CALL `clinvar_ingest.validate_dataset_v1`(rec.schema_name);
+    CALL `clinvar_ingest.normalize_dataset_v2`(rec.schema_name);
+    CALL `clinvar_ingest.scv_summary_v2`(rec.schema_name);
+    CALL `clinvar_ingest.single_gene_variation_v1`(rec.schema_name, rec.release_date);
+    CALL `clinvar_ingest.gc_scv_v1`(rec.schema_name);
+  END FOR;
+
+END;
\ No newline at end of file
diff --git a/scripts/general/schemas-on-or-after-v2-func.sql b/scripts/general/schemas-on-or-after-v2-func.sql
index a73b1ba..905b77f 100644
--- a/scripts/general/schemas-on-or-after-v2-func.sql
+++ b/scripts/general/schemas-on-or-after-v2-func.sql
@@ -14,4 +14,4 @@ AS (
   ORDER BY 2
 );
 
--- select * from `clinvar_ingest.schemas_on_or_after`(DATE('2020-06-01'));
\ No newline at end of file
+-- select * from `clinvar_ingest.schemas_on_or_after_v2`(DATE('2020-06-01'));
\ No newline at end of file
diff --git a/scripts/general/tables-columns-exists-proc.sql b/scripts/general/tables-columns-exists-proc.sql
new file mode 100644
index 0000000..4e69d92
--- /dev/null
+++ b/scripts/general/tables-columns-exists-proc.sql
@@ -0,0 +1,30 @@
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.check_table_exists`(
+    schema_name STRING,
+    table_name STRING,
+    OUT table_exists BOOL
+)
+BEGIN
+  EXECUTE IMMEDIATE FORMAT("""
+    SELECT EXISTS(
+      SELECT 1 
+      FROM `%s.INFORMATION_SCHEMA.TABLES`
+      WHERE table_name = '%s'
+    )
+  """, schema_name, table_name) INTO table_exists;
+END;
+
+CREATE OR REPLACE PROCEDURE `clinvar_ingest.check_column_exists`(
+    schema_name STRING,
+    table_name STRING,
+    column_name STRING,
+    OUT column_exists BOOL
+)
+BEGIN
+  EXECUTE IMMEDIATE FORMAT("""
+    SELECT EXISTS(
+      SELECT 1
+      FROM `%s.INFORMATION_SCHEMA.COLUMNS`
+      WHERE table_name = '%s' AND column_name = '%s'
+    )
+  """, schema_name, table_name, column_name) INTO column_exists;
+END;
\ No newline at end of file