diff --git a/scripts/dataset-preparation/old-xml-rcv-data-preparation/01-rcv-data-prep.sql b/scripts/dataset-preparation/old-xml-rcv-data-preparation/01-rcv-data-prep.sql
new file mode 100644
index 0000000..dacdc58
--- /dev/null
+++ b/scripts/dataset-preparation/old-xml-rcv-data-preparation/01-rcv-data-prep.sql
@@ -0,0 +1,183 @@
+-- Builds per-month helper tables in `clingen-dev.clinvar_000` from the
+-- `<yyyy-mm>-rcv-source` tables, whose `content` column holds ClinVarSet JSON:
+--   <yyyy-mm>-rcv                RCV attributes plus trait and SCV JSON arrays
+--   <yyyy-mm>-scv                RCV accession/version to SCV accession/version
+--   <yyyy-mm>-trait-extract      one row per RCV trait with names, symbols, xrefs
+--   <yyyy-mm>-trait              trait id, preferred name/symbol, joined xref ids
+--   <yyyy-mm>-trait-set-mapping  distinct (trait_set_id, trait_id) pairs
+-- Every table is built with CREATE OR REPLACE, so re-running overwrites them.
+BEGIN
+  -- Months to process; each value is substituted into the table names below.
+  DECLARE mmyy_array ARRAY<STRING>;
+
+  SET mmyy_array = [
+    '2023-01',
+    '2023-02',
+    '2023-03',
+    '2023-04',
+    '2023-05',
+    '2023-06',
+    '2023-07',
+    '2023-08',
+    '2023-09',
+    '2023-10',
+    '2023-11',
+    '2023-12',
+    '2024-01',
+    '2024-02'];
+
+  -- Loop through the array
+  FOR rec IN (SELECT mmyy FROM UNNEST(mmyy_array) as mmyy)
+  DO
+
+    -- <yyyy-mm>-rcv: scalar RCV attributes plus the Trait and ClinVarAssertion
+    -- elements as arrays. A path that already holds a JSON array is used as is;
+    -- otherwise the single value found there is wrapped in a one-element array,
+    -- and a missing path yields NULL instead of an array with a NULL element.
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE OR REPLACE TABLE `clingen-dev.clinvar_000.%s-rcv`
+      AS
+      SELECT
+        id,
+        JSON_EXTRACT_SCALAR(content, "$.ClinVarSet.ReferenceClinVarAssertion.ClinVarAccession['@Acc']") as rcv_accession,
+        JSON_EXTRACT_SCALAR(content, "$.ClinVarSet.ReferenceClinVarAssertion.ClinVarAccession['@Version']") as rcv_version,
+        JSON_EXTRACT_SCALAR(content, "$.ClinVarSet.ReferenceClinVarAssertion.ClinicalSignificance.Description") as clinical_significance,
+        JSON_EXTRACT_SCALAR(content, "$.ClinVarSet.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus") as review_status,
+        JSON_EXTRACT_SCALAR(content, "$.ClinVarSet.ReferenceClinVarAssertion.MeasureSet['@Acc']") as vcv_id,
+        JSON_EXTRACT_SCALAR(content, "$.ClinVarSet.ReferenceClinVarAssertion.MeasureSet['@Version']") as vcv_version,
+        JSON_EXTRACT_SCALAR(content, "$.ClinVarSet.ReferenceClinVarAssertion.MeasureSet['@ID']") as variation_id,
+        JSON_EXTRACT_SCALAR(content, "$.ClinVarSet.ReferenceClinVarAssertion.TraitSet['@ID']") as trait_set_id,
+        JSON_EXTRACT_SCALAR(content, "$.ClinVarSet.ReferenceClinVarAssertion.TraitSet['@Type']") as trait_set_type,
+        IF(
+          ARRAY_LENGTH(JSON_EXTRACT_ARRAY(content, "$.ClinVarSet.ReferenceClinVarAssertion.TraitSet.Trait")) > 0,
+          JSON_EXTRACT_ARRAY(content, "$.ClinVarSet.ReferenceClinVarAssertion.TraitSet.Trait"),
+          IF(JSON_EXTRACT(content, "$.ClinVarSet.ReferenceClinVarAssertion.TraitSet.Trait") is NULL, NULL, [JSON_EXTRACT(content, "$.ClinVarSet.ReferenceClinVarAssertion.TraitSet.Trait")])
+        ) as trait_content,
+        IF(
+          ARRAY_LENGTH(JSON_EXTRACT_ARRAY(content, "$.ClinVarSet.ClinVarAssertion")) > 0,
+          JSON_EXTRACT_ARRAY(content, "$.ClinVarSet.ClinVarAssertion"),
+          IF(JSON_EXTRACT(content, "$.ClinVarSet.ClinVarAssertion") is NULL, NULL, [JSON_EXTRACT(content, "$.ClinVarSet.ClinVarAssertion")])
+        ) as scv_content
+      FROM `clingen-dev.clinvar_000.%s-rcv-source`
+    """, rec.mmyy, rec.mmyy);
+
+    -- <yyyy-mm>-scv: one row per element of scv_content, pairing the RCV
+    -- accession/version with the SCV accession/version found in that element.
+    EXECUTE IMMEDIATE FORMAT(
+    """
+      CREATE OR REPLACE TABLE `clingen-dev.clinvar_000.%s-scv`
+      AS
+      SELECT
+        rcv.rcv_accession,
+        rcv.rcv_version,
+        JSON_EXTRACT_SCALAR(scv_content, "$.ClinVarAccession['@Acc']") as scv_id,
+        JSON_EXTRACT_SCALAR(scv_content, "$.ClinVarAccession['@Version']") as scv_version
+      FROM `clingen-dev.clinvar_000.%s-rcv` rcv
+      CROSS JOIN UNNEST(rcv.scv_content) as scv_content
+    """
+    , rec.mmyy, rec.mmyy);
+
+    -- <yyyy-mm>-trait-extract: one row per element of trait_content. Name and
+    -- Symbol are normalised to arrays the same way (NULL when absent), and
+    -- xrefs come from the `clinvar_ingest.parseXRefs` function.
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE OR REPLACE TABLE `clingen-dev.clinvar_000.%s-trait-extract`
+      AS
+      SELECT
+        rcv.rcv_accession,
+        rcv.rcv_version,
+        rcv.trait_set_id,
+        JSON_EXTRACT_SCALAR(trait_content, "$['@ID']") as trait_id,
+        JSON_EXTRACT_SCALAR(trait_content, "$['@Type']") as trait_type,
+        IF(
+          ARRAY_LENGTH(JSON_EXTRACT_ARRAY(trait_content, "$.Name")) > 0,
+          JSON_EXTRACT_ARRAY(trait_content, "$.Name"),
+          IF(JSON_EXTRACT(trait_content, "$.Name") is NULL, NULL, [JSON_EXTRACT(trait_content, "$.Name")])
+        ) as trait_name_content,
+        IF(
+          ARRAY_LENGTH(JSON_EXTRACT_ARRAY(trait_content, "$.Symbol")) > 0,
+          JSON_EXTRACT_ARRAY(trait_content, "$.Symbol"),
+          IF(JSON_EXTRACT(trait_content, "$.Symbol") is NULL, NULL, [JSON_EXTRACT(trait_content, "$.Symbol")])
+        ) as trait_symbol_content,
+        `clinvar_ingest.parseXRefs`(TO_JSON_STRING(trait_content)) as xrefs
+      FROM `clingen-dev.clinvar_000.%s-rcv` rcv
+      CROSS JOIN UNNEST(rcv.trait_content) as trait_content
+    """, rec.mmyy, rec.mmyy);
+
+    -- <yyyy-mm>-trait: for every distinct trait_id, its 'Preferred' name and
+    -- symbol (when present) and a sorted, comma-separated list of xref ids.
+    -- 'secondary' xrefs are skipped. Ids containing ':' are lower-cased; others
+    -- are prefixed 'mim:' / 'mimps:' for OMIM, else with the lower-cased db.
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE OR REPLACE TABLE `clingen-dev.clinvar_000.%s-trait`
+      AS
+      WITH trait_name AS (
+        SELECT DISTINCT
+          trait.trait_id,
+          JSON_EXTRACT_SCALAR(name_content, "$.ElementValue['#text']") as name
+        FROM `clingen-dev.clinvar_000.%s-trait-extract` trait
+        CROSS JOIN UNNEST(trait.trait_name_content) as name_content
+        WHERE
+          JSON_EXTRACT_SCALAR(name_content, "$.ElementValue['@Type']") = 'Preferred'
+      ),
+      trait_symbol AS (
+        SELECT DISTINCT
+          trait.trait_id,
+          JSON_EXTRACT_SCALAR(symbol_content, "$.ElementValue['#text']") as symbol
+        FROM `clingen-dev.clinvar_000.%s-trait-extract` trait
+        CROSS JOIN UNNEST(trait.trait_symbol_content) as symbol_content
+        WHERE
+          JSON_EXTRACT_SCALAR(symbol_content, "$.ElementValue['@Type']") = 'Preferred'
+      ),
+      trait_xref_id AS (
+        SELECT DISTINCT
+          trait.trait_id,
+          IF(xref.id like '%%:%%', LOWER(xref.id), IF(xref.db = 'OMIM', IF(xref.id LIKE 'PS%%', 'mimps:'||xref.id , 'mim:'||xref.id),LOWER(xref.db)||':'||xref.id)) as xref_id
+        FROM `clingen-dev.clinvar_000.%s-trait-extract` trait
+        LEFT JOIN UNNEST(trait.xrefs) as xref
+        WHERE IFNULL(xref.type,'') <> 'secondary'
+      ),
+      uniq_trait AS (
+        SELECT
+          trait.trait_id
+        from `clingen-dev.clinvar_000.%s-trait-extract` trait
+        group by
+          trait.trait_id
+      )
+      select
+        trait.trait_id,
+        trait_name.name,
+        trait_symbol.symbol,
+        ARRAY_TO_STRING(ARRAY_AGG(DISTINCT trait_xref_id.xref_id ORDER BY trait_xref_id.xref_id ASC),', ') as xref_ids
+      from uniq_trait trait
+      left join trait_name
+      on
+        trait_name.trait_id = trait.trait_id
+      left join trait_symbol
+      on
+        trait_symbol.trait_id = trait.trait_id
+      left join trait_xref_id
+      on
+        trait_xref_id.trait_id = trait.trait_id
+      group by
+        trait.trait_id,
+        trait_name.name,
+        trait_symbol.symbol
+    """, rec.mmyy, rec.mmyy, rec.mmyy, rec.mmyy, rec.mmyy);
+
+    -- <yyyy-mm>-trait-set-mapping: distinct (trait_set_id, trait_id) pairs.
+    EXECUTE IMMEDIATE FORMAT("""
+      CREATE OR REPLACE TABLE `clingen-dev.clinvar_000.%s-trait-set-mapping`
+      AS
+      SELECT
+        trait.trait_set_id,
+        trait.trait_id
+      from `clingen-dev.clinvar_000.%s-trait-extract` trait
+      group by
+        trait.trait_set_id,
+        trait.trait_id
""", rec.mmyy, rec.mmyy); + + END FOR; + +END; diff --git a/scripts/dataset-preparation/old-xml-rcv-data-preparation/02-rcv-data-prep-cleanup.sql b/scripts/dataset-preparation/old-xml-rcv-data-preparation/02-rcv-data-prep-cleanup.sql new file mode 100644 index 0000000..c0c78f1 --- /dev/null +++ b/scripts/dataset-preparation/old-xml-rcv-data-preparation/02-rcv-data-prep-cleanup.sql @@ -0,0 +1,1054 @@ +-- rcv-accession cleanup from DSP datasets 2023-01-07 thru 2024-01-07 +-- used 2023-01 for previous and 2023-02 for next month with datasets 12/31/2022: + -- clinvar_2023_01_07_v1_6_57 + -- clinvar_2023_01_15_v1_6_57 + -- clinvar_2023_01_21_v1_6_57 +-- used 2023-02 for previous and 2023-03 for next month with datasets 2/08/2023: + -- clinvar_2023_02_08_v1_6_57 + -- clinvar_2023_02_13_v1_6_57 + -- clinvar_2023_02_18_v1_6_57 +-- used 2023-03 for previous and 2023-04 for next month with datasets 2/26/2023: + -- clinvar_2023_02_26_v1_6_57 + -- clinvar_2023_03_06_v1_6_57 + -- clinvar_2023_03_11_v1_6_57 + -- clinvar_2023_03_18_v1_6_57 + -- clinvar_2023_03_26_v1_6_57 +-- used 2023-04 for previous and 2023-05 for next month with datasets 4/4/2023: + -- clinvar_2023_04_04_v1_6_58 + -- clinvar_2023_04_10_v1_6_58 + -- clinvar_2023_04_16_v1_6_58 + -- clinvar_2023_04_24_v1_6_58 +-- used 2023-05 for previous and 2023-06 for next month with datasets 4/30/2023: + -- clinvar_2023_04_30_v1_6_58 + -- clinvar_2023_05_07_v1_6_59 + -- clinvar_2023_05_14_v1_6_59 + -- clinvar_2023_05_20_v1_6_60 +-- used 2023-06 for previous and 2023-07 for next month with datasets 5/27/2023: + -- clinvar_2023_05_27_v1_6_60 + -- clinvar_2023_06_04_v1_6_60 + -- clinvar_2023_06_10_v1_6_60 + -- clinvar_2023_06_17_v1_6_60 + -- clinvar_2023_06_26_v1_6_60 +-- used 2023-07 for previous and 2023-08 for next month with datasets 7/02/2023: + -- clinvar_2023_07_02_v1_6_60 + -- clinvar_2023_07_10_v1_6_60 + -- clinvar_2023_07_17_v1_6_60 + -- clinvar_2023_07_22_v1_6_60 +-- used 2023-08 for previous and 2023-09 for next 
month with datasets 2023-07-30 : + -- clinvar_2023_07_30_v1_6_60 + -- clinvar_2023_08_06_v1_6_61 + -- clinvar_2023_08_13_v1_6_61 + -- clinvar_2023_08_19_v1_6_61 + -- clinvar_2023_08_26_v1_6_61 +-- used 2023-09 for previous and 2023-10 for next month with datasets 2023-09-03 : + -- clinvar_2023_09_03_v1_6_61 + -- clinvar_2023_09_10_v1_6_61 + -- clinvar_2023_09_17_v1_6_61 + -- clinvar_2023_09_23_v1_6_61 +-- used 2023-10 for previous and 2023-11 for next month with datasets 2023-09-30 : + -- clinvar_2023_09_30_v1_6_61 + -- clinvar_2023_10_07_v1_6_61 + -- clinvar_2023_10_15_v1_6_61 + -- clinvar_2023_10_21_v1_6_61 +-- used 2023-11 for previous and 2023-12 for next month with datasets 2023-10-28 : + -- clinvar_2023_10_28_v1_6_61 + -- clinvar_2023_11_04_v1_6_61 + -- clinvar_2023_11_12_v1_6_61 + -- clinvar_2023_11_21_v1_6_61 + -- clinvar_2023_11_26_v1_6_61 +-- used 2023-12 for previous and 2024-01 for next month with datasets 2023-12-03 : + -- clinvar_2023_12_03_v1_6_61 + -- clinvar_2023_12_09_v1_6_61 + -- clinvar_2023_12_17_v1_6_61 + -- clinvar_2023_12_26_v1_6_61 +-- used 2024-01 for previous and 2024-02 for next month with datasets 2023-12-30 : + -- clinvar_2023_12_30_v1_6_61 + -- clinvar_2024_01_07_v1_6_61 +-- + +-- backup rcv_accession +create table `clinvar_2024_01_07_v1_6_61.backup_rcv_accession_2` +as +select + * +from `clingen-dev.clinvar_2024_01_07_v1_6_61.rcv_accession` +; + +update `clingen-dev.clinvar_2024_01_07_v1_6_61.rcv_accession` +set trait_set_id = NULL +where true +; + +update `clingen-dev.clinvar_2024_01_07_v1_6_61.rcv_accession` rcv +set + rcv.trait_set_id = ( + SELECT + old_rcv.trait_set_id + FROM `clingen-dev.clinvar_000.2024-01-rcv` old_rcv + WHERE + rcv.id = old_rcv.rcv_accession + ) +where + rcv.trait_set_id is null +; + +update `clingen-dev.clinvar_2024_01_07_v1_6_61.rcv_accession` rcv +set + rcv.trait_set_id = ( + SELECT + old_rcv.trait_set_id + FROM `clingen-dev.clinvar_000.2024-02-rcv` old_rcv + WHERE + rcv.id = old_rcv.rcv_accession + ) 
+where + rcv.trait_set_id is null +; + +--remaining nulls get repopulated from original DSP method +update `clingen-dev.clinvar_2024_01_07_v1_6_61.rcv_accession` rcv +set + rcv.trait_set_id = ( + SELECT + bu_rcv.trait_set_id + FROM `clinvar_2024_01_07_v1_6_61.backup_rcv_accession_2` bu_rcv + WHERE + bu_rcv.id = rcv.id + ) +where + rcv.trait_set_id is null +; + +-- VERIFY that NOTHING IS LEFT UNMAPPED!@@are there any null trait_set_ids left? +select + rcv.id, + rcv.variation_archive_id, + rcv.trait_set_id, + bu_rcv.trait_set_id +from `clingen-dev.clinvar_2024_01_07_v1_6_61.rcv_accession` rcv +left join `clinvar_2024_01_07_v1_6_61.backup_rcv_accession_2` bu_rcv +on + bu_rcv.id = rcv.id +where + rcv.trait_set_id is null +order by 1 +; + +-- backup clinical_assertion, +create table `clinvar_2024_01_07_v1_6_61.backup_clinical_assertion_2` +as +select + * +from `clingen-dev.clinvar_2024_01_07_v1_6_61.clinical_assertion` +; + +-- clear out rcv_accession_id and trait_set_id +update `clingen-dev.clinvar_2024_01_07_v1_6_61.clinical_assertion` +set + rcv_accession_id = null, + trait_set_id = null +where + true +; + +-- update the rcv_accession_id with the previous month's rcv data based on last_updated_date +update `clingen-dev.clinvar_2024_01_07_v1_6_61.clinical_assertion` scv +set + rcv_accession_id = ( + select + old_scv.rcv_accession + from `clingen-dev.clinvar_000.2024-01-scv` old_scv + join `clingen-dev.clinvar_2024_01_07_v1_6_61.rcv_accession` rcv + on + rcv.id = old_scv.rcv_accession + where + old_scv.scv_id = scv.id + and + scv.date_last_updated <= DATE'2023-12-30' + ) +where + rcv_accession_id is null +; +-- 2,666,170 updated +-- 2,668,077 updated +-- 2,670,944 updated +-- 3,257,231 updated +-- 3,278,301 updated +-- 3,281,124 +-- 3,281,295 +-- 3,308,102 +-- +--3,315,748 +-- 3,316,032 +-- 3,323,381 +-- 3,300,468 + +-- update the remaining rcv_accession_ids with the next month's rcv data based on scv id alone, verfiy rcvs are valid for this release +update 
`clingen-dev.clinvar_2024_01_07_v1_6_61.clinical_assertion` scv +set + rcv_accession_id = ( + select + old_scv.rcv_accession + from `clingen-dev.clinvar_000.2024-02-scv` old_scv + join `clingen-dev.clinvar_2024_01_07_v1_6_61.rcv_accession` rcv + on + rcv.id = old_scv.rcv_accession + where + old_scv.scv_id = scv.id + ) +where + rcv_accession_id is null +; + +select + scv.id +from `clingen-dev.clinvar_2024_01_07_v1_6_61.clinical_assertion` scv +where + scv.rcv_accession_id is null +; + +update `clingen-dev.clinvar_2024_01_07_v1_6_61.clinical_assertion` scv +set + scv.rcv_accession_id = ( + select + bu_scv.rcv_accession_id as orig_rcv + from `clingen-dev.clinvar_2024_01_07_v1_6_61.backup_clinical_assertion_2` bu_scv + where + scv.id = bu_scv.id + ) +where + scv.rcv_accession_id is null; +-- LOG of updates from statement directly above ! +-- 6 updated in 4/10/2023 dataset +-- id +-- SCV003852633 +-- SCV003852640 +-- SCV003852630 +-- SCV003852637 +-- SCV003852639 +-- SCV003852632 + +-- 14 updated in 5/14/2023 dataset +-- id +-- SCV003922381 +-- SCV003924062 +-- SCV003924088 +-- SCV003924090 +-- SCV003922379 +-- SCV003922384 +-- SCV000025456 +-- SCV003922380 +-- SCV003922377 +-- SCV003922387 +-- SCV003924065 +-- SCV003924061 +-- SCV003922383 +-- SCV003922382 + +-- 4 updated in 5/20/2023 dataset +-- id +-- SCV003925572 +-- SCV003925573 +-- SCV003925574 +-- SCV003924098 + +-- 24 updated in 6/4/2023 dataset +-- id +-- SCV003928201 +-- SCV001448338 +-- SCV000569204 +-- SCV003928710 +-- SCV003928448 +-- SCV003928548 +-- SCV000695151 +-- SCV003928930 +-- SCV000052842 +-- SCV003927178 +-- SCV003928263 +-- SCV003928453 +-- SCV003929400 +-- SCV003926644 +-- SCV003928200 +-- SCV001713986 +-- SCV000571948 +-- SCV001136949 +-- SCV000691706 +-- SCV003928199 +-- SCV003926814 +-- SCV001765850 +-- SCV001713960 +-- SCV003928446 + +-- 36 updated in 6/10/2023 dataset +-- id +-- SCV000569204 +-- SCV003928448 +-- SCV003929400 +-- SCV003928200 +-- SCV000695151 +-- SCV003928548 +-- 
SCV000565546 +-- SCV000210022 +-- SCV000033528 +-- SCV000570889 +-- SCV003928710 +-- SCV003926814 +-- SCV000611086 +-- SCV000583994 +-- SCV003928453 +-- SCV000691706 +-- SCV003929413 +-- SCV000052842 +-- SCV000572807 +-- SCV000210044 +-- SCV003928201 +-- SCV001471668 +-- SCV000571948 +-- SCV001136949 +-- SCV001815453 +-- SCV001765850 +-- SCV003928930 +-- SCV001448338 +-- SCV003927178 +-- SCV001471898 +-- SCV001713986 +-- SCV003928199 +-- SCV001713960 +-- SCV003926644 +-- SCV003928446 +-- SCV003928263 + +-- 43 updated in 6/17/2023 dataset +-- id +-- SCV000691706 +-- SCV000571948 +-- SCV000564787 +-- SCV000695151 +-- SCV000569204 +-- SCV000210022 +-- SCV000526367 +-- SCV003928263 +-- SCV000570889 +-- SCV003926814 +-- SCV000572807 +-- SCV000581705 +-- SCV001713960 +-- SCV001471898 +-- SCV001713986 +-- SCV000572796 +-- SCV000033528 +-- SCV001471668 +-- SCV003928930 +-- SCV003928448 +-- SCV000566363 +-- SCV001448338 +-- SCV000566285 +-- SCV000583994 +-- SCV001136949 +-- SCV003927178 +-- SCV003926644 +-- SCV003929400 +-- SCV000589353 +-- SCV001765850 +-- SCV000210044 +-- SCV003928548 +-- SCV003928446 +-- SCV000565546 +-- SCV003931804 +-- SCV000611086 +-- SCV000223217 +-- SCV001815453 +-- SCV003928710 +-- SCV000052842 +-- SCV000569748 +-- SCV000582974 +-- SCV003928453 + +-- 57 IN 6/26 +-- id +-- SCV000210794 +-- SCV000569748 +-- SCV001713986 +-- SCV003928930 +-- SCV000223217 +-- SCV000210044 +-- SCV000210022 +-- SCV003928453 +-- SCV000583994 +-- SCV000582974 +-- SCV000589353 +-- SCV000564787 +-- SCV003926644 +-- SCV003933886 +-- SCV000571756 +-- SCV003931804 +-- SCV001337998 +-- SCV001765850 +-- SCV000565546 +-- SCV001448338 +-- SCV000572796 +-- SCV000581705 +-- SCV003929400 +-- SCV000568248 +-- SCV003928448 +-- SCV003934699 +-- SCV000033528 +-- SCV000570889 +-- SCV003933902 +-- SCV000566285 +-- SCV001471898 +-- SCV000697011 +-- SCV000695151 +-- SCV001471668 +-- SCV003928710 +-- SCV003926814 +-- SCV000917499 +-- SCV000572807 +-- SCV000571948 +-- SCV003935057 +-- 
SCV000691706 +-- SCV000917222 +-- SCV000566363 +-- SCV000526367 +-- SCV003928263 +-- SCV001815453 +-- SCV003927178 +-- SCV000564577 +-- SCV000568068 +-- SCV001136949 +-- SCV000569204 +-- SCV000698912 +-- SCV000052842 +-- SCV003928446 +-- SCV000611086 +-- SCV001713960 +-- SCV003928548 + +-- 1 in 2023_07_10 +-- id +-- SCV003988617 + +-- 21 in 7/17 +-- id +-- SCV004012859 +-- SCV004012856 +-- SCV004012858 +-- SCV003988617 +-- SCV004012867 +-- SCV004012860 +-- SCV004012866 +-- SCV004012857 +-- SCV004012861 +-- SCV004012986 +-- SCV004012869 +-- SCV004012811 +-- SCV004012864 +-- SCV004012853 +-- SCV004012854 +-- SCV004012985 +-- SCV004012863 +-- SCV004012865 +-- SCV004012862 +-- SCV004012868 +-- SCV004012855 + +-- 2 in 7/22 +-- id +-- SCV003988617 +-- SCV004012811 + +-- 11 in 8/13 +-- id +-- SCV004024269 +-- SCV004024275 +-- SCV004024274 +-- SCV004024272 +-- SCV004024276 +-- SCV004024270 +-- SCV004023390 +-- SCV004024277 +-- SCV004024273 +-- SCV000040235 +-- SCV004024271 + +-- 3 in 8/19 +-- id +-- SCV004025936 +-- SCV004024744 +-- SCV004024569 + +-- 1 in 9/10 +-- id +-- SCV004031470 + + +-- 4 in 9/17 +-- id +-- SCV004035028 +-- SCV004035031 +-- SCV004035029 +-- SCV004031470 + + +-- 470 in the 10/21/2023 dataset? 
+-- id +-- SCV004044476 +-- SCV004045291 +-- SCV004043032 +-- SCV004043335 +-- SCV004045772 +-- SCV004043105 +-- SCV004044698 +-- SCV004044009 +-- SCV004044728 +-- SCV004045490 +-- SCV004043763 +-- SCV004045533 +-- SCV004045410 +-- SCV004043992 +-- SCV004045277 +-- SCV004044054 +-- SCV004045648 +-- SCV004045124 +-- SCV004042979 +-- SCV004044768 +-- SCV004042879 +-- SCV004043578 +-- SCV004045569 +-- SCV004043864 +-- SCV004044757 +-- SCV004044814 +-- SCV004043629 +-- SCV004044785 +-- SCV004045675 +-- SCV004045760 +-- SCV004043146 +-- SCV004044642 +-- SCV004043151 +-- SCV004044905 +-- SCV004044860 +-- SCV004043270 +-- SCV004044781 +-- SCV004044150 +-- SCV004043285 +-- SCV004045666 +-- SCV004044193 +-- SCV004045726 +-- SCV004044772 +-- SCV004044834 +-- SCV004043373 +-- SCV004043244 +-- SCV004043492 +-- SCV004045570 +-- SCV004044804 +-- SCV004044704 +-- SCV004045562 +-- SCV004045741 +-- SCV004045505 +-- SCV004045070 +-- SCV004045579 +-- SCV004045507 +-- SCV004044872 +-- SCV004045427 +-- SCV004043830 +-- SCV004044913 +-- SCV004045610 +-- SCV004044007 +-- SCV004043381 +-- SCV004045555 +-- SCV004045031 +-- SCV004045724 +-- SCV004045338 +-- SCV004045414 +-- SCV004045035 +-- SCV004044551 +-- SCV004044697 +-- SCV004045639 +-- SCV004043758 +-- SCV004045705 +-- SCV004043746 +-- SCV004044584 +-- SCV004045316 +-- SCV004043589 +-- SCV004045506 +-- SCV004043112 +-- SCV004044836 +-- SCV004042878 +-- SCV004042857 +-- SCV004043811 +-- SCV004045246 +-- SCV004043140 +-- SCV004043116 +-- SCV004044998 +-- SCV004044549 +-- SCV004044526 +-- SCV004043106 +-- SCV004045518 +-- SCV004043230 +-- SCV004044747 +-- SCV004044774 +-- SCV004043894 +-- SCV004043064 +-- SCV004045574 +-- SCV004044700 +-- SCV004044570 +-- SCV004045220 +-- SCV004043625 +-- SCV004045276 +-- SCV004045534 +-- SCV004044904 +-- SCV004043428 +-- SCV004043548 +-- SCV004043131 +-- SCV004044705 +-- SCV004044815 +-- SCV004044588 +-- SCV004044710 +-- SCV004045301 +-- SCV004043976 +-- SCV004043503 +-- SCV004044787 +-- SCV004044888 +-- 
SCV004044962 +-- SCV004044399 +-- SCV004044647 +-- SCV004044230 +-- SCV004044917 +-- SCV004045723 +-- SCV004043039 +-- SCV004044857 +-- SCV004043829 +-- SCV004044824 +-- SCV004044842 +-- SCV004045252 +-- SCV004043245 +-- SCV004045617 +-- SCV004044026 +-- SCV004044716 +-- SCV004043968 +-- SCV004044720 +-- SCV004045532 +-- SCV004044692 +-- SCV004044758 +-- SCV004045315 +-- SCV004045481 +-- SCV004044676 +-- SCV004043418 +-- SCV004043142 +-- SCV004043156 +-- SCV004043390 +-- SCV004045052 +-- SCV004043717 +-- SCV004044044 +-- SCV004043356 +-- SCV004044696 +-- SCV004043651 +-- SCV004044899 +-- SCV004043157 +-- SCV004045710 +-- SCV004045657 +-- SCV004044733 +-- SCV004045717 +-- SCV004045719 +-- SCV004045685 +-- SCV004045216 +-- SCV004044994 +-- SCV004045558 +-- SCV004044176 +-- SCV004044823 +-- SCV004044856 +-- SCV004043456 +-- SCV004045767 +-- SCV004043269 +-- SCV004044718 +-- SCV004044734 +-- SCV004045152 +-- SCV004043838 +-- SCV004044829 +-- SCV004043264 +-- SCV004044564 +-- SCV004042922 +-- SCV004045063 +-- SCV004043935 +-- SCV004044410 +-- SCV004045753 +-- SCV004043555 +-- SCV004043231 +-- SCV004045509 +-- SCV004044699 +-- SCV004044896 +-- SCV004043154 +-- SCV004045720 +-- SCV004043135 +-- SCV004045653 +-- SCV004045540 +-- SCV004045700 +-- SCV004044684 +-- SCV004045715 +-- SCV004043248 +-- SCV004045530 +-- SCV004043278 +-- SCV004044865 +-- SCV004044012 +-- SCV004045620 +-- SCV004044607 +-- SCV004044800 +-- SCV004044729 +-- SCV004045718 +-- SCV004045528 +-- SCV004044640 +-- SCV004044825 +-- SCV004044893 +-- SCV004045744 +-- SCV004045526 +-- SCV004043960 +-- SCV004044648 +-- SCV004044881 +-- SCV004045638 +-- SCV004044229 +-- SCV004044668 +-- SCV004044709 +-- SCV004044687 +-- SCV004044792 +-- SCV004045563 +-- SCV004045551 +-- SCV004044282 +-- SCV004043435 +-- SCV004045628 +-- SCV004043983 +-- SCV004045667 +-- SCV004042986 +-- SCV004045240 +-- SCV004043834 +-- SCV004045021 +-- SCV004045676 +-- SCV004045082 +-- SCV004043511 +-- SCV004042852 +-- SCV004045598 +-- 
SCV004045764 +-- SCV004045592 +-- SCV004045673 +-- SCV004045426 +-- SCV004045629 +-- SCV004044848 +-- SCV004043655 +-- SCV004043229 +-- SCV004045024 +-- SCV004042951 +-- SCV004045687 +-- SCV004043132 +-- SCV004044864 +-- SCV004043309 +-- SCV004045704 +-- SCV004045051 +-- SCV004045585 +-- SCV004045612 +-- SCV004045651 +-- SCV004044858 +-- SCV004043576 +-- SCV004043639 +-- SCV004042660 +-- SCV004044466 +-- SCV004044887 +-- SCV004043302 +-- SCV004044730 +-- SCV004044001 +-- SCV004045538 +-- SCV004044755 +-- SCV004044043 +-- SCV004043144 +-- SCV004044688 +-- SCV004045611 +-- SCV004043240 +-- SCV004044701 +-- SCV004045699 +-- SCV004045545 +-- SCV004045221 +-- SCV004045742 +-- SCV004043167 +-- SCV004045023 +-- SCV004045631 +-- SCV004045713 +-- SCV004042901 +-- SCV004043277 +-- SCV004044011 +-- SCV004043979 +-- SCV004044529 +-- SCV004045248 +-- SCV004045677 +-- SCV004044372 +-- SCV004043217 +-- SCV004045451 +-- SCV004043110 +-- SCV004043977 +-- SCV004044765 +-- SCV004045519 +-- SCV004043332 +-- SCV004045484 +-- SCV004043228 +-- SCV004043550 +-- SCV004045691 +-- SCV004043666 +-- SCV004044259 +-- SCV004045501 +-- SCV004044844 +-- SCV004045571 +-- SCV004044907 +-- SCV004043377 +-- SCV004045573 +-- SCV004044507 +-- SCV004044450 +-- SCV004045076 +-- SCV004045388 +-- SCV004045336 +-- SCV004045329 +-- SCV004045679 +-- SCV004044791 +-- SCV004043145 +-- SCV004044651 +-- SCV004044813 +-- SCV004044991 +-- SCV004044732 +-- SCV004044348 +-- SCV004045578 +-- SCV004045421 +-- SCV004044315 +-- SCV004044898 +-- SCV004044953 +-- SCV004043002 +-- SCV004043234 +-- SCV004044885 +-- SCV004043545 +-- SCV004042891 +-- SCV004045706 +-- SCV004045206 +-- SCV004043380 +-- SCV004044100 +-- SCV004044886 +-- SCV004044803 +-- SCV004045511 +-- SCV004043831 +-- SCV004045655 +-- SCV004044922 +-- SCV004045561 +-- SCV004043962 +-- SCV004043163 +-- SCV004043886 +-- SCV004044845 +-- SCV004044268 +-- SCV004045619 +-- SCV004045523 +-- SCV004045546 +-- SCV004043993 +-- SCV004045588 +-- SCV004043195 +-- 
SCV004045529 +-- SCV004044703 +-- SCV004045162 +-- SCV004043924 +-- SCV004044793 +-- SCV004044707 +-- SCV004043008 +-- SCV004045584 +-- SCV004044763 +-- SCV004043267 +-- SCV004044717 +-- SCV004043227 +-- SCV004043075 +-- SCV004044673 +-- SCV004045608 +-- SCV004043409 +-- SCV004044988 +-- SCV004043162 +-- SCV004043984 +-- SCV004045642 +-- SCV004045504 +-- SCV004044769 +-- SCV004045457 +-- SCV004043826 +-- SCV004044407 +-- SCV004043284 +-- SCV004045697 +-- SCV004044912 +-- SCV004044218 +-- SCV004043640 +-- SCV004045493 +-- SCV004044902 +-- SCV004045774 +-- SCV004045496 +-- SCV004044650 +-- SCV004045061 +-- SCV004045170 +-- SCV004043159 +-- SCV004045773 +-- SCV004045708 +-- SCV004043133 +-- SCV004045541 +-- SCV004045485 +-- SCV004045348 +-- SCV004044840 +-- SCV004045495 +-- SCV004044706 +-- SCV004044042 +-- SCV004044681 +-- SCV004044072 +-- SCV004044492 +-- SCV004043932 +-- SCV004043728 +-- SCV004045225 +-- SCV004042947 +-- SCV004043741 +-- SCV004044658 +-- SCV004043183 +-- SCV004045692 +-- SCV004043378 +-- SCV004042915 +-- SCV004045559 +-- SCV004043353 +-- SCV004045671 +-- SCV004045313 +-- SCV004043693 +-- SCV004045761 +-- SCV004044014 +-- SCV004045624 +-- SCV004044874 +-- SCV004045622 +-- SCV004043138 +-- SCV004045525 +-- SCV004045640 +-- SCV004045527 +-- SCV004045702 +-- SCV004043804 +-- SCV004045487 +-- SCV004044039 +-- SCV004044429 +-- SCV004043752 +-- SCV004044894 +-- SCV004044773 +-- SCV004043444 +-- SCV004045215 +-- SCV004042999 +-- SCV004043630 +-- SCV004043099 +-- SCV004043562 +-- SCV004045768 +-- SCV004042975 +-- SCV004044691 +-- SCV004045245 +-- SCV004043253 +-- SCV004042892 +-- SCV004044666 +-- SCV004044464 +-- SCV004044807 +-- SCV004045331 +-- SCV004044494 +-- SCV004043395 +-- SCV004045636 +-- SCV004042985 +-- SCV004044906 +-- SCV004044849 +-- SCV004043394 +-- SCV004043165 +-- SCV004045633 +-- SCV004045729 +-- SCV004044016 +-- SCV004044679 +-- SCV004043674 +-- SCV004045601 +-- SCV004043430 +-- SCV004043966 +-- SCV004045486 +-- SCV004045694 +-- 
SCV004044675 +-- SCV004045698 + + +-- 1 in 11/04/2023 +-- id +-- SCV004100726 + +-- 21 in 11/12/2023 +-- id +-- SCV003781255 +-- SCV001633134 +-- SCV003791275 +-- SCV001574949 +-- SCV001681777 +-- SCV001215435 +-- SCV003786421 +-- SCV001676762 +-- SCV002440679 +-- SCV002184227 +-- SCV001542497 +-- SCV003791273 +-- SCV001535533 +-- SCV002257692 +-- SCV001556049 +-- SCV003791274 +-- SCV001536277 +-- SCV003791277 +-- SCV001606778 +-- SCV004100726 +-- SCV002364382 + +-- 22 in 11/21/2023 +-- id +-- SCV002364382 +-- SCV003786421 +-- SCV002440679 +-- SCV001536277 +-- SCV002257692 +-- SCV001535533 +-- SCV001542497 +-- SCV001606778 +-- SCV002184227 +-- SCV001556049 +-- SCV003791275 +-- SCV001574949 +-- SCV001633134 +-- SCV001215435 +-- SCV004102701 +-- SCV003781255 +-- SCV001681777 +-- SCV003791277 +-- SCV003791273 +-- SCV001676762 +-- SCV004100726 +-- SCV003791274 + +-- 23 in 11/26/2023 +-- id +-- SCV001536277 +-- SCV003781255 +-- SCV001633134 +-- SCV004171042 +-- SCV002440679 +-- SCV003791274 +-- SCV002184227 +-- SCV004102701 +-- SCV002257692 +-- SCV001542497 +-- SCV001556049 +-- SCV001535533 +-- SCV004100726 +-- SCV003791275 +-- SCV002364382 +-- SCV001574949 +-- SCV003791277 +-- SCV001606778 +-- SCV001676762 +-- SCV001215435 +-- SCV001681777 +-- SCV003791273 +-- SCV003786421 + +-- 3 in 12/17/2023 +-- id +-- SCV004176760 +-- SCV004175904 +-- SCV004176758 + +-- 1 in 01/07/2024 +-- id +-- SCV004223867 \ No newline at end of file diff --git a/test/data/rcv-csv-load.sh b/scripts/dataset-preparation/old-xml-rcv-data-preparation/rcv-csv-load.sh similarity index 100% rename from test/data/rcv-csv-load.sh rename to scripts/dataset-preparation/old-xml-rcv-data-preparation/rcv-csv-load.sh diff --git a/test/data/rcv-old-test.xml b/scripts/dataset-preparation/old-xml-rcv-data-preparation/rcv-old-test.xml similarity index 100% rename from test/data/rcv-old-test.xml rename to scripts/dataset-preparation/old-xml-rcv-data-preparation/rcv-old-test.xml diff --git 
a/test/data/splitLargeFile.py b/scripts/dataset-preparation/old-xml-rcv-data-preparation/splitLargeFile.py similarity index 100% rename from test/data/splitLargeFile.py rename to scripts/dataset-preparation/old-xml-rcv-data-preparation/splitLargeFile.py diff --git a/scripts/temporal-data-collection/temporal-data-collection-proc.sql b/scripts/temporal-data-collection/temporal-data-collection-proc.sql index a6de1e0..2c056aa 100644 --- a/scripts/temporal-data-collection/temporal-data-collection-proc.sql +++ b/scripts/temporal-data-collection/temporal-data-collection-proc.sql @@ -44,8 +44,11 @@ BEGIN CALL `clinvar_ingest.clinvar_vcvs`(rec.schema_name, rec.release_date, rec.prev_release_date, single_call_result); SET all_processed_results = ARRAY_CONCAT(all_processed_results, [single_call_result]); + CALL `clinvar_ingest.clinvar_vcv_classifications`(rec.schema_name, rec.release_date, rec.prev_release_date, single_call_result); + SET all_processed_results = ARRAY_CONCAT(all_processed_results, [single_call_result]); + CALL `clinvar_ingest.clinvar_scvs`(rec.schema_name, rec.release_date, rec.prev_release_date, single_call_result); - SET all_processed_results = ARRAY_CONCAT(all_processed_results, [single_call_processed_results]); + SET all_processed_results = ARRAY_CONCAT(all_processed_results, [single_call_result]); CALL `clinvar_ingest.clinvar_gc_scvs`(rec.schema_name, rec.release_date, rec.prev_release_date, single_call_result); SET all_processed_results = ARRAY_CONCAT(all_processed_results, [single_call_result]); diff --git a/scripts/temporal-data-summation/03-voi-and-voi-scv-group-proc.sql b/scripts/temporal-data-summation/03-voi-and-voi-scv-group-proc.sql index 6bbb034..4cb9d34 100644 --- a/scripts/temporal-data-summation/03-voi-and-voi-scv-group-proc.sql +++ b/scripts/temporal-data-summation/03-voi-and-voi-scv-group-proc.sql @@ -236,7 +236,7 @@ BEGIN vs.classification_abbrev, vg.sig_type[OFFSET(vs.clinsig_type)].percent*100, vs.full_scv_id) as scv_label, - CASE 
vs.rpt_stmt_type + CASE vs.gks_proposition_type WHEN 'path' THEN CASE vs.clinsig_type WHEN 2 THEN '1-PLP' diff --git a/scripts/temporal-data-summation/04-voi-group-change-proc.sql b/scripts/temporal-data-summation/04-voi-group-change-proc.sql index 05ad0e8..6493872 100644 --- a/scripts/temporal-data-summation/04-voi-group-change-proc.sql +++ b/scripts/temporal-data-summation/04-voi-group-change-proc.sql @@ -127,7 +127,10 @@ BEGIN row_number () OVER ( ORDER BY st.variation_id, - st.rpt_stmt_type, + st.statement_type, + st.gks_proposition_type, + st.clinical_impact_assertion_type, + st.clinical_impact_clinical_significance, st.rank, st.start_release_date ASC NULLS FIRST ) as rownum @@ -144,9 +147,12 @@ BEGIN UNION DISTINCT SELECT MIN(r.release_date) as start_release_date, - variation_id, - rpt_stmt_type, - rank + vg.variation_id, + vg.statement_type, + vg.gks_proposition_type, + vg.clinical_impact_assertion_type, + vg.clinical_impact_clinical_significance, + vg.rank FROM `clinvar_ingest.voi_group` vg LEFT JOIN `clinvar_ingest.clinvar_releases` r ON @@ -185,23 +191,23 @@ BEGIN ) as rownum FROM ( SELECT - end_release_date, - variation_id, - statement_type, - gks_proposition_type, - clinical_impact_assertion_type, - clinical_impact_clinical_significance, - rank + vg.end_release_date, + vg.variation_id, + vg.statement_type, + vg.gks_proposition_type, + vg.clinical_impact_assertion_type, + vg.clinical_impact_clinical_significance, + vg.rank FROM `clinvar_ingest.voi_group` vg UNION DISTINCT SELECT MAX(r.release_date) as end_release_date, - variation_id, - statement_type, - gks_proposition_type, - clinical_impact_assertion_type, - clinical_impact_clinical_significance, - rank + vg.variation_id, + vg.statement_type, + vg.gks_proposition_type, + vg.clinical_impact_assertion_type, + vg.clinical_impact_clinical_significance, + vg.rank FROM `clinvar_ingest.voi_group` vg LEFT JOIN `clinvar_ingest.clinvar_releases` r ON diff --git a/test/data/parseRCV.py b/test/data/parseRCV.py 
deleted file mode 100644 index 2594635..0000000 --- a/test/data/parseRCV.py +++ /dev/null @@ -1,32 +0,0 @@ -import xml.etree.ElementTree as ET -import pandas as pd -import os - -# Load the XML file -xml_file = 'rcv-old-test.xml' - -if not os.path.exists(xml_file): - raise FileNotFoundError(f"The file '{xml_file}' does not exist. Please check the file path and try again.") - -# Parse the XML file -tree = ET.parse(xml_file) -root = tree.getroot() - -# Namespace if needed (update if the namespace changes) -namespace = {'xsi': 'http://www.w3.org/2001/XMLSchema-instance'} - -# Extract ClinVarSet nodes -records = [] -for clinvarset in root.findall('ClinVarSet', namespace): - record_id = clinvarset.attrib.get('ID') - record_content = ET.tostring(clinvarset, encoding='unicode', method='xml') - records.append({'id': record_id, 'content': record_content}) - -0# Convert to DataFrame -clinvar_df = pd.DataFrame(records) - -# Save to CSV for BigQuery upload -output_file = 'clinvarset_records.csv' -clinvar_df.to_csv(output_file, index=False, encoding='utf-8') - -print(f"Extracted {len(records)} records to {output_file}.") \ No newline at end of file