Skip to content

Commit

Permalink
started adding clinvar-curator scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
larrybabb committed Oct 7, 2024
1 parent 0e282c9 commit 278ac30
Show file tree
Hide file tree
Showing 9 changed files with 636 additions and 0 deletions.
189 changes: 189 additions & 0 deletions scripts/clinvar-curartion/cvc-annotations-as-of-func.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
-- cvc_annotations_as_of
--
-- Returns the curator annotations produced by
-- `clinvar_curator.cvc_baseline_annotations_as_of`, enriched with:
--   * the SCV record that matches the annotated scv id + version + variation
--     for the annotation's release_date,
--   * the submitter name/abbreviation valid for that release_date,
--   * the "latest" SCV and VCV rows (highest end_release_date among rows whose
--     start_release_date is on or before the annotation's release_date),
--   * derived flags telling whether the annotated SCV/VCV is outdated, deleted
--     or moved to another variation.
--
-- Parameters (all three are forwarded unchanged to the baseline function):
--   as_of_date        DATE
--   include_finalized BOOL
--   only_latest       BOOL
CREATE OR REPLACE TABLE FUNCTION `clinvar_curator.cvc_annotations_as_of`(as_of_date DATE, include_finalized BOOL, only_latest BOOL) AS (
  WITH
  -- Baseline annotation rows, one explicit column list so the dependency on
  -- the baseline function's output shape is visible here.
  baseline AS (
    SELECT
      as_of_date,
      release_date,
      annotation_id,
      -- variant and vcv
      variation_id, vcv_axn, vcv_id, vcv_ver,
      -- scv
      scv_id, scv_ver,
      -- annotation assessment record
      curator, annotated_on, annotated_date, annotated_time_utc,
      action, reason, notes, submitter_id, annotation_label, is_latest,
      -- finalized review info
      finalized_review_id, finalized_reviewer,
      finalized_review_status, finalized_review_notes,
      -- finalized submission batch info
      finalized_submission_batch_id, finalized_review_label, finalized_review_count,
      -- prior review data
      has_prior_scv_id_annotation, has_prior_scv_ver_annotation,
      has_prior_finalized_submission_batch_id, prior_scv_annotations
    FROM `clinvar_curator.cvc_baseline_annotations_as_of`(as_of_date, include_finalized, only_latest)
  ),

  -- For every annotated scv id: the greatest end_release_date among the SCV
  -- rows that had already started by the annotation's release_date.
  scv_latest_end AS (
    SELECT
      scv.id,
      MAX(scv.end_release_date) AS max_end_release_date
    FROM baseline AS ann
    INNER JOIN `clinvar_ingest.clinvar_scvs` AS scv
      ON scv.id = ann.scv_id
    WHERE scv.start_release_date <= ann.release_date
    GROUP BY scv.id
  ),

  -- The SCV row(s) carrying that greatest end_release_date.
  latest_scv AS (
    SELECT
      sle.id,
      scv.version,
      scv.variation_id,
      scv.start_release_date,
      scv.end_release_date,
      scv.deleted_release_date,
      scv.classif_type,
      scv.rank
    FROM scv_latest_end AS sle
    INNER JOIN `clinvar_ingest.clinvar_scvs` AS scv
      ON scv.id = sle.id
      AND scv.end_release_date = sle.max_end_release_date
  ),

  -- Same as scv_latest_end, but for the annotated vcv id.
  vcv_latest_end AS (
    SELECT
      vcv.id,
      MAX(vcv.end_release_date) AS max_end_release_date
    FROM baseline AS ann
    INNER JOIN `clinvar_ingest.clinvar_vcvs` AS vcv
      ON vcv.id = ann.vcv_id
    WHERE vcv.start_release_date <= ann.release_date
    GROUP BY vcv.id
  ),

  -- The VCV row(s) carrying that greatest end_release_date.
  latest_vcv AS (
    SELECT
      vle.id,
      vcv.version,
      vcv.variation_id,
      vcv.start_release_date,
      vcv.end_release_date,
      vcv.deleted_release_date,
      vcv.agg_classification,
      vcv.rank
    FROM vcv_latest_end AS vle
    INNER JOIN `clinvar_ingest.clinvar_vcvs` AS vcv
      ON vcv.id = vle.id
      AND vcv.end_release_date = vle.max_end_release_date
  )

  SELECT
    as_of_date,
    ann.release_date,
    ann.annotation_id,

    -- variant and vcv
    ann.variation_id,
    ann.vcv_axn,
    ann.vcv_id,
    ann.vcv_ver,

    -- scv
    ann.scv_id,
    ann.scv_ver,

    -- annotation assessment record
    ann.curator,
    ann.annotated_on,
    ann.annotated_date,
    ann.annotated_time_utc,
    ann.action,
    ann.reason,
    ann.notes,

    -- assertion data of the exact scv id + version that was annotated
    scv.rpt_stmt_type,
    scv.rank,
    scv.classif_type,
    scv.clinsig_type,

    -- submitter recorded on the annotation, with its name for that release
    ann.submitter_id,
    sub.current_name AS submitter_name,
    sub.current_abbrev AS submitter_abbrev,
    ann.annotation_label,
    ann.finalized_review_label,

    ann.has_prior_scv_id_annotation,
    ann.has_prior_scv_ver_annotation,
    ann.has_prior_finalized_submission_batch_id,
    ann.prior_scv_annotations,

    -- TRUE when no newer annotation exists for the same scv id
    ann.is_latest AS is_latest_annotation,

    -- version, release date, rank and classification of the latest scv row
    latest_scv.version AS latest_scv_ver,
    latest_scv.start_release_date AS latest_scv_release_date,
    latest_scv.rank AS latest_scv_rank,
    latest_scv.classif_type AS latest_scv_classification,

    -- version and release date of the latest vcv row
    latest_vcv.version AS latest_vcv_ver,
    latest_vcv.start_release_date AS latest_vcv_release_date,

    -- annotation is stale for the scv: newer version exists, or the scv now
    -- sits under a different variation
    (latest_scv.variation_id <> ann.variation_id OR latest_scv.version > ann.scv_ver) AS is_outdated_scv,

    -- annotation is stale for the vcv: newer version exists
    (latest_vcv.version > ann.vcv_ver) AS is_outdated_vcv,

    -- the latest scv row carries a deletion release on or before this release
    (latest_scv.deleted_release_date IS NOT NULL AND latest_scv.deleted_release_date <= ann.release_date) AS is_deleted_scv,
    -- deletion release recorded on the latest scv row (NULL when not deleted)
    latest_scv.deleted_release_date AS deleted_scv_release_date,

    -- the latest scv row belongs to a different variation than the annotated one
    (latest_scv.variation_id <> ann.variation_id) AS is_moved_scv

  FROM baseline AS ann
  -- LEFT (not INNER) joins on purpose: an errant annotation row that does not
  -- line up with a real scv or submitter must stay visible, so the scv.* and
  -- sub.* columns may legitimately come back NULL.
  LEFT JOIN `clinvar_ingest.clinvar_scvs` AS scv
    ON scv.id = ann.scv_id
    AND scv.version = ann.scv_ver
    AND scv.variation_id = ann.variation_id
    AND ann.release_date >= scv.start_release_date
    AND ann.release_date <= scv.end_release_date
  LEFT JOIN `clinvar_ingest.clinvar_submitters` AS sub
    ON sub.id = ann.submitter_id
    AND ann.release_date >= sub.start_release_date
    AND ann.release_date <= sub.end_release_date
  LEFT JOIN latest_scv
    ON latest_scv.id = ann.scv_id
  LEFT JOIN latest_vcv
    ON latest_vcv.id = ann.vcv_id
);
176 changes: 176 additions & 0 deletions scripts/clinvar-curartion/cvc-baseline-annotations-as-of-func.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
-- cvc_baseline_annotations_as_of
--
-- Returns one row per curator annotation (per finalized review row, when an
-- annotation has reviews) taken from `clinvar_curator.clinvar_annotations`,
-- together with its finalized review/submission info and a summary of earlier
-- annotations made on the same scv id.
--
-- Parameters:
--   as_of_date     DATE  only annotations whose DATE(annotation_date) is on or
--                        before this date are considered; it is also passed to
--                        `clinvar_ingest.schema_on` to obtain release_date.
--   incl_finalized BOOL  when TRUE, annotations that already have a row in
--                        `cvc_clinvar_reviews` are kept; otherwise they are
--                        filtered out.
--   only_latest    BOOL  when TRUE, only the most recent annotation per scv id
--                        (is_latest) is returned.
--
-- Notes:
--   * is_latest is computed over ALL annotations up to as_of_date, before the
--     incl_finalized / only_latest filters are applied.
--   * has_prior_* flags are always TRUE/FALSE (never NULL): annotations with no
--     earlier annotation on the same scv id report FALSE.
CREATE OR REPLACE TABLE FUNCTION `clinvar_curator.cvc_baseline_annotations_as_of`(as_of_date DATE, incl_finalized BOOL, only_latest BOOL) AS (
  WITH
  -- Raw annotation rows, normalized: accession strings split into id/version,
  -- ids cast to STRING, action lower-cased.
  anno_base AS (
    SELECT
      rel.release_date,
      -- annotation timestamp in epoch milliseconds, rendered as a string
      CAST(UNIX_MILLIS(a.annotation_date) AS STRING) AS annotation_id,
      a.vcv_id AS vcv_axn,
      SPLIT(a.scv_id, '.')[OFFSET(0)] AS scv_id,
      CAST(SPLIT(a.scv_id, '.')[OFFSET(1)] AS INT64) AS scv_ver,
      CAST(a.variation_id AS STRING) AS variation_id,
      CAST(a.submitter_id AS STRING) AS submitter_id,
      LOWER(a.action) AS action,
      SPLIT(a.curator_email, '@')[OFFSET(0)] AS curator,
      a.annotation_date AS annotated_on,
      DATE(a.annotation_date) AS annotated_date,
      TIME(a.annotation_date) AS annotated_time_utc,
      a.reason,
      a.notes,
      SPLIT(a.vcv_id, '.')[OFFSET(0)] AS vcv_id,
      CAST(SPLIT(a.vcv_id, '.')[OFFSET(1)] AS INT64) AS vcv_ver,
      -- latest = no annotation for the same scv id follows this one when
      -- ordered by annotation date
      (
        COUNT(a.annotation_date)
          OVER (
            PARTITION BY SPLIT(a.scv_id, '.')[OFFSET(0)]
            ORDER BY a.annotation_date
            ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING
          ) = 0
      ) AS is_latest
    FROM `clinvar_curator.clinvar_annotations` AS a
    -- every annotation row is paired with each row schema_on returns
    -- NOTE(review): presumably schema_on yields a single release row per date - confirm
    CROSS JOIN `clinvar_ingest.schema_on`(as_of_date) AS rel
    WHERE
      DATE(a.annotation_date) <= as_of_date
  ),

  -- Annotation rows plus their display label, computed once so the final
  -- projection and the prior-annotation summary cannot drift apart.
  anno AS (
    SELECT
      b.*,
      FORMAT(
        '%t (%s) %s: %s',
        b.annotated_date,
        IFNULL(b.curator, 'n/a'),
        IF(b.action = 'flagging candidate', 'flag', IF(b.action = 'no change', 'no chg', 'n/a or unk')),
        -- reason truncated to 20 characters, with an ellipsis when longer
        LEFT(IFNULL(b.reason, 'n/a'), 20) || IF(LENGTH(b.reason) > 20, '...', '')
      ) AS annotation_label
    FROM anno_base AS b
  ),

  -- Reviews joined to their annotation and (optionally) submission batch.
  -- One row per distinct (annotation, reviewer, status, notes, batch);
  -- annotation_review_count is the number of identical rows collapsed.
  reviewed_anno AS (
    SELECT
      rev.annotation_id,
      COUNT(*) AS annotation_review_count,
      a.scv_id,
      a.scv_ver,
      a.annotated_date,
      rev.reviewer,
      rev.status,
      rev.notes,
      sbm.batch_id,
      FORMAT(
        '%s (%s) %s%s',
        IFNULL(rev.status, 'n/a'),
        IF(rev.annotation_id IS NULL, NULL, IFNULL(rev.reviewer, 'n/a')),
        IFNULL(FORMAT('*%s*', sbm.batch_id), ''),
        IF(COUNT(*) > 1, FORMAT('-%ix?', COUNT(*)), '')
      ) AS review_label
    FROM `clinvar_curator.cvc_clinvar_reviews` AS rev
    INNER JOIN anno AS a
      ON a.annotation_id = rev.annotation_id
    LEFT JOIN `clinvar_curator.cvc_clinvar_submissions` AS sbm
      ON sbm.annotation_id = rev.annotation_id
    GROUP BY
      rev.annotation_id,
      a.scv_id,
      a.scv_ver,
      a.annotated_date,
      rev.reviewer,
      rev.status,
      rev.notes,
      sbm.batch_id
  ),

  -- Distinct ids of annotations that have at least one review. Used only to
  -- filter; joining reviewed_anno directly would repeat each prior annotation
  -- once per review row of the current annotation.
  finalized_anno_ids AS (
    SELECT DISTINCT annotation_id
    FROM reviewed_anno
  ),

  -- For every annotation that has earlier annotations on the same scv id:
  -- flags describing those priors and a newline-separated, newest-first
  -- summary of them. Annotations without priors get no row here.
  ra_priors AS (
    SELECT
      a.annotation_id,
      (COUNTIF(a.scv_id = prior_a.scv_id) > 0) AS has_prior_scv_id_annotation,
      (COUNTIF(a.scv_ver = prior_a.scv_ver) > 0) AS has_prior_scv_ver_annotation,
      (COUNTIF(prior_ra.batch_id IS NOT NULL) > 0) AS has_prior_finalized_submission_batch_id,
      STRING_AGG(
        FORMAT(
          'v%i %s %s',
          prior_a.scv_ver,
          prior_a.annotation_label,
          IF(prior_ra.review_label IS NOT NULL, FORMAT('[ %s ]', prior_ra.review_label), '')
        ),
        '\n'
        -- order by the full timestamp (not just the date) so that several
        -- priors made on the same day come out in a stable order
        ORDER BY prior_a.annotated_on DESC, prior_ra.review_label
      ) AS prior_scv_annotations
    FROM anno AS a
    LEFT JOIN finalized_anno_ids AS fin
      ON fin.annotation_id = a.annotation_id
    INNER JOIN anno AS prior_a
      ON prior_a.scv_id = a.scv_id
      -- annotation_id is a millisecond epoch stored as STRING, so this is a
      -- lexicographic comparison; it matches chronological order as long as
      -- all ids have the same number of digits
      AND prior_a.annotation_id < a.annotation_id
    LEFT JOIN reviewed_anno AS prior_ra
      ON prior_ra.annotation_id = prior_a.annotation_id
    WHERE
      IF(only_latest, a.is_latest, TRUE)
      AND IF(incl_finalized, TRUE, fin.annotation_id IS NULL)
    GROUP BY
      a.annotation_id
  )

  SELECT
    as_of_date,
    a.release_date,
    a.annotation_id,
    -- variant and vcv
    a.variation_id,
    a.vcv_axn,
    a.vcv_id,
    a.vcv_ver,
    -- scv
    a.scv_id,
    a.scv_ver,
    -- annotation assessment record
    a.curator,
    a.annotated_on,
    a.annotated_date,
    a.annotated_time_utc,
    a.action,
    a.reason,
    a.notes,
    a.submitter_id,
    a.annotation_label,
    a.is_latest,
    -- finalized review info
    ra.annotation_id AS finalized_review_id,
    ra.reviewer AS finalized_reviewer,
    ra.status AS finalized_review_status,
    ra.notes AS finalized_review_notes,
    -- finalized submission batch info
    ra.batch_id AS finalized_submission_batch_id,
    ra.review_label AS finalized_review_label,
    ra.annotation_review_count AS finalized_review_count,
    -- prior review data; no ra_priors row means "no priors", reported as FALSE
    COALESCE(ra_priors.has_prior_scv_id_annotation, FALSE) AS has_prior_scv_id_annotation,
    COALESCE(ra_priors.has_prior_scv_ver_annotation, FALSE) AS has_prior_scv_ver_annotation,
    COALESCE(ra_priors.has_prior_finalized_submission_batch_id, FALSE) AS has_prior_finalized_submission_batch_id,
    ra_priors.prior_scv_annotations
  FROM anno AS a
  LEFT JOIN reviewed_anno AS ra
    ON ra.annotation_id = a.annotation_id
  LEFT JOIN ra_priors
    ON ra_priors.annotation_id = a.annotation_id
  WHERE
    IF(only_latest, a.is_latest, TRUE)
    AND IF(incl_finalized, TRUE, ra.annotation_id IS NULL)
);
Loading

0 comments on commit 278ac30

Please sign in to comment.