-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4506 from broadinstitute/vlm-match-endpoint
Vlm match endpoint
- Loading branch information
Showing
23 changed files
with
320 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file modified
BIN
+0 Bytes
(100%)
hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.README.txt.crc
Binary file not shown.
Binary file modified
BIN
+0 Bytes
(100%)
hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.metadata.json.gz.crc
Binary file not shown.
4 changes: 2 additions & 2 deletions
4
hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/README.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
This folder comprises a Hail (www.hail.is) native Table or MatrixTable. | ||
Written with version 0.2.133-4c60fddb171a | ||
Created at 2024/12/04 13:07:33 | ||
Written with version 0.2.128-eead8100a1c1 | ||
Created at 2024/12/05 16:23:46 |
Binary file removed
BIN
-12 Bytes
...SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.index.crc
Binary file not shown.
Binary file removed
BIN
-12 Bytes
...nnotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.metadata.json.gz.crc
Binary file not shown.
Binary file removed
BIN
-65 Bytes
...Ch37/SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/index
Binary file not shown.
Binary file removed
BIN
-185 Bytes
...DEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/metadata.json.gz
Binary file not shown.
Binary file added
BIN
+12 Bytes
...SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.index.crc
Binary file not shown.
Binary file added
BIN
+12 Bytes
...nnotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.metadata.json.gz.crc
Binary file not shown.
Binary file added
BIN
+80 Bytes
...Ch37/SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/index
Binary file not shown.
Binary file added
BIN
+183 Bytes
...DEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/metadata.json.gz
Binary file not shown.
Binary file modified
BIN
+8 Bytes
(100%)
hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/metadata.json.gz
Binary file not shown.
Binary file modified
BIN
+0 Bytes
(100%)
hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc
Binary file not shown.
Binary file modified
BIN
+25 Bytes
(100%)
hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/metadata.json.gz
Binary file not shown.
Binary file removed
BIN
-12 Bytes
...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-32c79149-c8dd-4b54-8db9-097b88a68456.crc
Binary file not shown.
Binary file added
BIN
+12 Bytes
...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.crc
Binary file not shown.
Binary file renamed
BIN
+342 Bytes
...rt-0-32c79149-c8dd-4b54-8db9-097b88a68456 → ...rt-0-8dc59bd2-da29-46c0-badd-a77a850af2d4
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
from aiohttp.web import HTTPBadRequest | ||
import hail as hl | ||
import os | ||
|
||
# Deployment configuration, read from the environment once at import time.
# Each value is None when the corresponding variable is unset.
VLM_DATA_DIR = os.getenv('VLM_DATA_DIR')    # root directory of the per-build annotation tables
SEQR_BASE_URL = os.getenv('SEQR_BASE_URL')  # prefix of the handover URL returned to callers
NODE_ID = os.getenv('NODE_ID')              # identifier of this node in handover and result-set ids

# Handover descriptor attached to every match response.
BEACON_HANDOVER_TYPE = {'id': NODE_ID, 'label': f'{NODE_ID} browser'}

# Static "meta" section included in every match response.
BEACON_META = {
    'apiVersion': 'v1.0',
    'beaconId': 'com.gnx.beacon.v2',
    'returnedSchemas': [
        {'entityType': 'genomicVariant', 'schema': 'ga4gh-beacon-variant-v2.0.0'},
    ],
}

# Query parameters that must all be present on a match request.
QUERY_PARAMS = ['assemblyId', 'referenceName', 'start', 'referenceBases', 'alternateBases']

GENOME_VERSION_GRCh38 = 'GRCh38'
GENOME_VERSION_GRCh37 = 'GRCh37'

# Accepted assemblyId values mapped to the canonical genome build name.
ASSEMBLY_LOOKUP = {
    GENOME_VERSION_GRCh37: GENOME_VERSION_GRCh37,
    GENOME_VERSION_GRCh38: GENOME_VERSION_GRCh38,
    'hg38': GENOME_VERSION_GRCh38,
    'hg19': GENOME_VERSION_GRCh37,
}
|
||
def get_variant_match(query: dict) -> dict:
    """Answer a variant match request with combined counts from both genome builds.

    The variant is looked up in the requested build and, after lifting the
    locus over, in the other build as well. Counts from the two lookups are
    summed. The handover URL points at the requested build unless only the
    lifted-over lookup found the variant, in which case it points at the
    lifted-over coordinates and build.

    Args:
        query: Request query parameters; validated by ``_parse_match_query``.

    Returns:
        The response dictionary built by ``_format_results``.

    Raises:
        HTTPBadRequest: Propagated from ``_parse_match_query`` on invalid input.
    """
    chrom, pos, ref, alt, genome_build = _parse_match_query(query)
    locus = hl.locus(chrom, pos, reference_genome=genome_build)
    ac, hom = _get_variant_counts(locus, ref, alt, genome_build)

    # Repeat the lookup in whichever build was not requested.
    if genome_build == GENOME_VERSION_GRCh37:
        other_build = GENOME_VERSION_GRCh38
    else:
        other_build = GENOME_VERSION_GRCh37
    other_locus = hl.liftover(locus, other_build)
    lift_ac, lift_hom = _get_variant_counts(other_locus, ref, alt, other_build)

    # Link to the lifted-over variant only when it is the sole place the variant was seen.
    if lift_ac and not ac:
        lifted = hl.eval(other_locus)
        chrom, pos, genome_build = lifted.contig, lifted.position, other_build

    # The lookup URL takes the bare build number ("37"/"38").
    genome_build = genome_build.replace('GRCh', '')
    url = f'{SEQR_BASE_URL}summary_data/variant_lookup?genomeVersion={genome_build}&variantId={chrom}-{pos}-{ref}-{alt}'

    return _format_results(ac + lift_ac, hom + lift_hom, url)
|
||
|
||
def _parse_match_query(query: dict) -> tuple[str, int, str, str, str]:
    """Validate a match request and extract the variant coordinates.

    Args:
        query: Request query parameters. Must contain every key listed in
            ``QUERY_PARAMS``; values are expected to be strings.

    Returns:
        ``(chrom, start, ref, alt, genome_build)`` where ``chrom`` is
        normalised to the contig naming of ``genome_build`` ("chr" prefix
        for GRCh38, none for GRCh37) and ``start`` is an ``int``.

    Raises:
        HTTPBadRequest: If a required parameter is missing, the assembly is
            unknown, the contig is not valid for the build, or ``start`` is
            not a valid position on that contig.
    """
    missing_params = [key for key in QUERY_PARAMS if key not in query]
    if missing_params:
        raise HTTPBadRequest(reason=f'Missing required parameters: {", ".join(missing_params)}')

    genome_build = ASSEMBLY_LOOKUP.get(query['assemblyId'])
    if not genome_build:
        raise HTTPBadRequest(reason=f'Invalid assemblyId: {query["assemblyId"]}')

    # Strip any "chr" from the requested name, then re-add the prefix for GRCh38 only.
    chrom = query['referenceName'].replace('chr', '')
    if genome_build == GENOME_VERSION_GRCh38:
        chrom = f'chr{chrom}'
    if not hl.eval(hl.is_valid_contig(chrom, reference_genome=genome_build)):
        raise HTTPBadRequest(reason=f'Invalid referenceName: {query["referenceName"]}')

    start = query['start']
    # isdecimal() rather than isnumeric(): the latter also accepts characters
    # such as '²' or '½' that int() rejects, which would surface as an
    # unhandled ValueError instead of a 400 response.
    if not start.isdecimal():
        raise HTTPBadRequest(reason=f'Invalid start: {start}')
    start = int(start)
    # Hail locus positions are 32-bit integers, so anything larger cannot be a
    # valid locus; reject it here rather than passing it to Hail.
    max_position = 2 ** 31 - 1
    if start > max_position or not hl.eval(hl.is_valid_locus(chrom, start, reference_genome=genome_build)):
        raise HTTPBadRequest(reason=f'Invalid start: {start}')

    return chrom, start, query['referenceBases'], query['alternateBases'], genome_build
|
||
|
||
def _get_variant_counts(locus: hl.LocusExpression, ref: str, alt: str, genome_build: str) -> tuple[int, int]:
    """Fetch the allele count and homozygote count for a single variant.

    Reads the SNV/indel annotations table for ``genome_build`` under
    ``VLM_DATA_DIR``, restricted to the single-position interval at ``locus``,
    and keeps only rows whose alleles are exactly ``[ref, alt]``.

    Args:
        locus: Locus expression for the variant position.
        ref: Reference allele.
        alt: Alternate allele.
        genome_build: Genome build whose table should be read.

    Returns:
        ``(AC, hom)`` taken from the ``gt_stats`` field of the first matching
        row, or ``(0, 0)`` when no row matches.
    """
    # Closed interval [locus, locus] so the read is limited to this one position.
    interval = hl.eval(hl.interval(locus, locus, includes_start=True, includes_end=True))
    ht = hl.read_table(
        f'{VLM_DATA_DIR}/{genome_build}/SNV_INDEL/annotations.ht', _intervals=[interval], _filter_intervals=True,
    )
    ht = ht.filter(ht.alleles == hl.array([ref, alt]))

    counts = ht.aggregate(hl.agg.take(ht.gt_stats, 1))
    if not counts:
        return 0, 0
    return counts[0].AC, counts[0].hom
|
||
|
||
def _format_results(ac: int, hom: int, url: str) -> dict:
    """Build the match response from an allele count and a homozygote count.

    Args:
        ac: Total allele count for the variant.
        hom: Number of homozygotes.
        url: Handover URL to include in the response.

    Returns:
        A dictionary with ``beaconHandovers``, ``meta``, ``responseSummary``
        and ``response`` sections. ``response.resultSets`` holds a Homozygous
        and a Heterozygous entry when ``ac`` is non-zero and is empty otherwise.
    """
    # Each homozygote contributes two alleles to AC, so subtracting hom once
    # gives the summary total, and subtracting it again gives the heterozygotes.
    total = ac - hom

    labelled_counts = []
    if ac:
        labelled_counts.append(('Homozygous', hom))
        labelled_counts.append(('Heterozygous', total - hom))

    result_sets = []
    for label, count in labelled_counts:
        result_sets.append({
            'exists': True,
            'id': f'{NODE_ID} {label}',
            'results': [],
            'resultsCount': count,
            'setType': 'genomicVariant'
        })

    handover = {
        'handoverType': BEACON_HANDOVER_TYPE,
        'url': url,
    }
    summary = {
        'exists': bool(ac),
        'total': total
    }
    return {
        'beaconHandovers': [handover],
        'meta': BEACON_META,
        'responseSummary': summary,
        'response': {'resultSets': result_sets},
    }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters