-
Notifications
You must be signed in to change notification settings - Fork 89
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Vlm match endpoint #4506
Vlm match endpoint #4506
Changes from all commits
8b9812b
8e0475a
a555f2c
32d7819
6a956e9
cbed1b4
774e905
5f8e84b
8ab5148
fd5c697
110f143
8fa19fb
6f93585
6c69bbf
3273b93
e9aed69
25110a3
1c93e5c
0a33f26
de8bd93
62c9813
4a1395b
69dc350
753d3b4
07a9a1c
ce5dbf0
761e2f7
1c463cb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
This folder comprises a Hail (www.hail.is) native Table or MatrixTable. | ||
Written with version 0.2.133-4c60fddb171a | ||
Created at 2024/12/04 13:07:33 | ||
Written with version 0.2.128-eead8100a1c1 | ||
Created at 2024/12/05 16:23:46 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
from aiohttp.web import HTTPBadRequest | ||
import hail as hl | ||
import os | ||
|
||
# Deployment configuration, injected via environment variables.
VLM_DATA_DIR = os.environ.get('VLM_DATA_DIR')
SEQR_BASE_URL = os.environ.get('SEQR_BASE_URL')
NODE_ID = os.environ.get('NODE_ID')

# Identifies this beacon node in handover links returned to callers.
BEACON_HANDOVER_TYPE = {
    'id': NODE_ID,
    'label': f'{NODE_ID} browser',
}

# Static GA4GH Beacon v2 response metadata.
BEACON_META = {
    'apiVersion': 'v1.0',
    'beaconId': 'com.gnx.beacon.v2',
    'returnedSchemas': [
        {
            'entityType': 'genomicVariant',
            'schema': 'ga4gh-beacon-variant-v2.0.0',
        },
    ],
}

# Query-string parameters every match request must supply.
QUERY_PARAMS = ['assemblyId', 'referenceName', 'start', 'referenceBases', 'alternateBases']

GENOME_VERSION_GRCh38 = 'GRCh38'
GENOME_VERSION_GRCh37 = 'GRCh37'

# Maps caller-supplied assembly ids (UCSC or GRC style) to the GRC build name.
ASSEMBLY_LOOKUP = {
    GENOME_VERSION_GRCh37: GENOME_VERSION_GRCh37,
    GENOME_VERSION_GRCh38: GENOME_VERSION_GRCh38,
    'hg38': GENOME_VERSION_GRCh38,
    'hg19': GENOME_VERSION_GRCh37,
}
|
||
def get_variant_match(query: dict) -> dict:
    """Look up a variant in both genome builds and return a beacon v2 response.

    Counts are queried in the requested build and, via liftover, in the other
    build; both are summed in the response. The handover URL points at the
    build where the variant was actually observed.
    """
    chrom, pos, ref, alt, build = _parse_match_query(query)
    locus = hl.locus(chrom, pos, reference_genome=build)
    ac, hom = _get_variant_counts(locus, ref, alt, build)

    other_build = GENOME_VERSION_GRCh37 if build == GENOME_VERSION_GRCh38 else GENOME_VERSION_GRCh38
    lifted_locus = hl.liftover(locus, other_build)
    lift_ac, lift_hom = _get_variant_counts(lifted_locus, ref, alt, other_build)

    # Only seen in the lifted-over build: link to the variant there instead.
    if lift_ac and not ac:
        lifted = hl.eval(lifted_locus)
        chrom, pos, build = lifted.contig, lifted.position, other_build

    # seqr expects the bare version number ('37'/'38'), not the GRC prefix.
    version = build.replace('GRCh', '')
    url = (
        f'{SEQR_BASE_URL}summary_data/variant_lookup'
        f'?genomeVersion={version}&variantId={chrom}-{pos}-{ref}-{alt}'
    )

    return _format_results(ac + lift_ac, hom + lift_hom, url)
|
||
|
||
def _parse_match_query(query: dict) -> tuple[str, int, str, str, str]:
    """Validate beacon query params and return (chrom, pos, ref, alt, genome_build).

    Raises:
        HTTPBadRequest: if a required parameter is missing or any value is
            invalid for the resolved genome build.
    """
    missing = [param for param in QUERY_PARAMS if param not in query]
    if missing:
        raise HTTPBadRequest(reason=f'Missing required parameters: {", ".join(missing)}')

    genome_build = ASSEMBLY_LOOKUP.get(query['assemblyId'])
    if genome_build is None:
        raise HTTPBadRequest(reason=f'Invalid assemblyId: {query["assemblyId"]}')

    # Normalize the contig name: GRCh38 references use a 'chr' prefix, GRCh37 do not.
    chrom = query['referenceName'].replace('chr', '')
    if genome_build == GENOME_VERSION_GRCh38:
        chrom = f'chr{chrom}'
    if not hl.eval(hl.is_valid_contig(chrom, reference_genome=genome_build)):
        raise HTTPBadRequest(reason=f'Invalid referenceName: {query["referenceName"]}')

    start = query['start']
    if not start.isnumeric():
        raise HTTPBadRequest(reason=f'Invalid start: {start}')
    pos = int(start)
    if not hl.eval(hl.is_valid_locus(chrom, pos, reference_genome=genome_build)):
        raise HTTPBadRequest(reason=f'Invalid start: {pos}')

    return chrom, pos, query['referenceBases'], query['alternateBases'], genome_build
|
||
|
||
def _get_variant_counts(locus: hl.LocusExpression, ref: str, alt: str, genome_build: str) -> tuple[int, int]:
    """Return (allele count, homozygote count) for the variant, or (0, 0) if absent.

    Note: the original annotation declared ``-> hl.Struct`` but the function
    returns a plain 2-tuple of ints; the annotation is corrected here.
    """
    # Restrict the table read to the single-locus interval so only the matching
    # partition is scanned instead of the whole annotations table.
    interval = hl.eval(hl.interval(locus, locus, includes_start=True, includes_end=True))
    ht = hl.read_table(
        f'{VLM_DATA_DIR}/{genome_build}/SNV_INDEL/annotations.ht', _intervals=[interval], _filter_intervals=True,
    )
    ht = ht.filter(ht.alleles == hl.array([ref, alt]))

    # At most one row can match a (locus, alleles) pair; take(1) avoids a full scan.
    counts = ht.aggregate(hl.agg.take(ht.gt_stats, 1))
    return (counts[0].AC, counts[0].hom) if counts else (0, 0)
|
||
|
||
def _format_results(ac: int, hom: int, url: str) -> dict:
    """Assemble the GA4GH beacon v2 response payload for a variant match."""
    # Each homozygote carries two alleles, so the number of carriers is AC - hom.
    # (Original comment: "Homozygotes count twice toward the total AC".)
    total = ac - hom
    if ac:
        result_sets = [('Homozygous', hom), ('Heterozygous', total - hom)]
    else:
        result_sets = []

    response = {
        'beaconHandovers': [
            {
                'handoverType': BEACON_HANDOVER_TYPE,
                'url': url,
            }
        ],
        'meta': BEACON_META,
        'responseSummary': {
            'exists': bool(ac),
            'total': total
        },
        'response': {
            'resultSets': [
                {
                    'exists': True,
                    'id': f'{NODE_ID} {label}',
                    'results': [],
                    'resultsCount': count,
                    'setType': 'genomicVariant'
                }
                for label, count in result_sets
            ]
        }
    }
    return response
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,19 @@ | ||
from aiohttp import web | ||
import hail as hl | ||
import logging | ||
import os | ||
import traceback | ||
|
||
from vlm.match import get_variant_match, GENOME_VERSION_GRCh38, GENOME_VERSION_GRCh37 | ||
|
||
logger = logging.getLogger(__name__)

# JVM/Spark tuning knobs supplied by the deployment environment.
JAVA_OPTS_XSS = os.environ.get('JAVA_OPTS_XSS')
MACHINE_MEM = os.environ.get('MACHINE_MEM')
# Fraction of (machine memory - overhead) handed to the Spark driver JVM.
JVM_MEMORY_FRACTION = 0.9

# Liftover chain files shipped alongside this module.
LIFTOVER_DIR = f'{os.path.dirname(os.path.abspath(__file__))}/liftover_references'
|
||
|
||
def _handle_exception(e, request): | ||
logger.error(f'{request.headers.get("From")} "{e}"') | ||
|
@@ -25,9 +35,30 @@ async def status(request: web.Request) -> web.Response: | |
return web.json_response({'success': True}) | ||
|
||
|
||
async def match(request: web.Request) -> web.Response:
    """Beacon variant-match endpoint: validates the query string and returns counts."""
    payload = get_variant_match(request.query)
    return web.json_response(payload)
|
||
|
||
async def init_web_app():
    """Initialize hail (Spark config + liftover chains) and build the aiohttp app."""
    spark_conf = {}
    # memory limits adapted from https://github.com/hail-is/hail/blob/main/hail/python/hailtop/hailctl/dataproc/start.py#L321C17-L321C36
    if MACHINE_MEM:
        driver_mem_gb = int((int(MACHINE_MEM) - 11) * JVM_MEMORY_FRACTION)
        spark_conf['spark.driver.memory'] = f'{driver_mem_gb}g'
    if JAVA_OPTS_XSS:
        for field in ['driver', 'executor']:
            spark_conf[f'spark.{field}.extraJavaOptions'] = f'-Xss{JAVA_OPTS_XSS}'
    hl.init(idempotent=True, spark_conf=spark_conf or None)

    # Register liftover chains in both directions, skipping any already present.
    rg37 = hl.get_reference(GENOME_VERSION_GRCh37)
    rg38 = hl.get_reference(GENOME_VERSION_GRCh38)
    if not rg37.has_liftover(rg38):
        rg37.add_liftover(f'{LIFTOVER_DIR}/grch37_to_grch38.over.chain.gz', rg38)
    if not rg38.has_liftover(rg37):
        rg38.add_liftover(f'{LIFTOVER_DIR}/grch38_to_grch37.over.chain.gz', rg37)

    # 10 MiB request-body cap.
    app = web.Application(middlewares=[error_middleware], client_max_size=(1024 ** 2) * 10)
    app.add_routes([
        web.get('/vlm/match', match),
        web.get('/vlm/status', status),
    ])
    return app
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
😎