Skip to content

Commit

Permalink
Merge pull request #4506 from broadinstitute/vlm-match-endpoint
Browse files Browse the repository at this point in the history
Vlm match endpoint
  • Loading branch information
hanars authored Dec 18, 2024
2 parents 74734c6 + 1c463cb commit d598c52
Show file tree
Hide file tree
Showing 23 changed files with 320 additions and 3 deletions.
9 changes: 8 additions & 1 deletion .github/workflows/vlm-unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@ on:
paths:
- 'vlm/**'
- '.github/workflows/*vlm*.yaml'
- 'hail_search/fixtures/*'
pull_request:
types: [opened, synchronize, reopened]
paths:
- 'vlm/**'
- '.github/workflows/*vlm*.yaml'
- 'hail_search/fixtures/*'

jobs:
vlm:
Expand All @@ -29,5 +31,10 @@ jobs:
pip install -r hail_search/requirements-test.txt
- name: Run coverage tests
run: |
export VLM_DATA_DIR=./hail_search/fixtures
export SEQR_BASE_URL=https://test-seqr.org/
export NODE_ID=TestVLM
export MACHINE_MEM=24
export JAVA_OPTS_XSS=16M
coverage run --source="./vlm" --omit="./vlm/__main__.py" -m pytest vlm/
coverage report --fail-under=90
coverage report --fail-under=95
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.133-4c60fddb171a
Created at 2024/12/04 13:07:33
Written with version 0.2.128-eead8100a1c1
Created at 2024/12/05 16:23:46
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
122 changes: 122 additions & 0 deletions vlm/match.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from aiohttp.web import HTTPBadRequest
import hail as hl
import os

VLM_DATA_DIR = os.environ.get('VLM_DATA_DIR')
SEQR_BASE_URL = os.environ.get('SEQR_BASE_URL')
NODE_ID = os.environ.get('NODE_ID')

BEACON_HANDOVER_TYPE = {
'id': NODE_ID,
'label': f'{NODE_ID} browser'
}

BEACON_META = {
'apiVersion': 'v1.0',
'beaconId': 'com.gnx.beacon.v2',
'returnedSchemas': [
{
'entityType': 'genomicVariant',
'schema': 'ga4gh-beacon-variant-v2.0.0'
}
]
}

QUERY_PARAMS = ['assemblyId', 'referenceName', 'start', 'referenceBases', 'alternateBases']

GENOME_VERSION_GRCh38 = 'GRCh38'
GENOME_VERSION_GRCh37 = 'GRCh37'
ASSEMBLY_LOOKUP = {
GENOME_VERSION_GRCh37: GENOME_VERSION_GRCh37,
GENOME_VERSION_GRCh38: GENOME_VERSION_GRCh38,
'hg38': GENOME_VERSION_GRCh38,
'hg19': GENOME_VERSION_GRCh37,
}

def get_variant_match(query: dict) -> dict:
chrom, pos, ref, alt, genome_build = _parse_match_query(query)
locus = hl.locus(chrom, pos, reference_genome=genome_build)

ac, hom = _get_variant_counts(locus, ref, alt, genome_build)

liftover_genome_build = GENOME_VERSION_GRCh38 if genome_build == GENOME_VERSION_GRCh37 else GENOME_VERSION_GRCh37
liftover_locus = hl.liftover(locus, liftover_genome_build)
lift_ac, lift_hom = _get_variant_counts(liftover_locus, ref, alt, liftover_genome_build)

if lift_ac and not ac:
lifted = hl.eval(liftover_locus)
chrom = lifted.contig
pos = lifted.position
genome_build = liftover_genome_build
genome_build = genome_build.replace('GRCh', '')
url = f'{SEQR_BASE_URL}summary_data/variant_lookup?genomeVersion={genome_build}&variantId={chrom}-{pos}-{ref}-{alt}'

return _format_results(ac+lift_ac, hom+lift_hom, url)


def _parse_match_query(query: dict) -> tuple[str, int, str, str, str]:
missing_params = [key for key in QUERY_PARAMS if key not in query]
if missing_params:
raise HTTPBadRequest(reason=f'Missing required parameters: {", ".join(missing_params)}')

genome_build = ASSEMBLY_LOOKUP.get(query['assemblyId'])
if not genome_build:
raise HTTPBadRequest(reason=f'Invalid assemblyId: {query["assemblyId"]}')

chrom = query['referenceName'].replace('chr', '')
if genome_build == GENOME_VERSION_GRCh38:
chrom = f'chr{chrom}'
if not hl.eval(hl.is_valid_contig(chrom, reference_genome=genome_build)):
raise HTTPBadRequest(reason=f'Invalid referenceName: {query["referenceName"]}')

start = query['start']
if not start.isnumeric():
raise HTTPBadRequest(reason=f'Invalid start: {start}')
start = int(start)
if not hl.eval(hl.is_valid_locus(chrom, start, reference_genome=genome_build)):
raise HTTPBadRequest(reason=f'Invalid start: {start}')

return chrom, start, query['referenceBases'], query['alternateBases'], genome_build


def _get_variant_counts(locus: hl.LocusExpression, ref: str, alt: str, genome_build: str) -> hl.Struct:
interval = hl.eval(hl.interval(locus, locus, includes_start=True, includes_end=True))
ht = hl.read_table(
f'{VLM_DATA_DIR}/{genome_build}/SNV_INDEL/annotations.ht', _intervals=[interval], _filter_intervals=True,
)
ht = ht.filter(ht.alleles == hl.array([ref, alt]))

counts = ht.aggregate(hl.agg.take(ht.gt_stats, 1))
return (counts[0].AC, counts[0].hom) if counts else (0, 0)


def _format_results(ac: int, hom: int, url: str) -> dict:
total = ac - hom # Homozygotes count twice toward the total AC
result_sets = [
('Homozygous', hom),
('Heterozygous', total - hom),
] if ac else []
return {
'beaconHandovers': [
{
'handoverType': BEACON_HANDOVER_TYPE,
'url': url,
}
],
'meta': BEACON_META,
'responseSummary': {
'exists': bool(ac),
'total': total
},
'response': {
'resultSets': [
{
'exists': True,
'id': f'{NODE_ID} {label}',
'results': [],
'resultsCount': count,
'setType': 'genomicVariant'
} for label, count in result_sets
]
}
}
157 changes: 157 additions & 0 deletions vlm/test_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,160 @@ async def test_status(self):
self.assertEqual(resp.status, 200)
resp_json = await resp.json()
self.assertDictEqual(resp_json, {'success': True})

async def test_match(self):
async with self.client.request('GET', '/vlm/match?assemblyId=GRCh38&referenceName=1&start=38724419&referenceBases=T&alternateBases=G') as resp:
self.assertEqual(resp.status, 200)
resp_json = await resp.json()
self.assertDictEqual(resp_json, {
'beaconHandovers': [
{
'handoverType': {
'id': 'TestVLM',
'label': 'TestVLM browser',
},
'url': 'https://test-seqr.org/summary_data/variant_lookup?genomeVersion=38&variantId=chr1-38724419-T-G',
}
],
'meta': {
'apiVersion': 'v1.0',
'beaconId': 'com.gnx.beacon.v2',
'returnedSchemas': [
{
'entityType': 'genomicVariant',
'schema': 'ga4gh-beacon-variant-v2.0.0',
}
]
},
'responseSummary': {
'exists': True,
'total': 30,
},
'response': {
'resultSets': [
{
'exists': True,
'id': 'TestVLM Homozygous',
'results': [],
'resultsCount': 7,
'setType': 'genomicVariant'
},
{
'exists': True,
'id': 'TestVLM Heterozygous',
'results': [],
'resultsCount': 23,
'setType': 'genomicVariant'
},
],
}
})

only_37_response = {
'beaconHandovers': [
{
'handoverType': {
'id': 'TestVLM',
'label': 'TestVLM browser',
},
'url': 'https://test-seqr.org/summary_data/variant_lookup?genomeVersion=37&variantId=7-143270172-A-G',
}
],
'meta': {
'apiVersion': 'v1.0',
'beaconId': 'com.gnx.beacon.v2',
'returnedSchemas': [
{
'entityType': 'genomicVariant',
'schema': 'ga4gh-beacon-variant-v2.0.0',
}
]
},
'responseSummary': {
'exists': True,
'total': 3203,
},
'response': {
'resultSets': [
{
'exists': True,
'id': 'TestVLM Homozygous',
'results': [],
'resultsCount': 1508,
'setType': 'genomicVariant'
},
{
'exists': True,
'id': 'TestVLM Heterozygous',
'results': [],
'resultsCount': 1695,
'setType': 'genomicVariant'
},
],
}
}
async with self.client.request('GET', '/vlm/match?assemblyId=hg19&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 200)
resp_json = await resp.json()
self.assertDictEqual(resp_json, only_37_response)

async with self.client.request('GET', '/vlm/match?assemblyId=GRCh38&referenceName=chr7&start=143573079&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 200)
resp_json = await resp.json()
self.assertDictEqual(resp_json, only_37_response)

async with self.client.request('GET', '/vlm/match?assemblyId=hg38&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 200)
resp_json = await resp.json()
self.assertDictEqual(resp_json, {
'beaconHandovers': [
{
'handoverType': {
'id': 'TestVLM',
'label': 'TestVLM browser',
},
'url': 'https://test-seqr.org/summary_data/variant_lookup?genomeVersion=38&variantId=chr7-143270172-A-G',
}
],
'meta': {
'apiVersion': 'v1.0',
'beaconId': 'com.gnx.beacon.v2',
'returnedSchemas': [
{
'entityType': 'genomicVariant',
'schema': 'ga4gh-beacon-variant-v2.0.0',
}
]
},
'responseSummary': {
'exists': False,
'total': 0,
},
'response': {
'resultSets': [],
}
})

async def test_match_error(self):
async with self.client.request('GET', '/vlm/match') as resp:
self.assertEqual(resp.status, 400)
self.assertEqual(
resp.reason,
'Missing required parameters: assemblyId, referenceName, start, referenceBases, alternateBases',
)

async with self.client.request('GET', '/vlm/match?assemblyId=38&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 400)
self.assertEqual(resp.reason,'Invalid assemblyId: 38')

async with self.client.request('GET', '/vlm/match?assemblyId=hg38&referenceName=27&start=143270172&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 400)
self.assertEqual(resp.reason,'Invalid referenceName: 27')

async with self.client.request('GET', '/vlm/match?assemblyId=hg38&referenceName=7&start=1x43270172&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 400)
self.assertEqual(resp.reason,'Invalid start: 1x43270172')

async with self.client.request('GET', '/vlm/match?assemblyId=hg38&referenceName=7&start=999999999&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 400)
self.assertEqual(resp.reason,'Invalid start: 999999999')
31 changes: 31 additions & 0 deletions vlm/web_app.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
from aiohttp import web
import hail as hl
import logging
import os
import traceback

from vlm.match import get_variant_match, GENOME_VERSION_GRCh38, GENOME_VERSION_GRCh37

logger = logging.getLogger(__name__)

JAVA_OPTS_XSS = os.environ.get('JAVA_OPTS_XSS')
MACHINE_MEM = os.environ.get('MACHINE_MEM')
JVM_MEMORY_FRACTION = 0.9

LIFTOVER_DIR = f'{os.path.dirname(os.path.abspath(__file__))}/liftover_references'


def _handle_exception(e, request):
logger.error(f'{request.headers.get("From")} "{e}"')
Expand All @@ -25,9 +35,30 @@ async def status(request: web.Request) -> web.Response:
return web.json_response({'success': True})


async def match(request: web.Request) -> web.Response:
return web.json_response(get_variant_match(request.query))


async def init_web_app():
spark_conf = {}
# memory limits adapted from https://github.com/hail-is/hail/blob/main/hail/python/hailtop/hailctl/dataproc/start.py#L321C17-L321C36
if MACHINE_MEM:
spark_conf['spark.driver.memory'] = f'{int((int(MACHINE_MEM) - 11) * JVM_MEMORY_FRACTION)}g'
if JAVA_OPTS_XSS:
spark_conf.update(
{f'spark.{field}.extraJavaOptions': f'-Xss{JAVA_OPTS_XSS}' for field in ['driver', 'executor']})
hl.init(idempotent=True, spark_conf=spark_conf or None)

rg37 = hl.get_reference(GENOME_VERSION_GRCh37)
rg38 = hl.get_reference(GENOME_VERSION_GRCh38)
if not rg37.has_liftover(rg38):
rg37.add_liftover(f'{LIFTOVER_DIR}/grch37_to_grch38.over.chain.gz', rg38)
if not rg38.has_liftover(rg37):
rg38.add_liftover(f'{LIFTOVER_DIR}/grch38_to_grch37.over.chain.gz', rg37)

app = web.Application(middlewares=[error_middleware], client_max_size=(1024 ** 2) * 10)
app.add_routes([
web.get('/vlm/match', match),
web.get('/vlm/status', status),
])
return app

0 comments on commit d598c52

Please sign in to comment.