Merge pull request #4506 from broadinstitute/vlm-match-endpoint

Vlm match endpoint
broadinstitute · Dec 18, 2024 · d598c52 · d598c52
2 parents 74734c6 + 1c463cb
commit d598c52
Show file tree

Hide file tree

Showing 23 changed files with 320 additions and 3 deletions.
diff --git a/.github/workflows/vlm-unit-tests.yaml b/.github/workflows/vlm-unit-tests.yaml
@@ -10,11 +10,13 @@ on:
     paths:
       - 'vlm/**'
       - '.github/workflows/*vlm*.yaml'
+      - 'hail_search/fixtures/*'
   pull_request:
     types: [opened, synchronize, reopened]
     paths:
       - 'vlm/**'
       - '.github/workflows/*vlm*.yaml'
+      - 'hail_search/fixtures/*'
 
 jobs:
   vlm:
@@ -29,5 +31,10 @@ jobs:
           pip install -r hail_search/requirements-test.txt
       - name: Run coverage tests
         run: |
+          export VLM_DATA_DIR=./hail_search/fixtures
+          export SEQR_BASE_URL=https://test-seqr.org/
+          export NODE_ID=TestVLM
+          export MACHINE_MEM=24
+          export JAVA_OPTS_XSS=16M
           coverage run --source="./vlm" --omit="./vlm/__main__.py" -m pytest vlm/
-          coverage report --fail-under=90
+          coverage report --fail-under=95
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.README.txt.crc b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.README.txt.crc
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.metadata.json.gz.crc
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/README.txt b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/README.txt
@@ -1,3 +1,3 @@
 This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
-  Written with version 0.2.133-4c60fddb171a
-  Created at 2024/12/04 13:07:33
+  Written with version 0.2.128-eead8100a1c1
+  Created at 2024/12/05 16:23:46
diff --git a/...SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.index.crc b/...SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.index.crc
diff --git a/...nnotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.metadata.json.gz.crc b/...nnotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.metadata.json.gz.crc
diff --git a/...Ch37/SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/index b/...Ch37/SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/index
diff --git a/...DEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/metadata.json.gz b/...DEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/metadata.json.gz
diff --git a/...SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.index.crc b/...SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.index.crc
diff --git a/...nnotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.metadata.json.gz.crc b/...nnotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.metadata.json.gz.crc
diff --git a/...Ch37/SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/index b/...Ch37/SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/index
diff --git a/...DEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/metadata.json.gz b/...DEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/metadata.json.gz
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/metadata.json.gz b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/metadata.json.gz
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/metadata.json.gz
diff --git a/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-32c79149-c8dd-4b54-8db9-097b88a68456.crc b/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-32c79149-c8dd-4b54-8db9-097b88a68456.crc
diff --git a/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.crc b/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.crc
diff --git a/...rt-0-32c79149-c8dd-4b54-8db9-097b88a68456 → ...rt-0-8dc59bd2-da29-46c0-badd-a77a850af2d4 b/...rt-0-32c79149-c8dd-4b54-8db9-097b88a68456 → ...rt-0-8dc59bd2-da29-46c0-badd-a77a850af2d4
diff --git a/vlm/liftover_references/grch37_to_grch38.over.chain.gz b/vlm/liftover_references/grch37_to_grch38.over.chain.gz
diff --git a/vlm/liftover_references/grch38_to_grch37.over.chain.gz b/vlm/liftover_references/grch38_to_grch37.over.chain.gz
diff --git a/vlm/match.py b/vlm/match.py
@@ -0,0 +1,122 @@
+from aiohttp.web import HTTPBadRequest
+import hail as hl
+import os
+
+VLM_DATA_DIR = os.environ.get('VLM_DATA_DIR')
+SEQR_BASE_URL = os.environ.get('SEQR_BASE_URL')
+NODE_ID = os.environ.get('NODE_ID')
+
+BEACON_HANDOVER_TYPE = {
+    'id': NODE_ID,
+    'label': f'{NODE_ID} browser'
+}
+
+BEACON_META = {
+    'apiVersion': 'v1.0',
+    'beaconId': 'com.gnx.beacon.v2',
+    'returnedSchemas': [
+        {
+            'entityType': 'genomicVariant',
+            'schema': 'ga4gh-beacon-variant-v2.0.0'
+        }
+    ]
+}
+
+QUERY_PARAMS = ['assemblyId', 'referenceName', 'start', 'referenceBases', 'alternateBases']
+
+GENOME_VERSION_GRCh38 = 'GRCh38'
+GENOME_VERSION_GRCh37 = 'GRCh37'
+ASSEMBLY_LOOKUP = {
+    GENOME_VERSION_GRCh37: GENOME_VERSION_GRCh37,
+    GENOME_VERSION_GRCh38: GENOME_VERSION_GRCh38,
+    'hg38': GENOME_VERSION_GRCh38,
+    'hg19': GENOME_VERSION_GRCh37,
+}
+
+def get_variant_match(query: dict) -> dict:
+    chrom, pos, ref, alt, genome_build = _parse_match_query(query)
+    locus = hl.locus(chrom, pos, reference_genome=genome_build)
+
+    ac, hom = _get_variant_counts(locus, ref, alt, genome_build)
+
+    liftover_genome_build = GENOME_VERSION_GRCh38 if genome_build == GENOME_VERSION_GRCh37 else GENOME_VERSION_GRCh37
+    liftover_locus = hl.liftover(locus, liftover_genome_build)
+    lift_ac, lift_hom = _get_variant_counts(liftover_locus, ref, alt, liftover_genome_build)
+
+    if lift_ac and not ac:
+        lifted = hl.eval(liftover_locus)
+        chrom = lifted.contig
+        pos = lifted.position
+        genome_build = liftover_genome_build
+    genome_build = genome_build.replace('GRCh', '')
+    url = f'{SEQR_BASE_URL}summary_data/variant_lookup?genomeVersion={genome_build}&variantId={chrom}-{pos}-{ref}-{alt}'
+
+    return _format_results(ac+lift_ac, hom+lift_hom, url)
+
+
+def _parse_match_query(query: dict) -> tuple[str, int, str, str, str]:
+    missing_params = [key for key in QUERY_PARAMS if key not in query]
+    if missing_params:
+        raise HTTPBadRequest(reason=f'Missing required parameters: {", ".join(missing_params)}')
+
+    genome_build = ASSEMBLY_LOOKUP.get(query['assemblyId'])
+    if not genome_build:
+        raise HTTPBadRequest(reason=f'Invalid assemblyId: {query["assemblyId"]}')
+
+    chrom = query['referenceName'].replace('chr', '')
+    if genome_build == GENOME_VERSION_GRCh38:
+        chrom = f'chr{chrom}'
+    if not hl.eval(hl.is_valid_contig(chrom, reference_genome=genome_build)):
+        raise HTTPBadRequest(reason=f'Invalid referenceName: {query["referenceName"]}')
+
+    start = query['start']
+    if not start.isnumeric():
+        raise HTTPBadRequest(reason=f'Invalid start: {start}')
+    start = int(start)
+    if not hl.eval(hl.is_valid_locus(chrom, start, reference_genome=genome_build)):
+        raise HTTPBadRequest(reason=f'Invalid start: {start}')
+
+    return chrom, start, query['referenceBases'], query['alternateBases'], genome_build
+
+
+def _get_variant_counts(locus: hl.LocusExpression, ref: str, alt: str, genome_build: str) -> hl.Struct:
+    interval = hl.eval(hl.interval(locus, locus, includes_start=True, includes_end=True))
+    ht = hl.read_table(
+        f'{VLM_DATA_DIR}/{genome_build}/SNV_INDEL/annotations.ht', _intervals=[interval], _filter_intervals=True,
+    )
+    ht = ht.filter(ht.alleles == hl.array([ref, alt]))
+
+    counts = ht.aggregate(hl.agg.take(ht.gt_stats, 1))
+    return (counts[0].AC, counts[0].hom) if counts else (0, 0)
+
+
+def _format_results(ac: int, hom: int, url: str) -> dict:
+    total = ac - hom # Homozygotes count twice toward the total AC
+    result_sets = [
+        ('Homozygous', hom),
+        ('Heterozygous', total - hom),
+    ] if ac else []
+    return {
+        'beaconHandovers': [
+            {
+                'handoverType': BEACON_HANDOVER_TYPE,
+                'url': url,
+            }
+        ],
+        'meta': BEACON_META,
+        'responseSummary': {
+            'exists': bool(ac),
+            'total': total
+        },
+        'response': {
+            'resultSets': [
+                {
+                    'exists': True,
+                    'id': f'{NODE_ID} {label}',
+                    'results': [],
+                    'resultsCount': count,
+                    'setType': 'genomicVariant'
+                } for label, count in result_sets
+            ]
+        }
+    }
diff --git a/vlm/test_vlm.py b/vlm/test_vlm.py
@@ -17,3 +17,160 @@ async def test_status(self):
             self.assertEqual(resp.status, 200)
             resp_json = await resp.json()
         self.assertDictEqual(resp_json, {'success': True})
+
+    async def test_match(self):
+        async with self.client.request('GET', '/vlm/match?assemblyId=GRCh38&referenceName=1&start=38724419&referenceBases=T&alternateBases=G') as resp:
+            self.assertEqual(resp.status, 200)
+            resp_json = await resp.json()
+        self.assertDictEqual(resp_json, {
+            'beaconHandovers': [
+                {
+                    'handoverType': {
+                        'id': 'TestVLM',
+                        'label': 'TestVLM browser',
+                    },
+                    'url': 'https://test-seqr.org/summary_data/variant_lookup?genomeVersion=38&variantId=chr1-38724419-T-G',
+                }
+            ],
+            'meta': {
+                'apiVersion': 'v1.0',
+                'beaconId': 'com.gnx.beacon.v2',
+                'returnedSchemas': [
+                    {
+                        'entityType': 'genomicVariant',
+                        'schema': 'ga4gh-beacon-variant-v2.0.0',
+                    }
+                ]
+            },
+            'responseSummary': {
+                'exists': True,
+                'total': 30,
+            },
+            'response': {
+                'resultSets': [
+                    {
+                        'exists': True,
+                        'id': 'TestVLM Homozygous',
+                        'results': [],
+                        'resultsCount': 7,
+                        'setType': 'genomicVariant'
+                    },
+                    {
+                        'exists': True,
+                        'id': 'TestVLM Heterozygous',
+                        'results': [],
+                        'resultsCount': 23,
+                        'setType': 'genomicVariant'
+                    },
+                ],
+            }
+        })
+
+        only_37_response = {
+            'beaconHandovers': [
+                {
+                    'handoverType': {
+                        'id': 'TestVLM',
+                        'label': 'TestVLM browser',
+                    },
+                    'url': 'https://test-seqr.org/summary_data/variant_lookup?genomeVersion=37&variantId=7-143270172-A-G',
+                }
+            ],
+            'meta': {
+                'apiVersion': 'v1.0',
+                'beaconId': 'com.gnx.beacon.v2',
+                'returnedSchemas': [
+                    {
+                        'entityType': 'genomicVariant',
+                        'schema': 'ga4gh-beacon-variant-v2.0.0',
+                    }
+                ]
+            },
+            'responseSummary': {
+                'exists': True,
+                'total': 3203,
+            },
+            'response': {
+                'resultSets': [
+                    {
+                        'exists': True,
+                        'id': 'TestVLM Homozygous',
+                        'results': [],
+                        'resultsCount': 1508,
+                        'setType': 'genomicVariant'
+                    },
+                    {
+                        'exists': True,
+                        'id': 'TestVLM Heterozygous',
+                        'results': [],
+                        'resultsCount': 1695,
+                        'setType': 'genomicVariant'
+                    },
+                ],
+            }
+        }
+        async with self.client.request('GET', '/vlm/match?assemblyId=hg19&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
+            self.assertEqual(resp.status, 200)
+            resp_json = await resp.json()
+        self.assertDictEqual(resp_json, only_37_response)
+
+        async with self.client.request('GET', '/vlm/match?assemblyId=GRCh38&referenceName=chr7&start=143573079&referenceBases=A&alternateBases=G') as resp:
+            self.assertEqual(resp.status, 200)
+            resp_json = await resp.json()
+        self.assertDictEqual(resp_json, only_37_response)
+
+        async with self.client.request('GET', '/vlm/match?assemblyId=hg38&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
+            self.assertEqual(resp.status, 200)
+            resp_json = await resp.json()
+        self.assertDictEqual(resp_json, {
+            'beaconHandovers': [
+                {
+                    'handoverType': {
+                        'id': 'TestVLM',
+                        'label': 'TestVLM browser',
+                    },
+                    'url': 'https://test-seqr.org/summary_data/variant_lookup?genomeVersion=38&variantId=chr7-143270172-A-G',
+                }
+            ],
+            'meta': {
+                'apiVersion': 'v1.0',
+                'beaconId': 'com.gnx.beacon.v2',
+                'returnedSchemas': [
+                    {
+                        'entityType': 'genomicVariant',
+                        'schema': 'ga4gh-beacon-variant-v2.0.0',
+                    }
+                ]
+            },
+            'responseSummary': {
+                'exists': False,
+                'total': 0,
+            },
+            'response': {
+                'resultSets': [],
+            }
+        })
+
+    async def test_match_error(self):
+        async with self.client.request('GET', '/vlm/match') as resp:
+            self.assertEqual(resp.status, 400)
+            self.assertEqual(
+                resp.reason,
+                'Missing required parameters: assemblyId, referenceName, start, referenceBases, alternateBases',
+            )
+
+        async with self.client.request('GET', '/vlm/match?assemblyId=38&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
+            self.assertEqual(resp.status, 400)
+            self.assertEqual(resp.reason,'Invalid assemblyId: 38')
+
+        async with self.client.request('GET', '/vlm/match?assemblyId=hg38&referenceName=27&start=143270172&referenceBases=A&alternateBases=G') as resp:
+            self.assertEqual(resp.status, 400)
+            self.assertEqual(resp.reason,'Invalid referenceName: 27')
+
+        async with self.client.request('GET', '/vlm/match?assemblyId=hg38&referenceName=7&start=1x43270172&referenceBases=A&alternateBases=G') as resp:
+            self.assertEqual(resp.status, 400)
+            self.assertEqual(resp.reason,'Invalid start: 1x43270172')
+
+        async with self.client.request('GET', '/vlm/match?assemblyId=hg38&referenceName=7&start=999999999&referenceBases=A&alternateBases=G') as resp:
+            self.assertEqual(resp.status, 400)
+            self.assertEqual(resp.reason,'Invalid start: 999999999')
diff --git a/vlm/web_app.py b/vlm/web_app.py
@@ -1,9 +1,19 @@
 from aiohttp import web
+import hail as hl
 import logging
+import os
 import traceback
 
+from vlm.match import get_variant_match, GENOME_VERSION_GRCh38, GENOME_VERSION_GRCh37
+
 logger = logging.getLogger(__name__)
 
+JAVA_OPTS_XSS = os.environ.get('JAVA_OPTS_XSS')
+MACHINE_MEM = os.environ.get('MACHINE_MEM')
+JVM_MEMORY_FRACTION = 0.9
+
+LIFTOVER_DIR = f'{os.path.dirname(os.path.abspath(__file__))}/liftover_references'
+
 
 def _handle_exception(e, request):
     logger.error(f'{request.headers.get("From")} "{e}"')
@@ -25,9 +35,30 @@ async def status(request: web.Request) -> web.Response:
     return web.json_response({'success': True})
 
 
+async def match(request: web.Request) -> web.Response:
+    return web.json_response(get_variant_match(request.query))
+
+
 async def init_web_app():
+    spark_conf = {}
+    # memory limits adapted from https://github.com/hail-is/hail/blob/main/hail/python/hailtop/hailctl/dataproc/start.py#L321C17-L321C36
+    if MACHINE_MEM:
+        spark_conf['spark.driver.memory'] = f'{int((int(MACHINE_MEM) - 11) * JVM_MEMORY_FRACTION)}g'
+    if JAVA_OPTS_XSS:
+        spark_conf.update(
+            {f'spark.{field}.extraJavaOptions': f'-Xss{JAVA_OPTS_XSS}' for field in ['driver', 'executor']})
+    hl.init(idempotent=True, spark_conf=spark_conf or None)
+
+    rg37 = hl.get_reference(GENOME_VERSION_GRCh37)
+    rg38 = hl.get_reference(GENOME_VERSION_GRCh38)
+    if not rg37.has_liftover(rg38):
+        rg37.add_liftover(f'{LIFTOVER_DIR}/grch37_to_grch38.over.chain.gz', rg38)
+    if not rg38.has_liftover(rg37):
+        rg38.add_liftover(f'{LIFTOVER_DIR}/grch38_to_grch37.over.chain.gz', rg37)
+
     app = web.Application(middlewares=[error_middleware], client_max_size=(1024 ** 2) * 10)
     app.add_routes([
+        web.get('/vlm/match', match),
         web.get('/vlm/status', status),
     ])
     return app