Merge pull request #4517 from broadinstitute/vlm-multi-build-match

Vlm multi-build match
broadinstitute · Dec 18, 2024 · 1c463cb · 1c463cb
2 parents ce5dbf0 + 761e2f7
commit 1c463cb
Show file tree

Hide file tree

Showing 23 changed files with 62 additions and 26 deletions.
diff --git a/.github/workflows/vlm-unit-tests.yaml b/.github/workflows/vlm-unit-tests.yaml
@@ -10,11 +10,13 @@ on:
     paths:
       - 'vlm/**'
       - '.github/workflows/*vlm*.yaml'
+      - 'hail_search/fixtures/*'
   pull_request:
     types: [opened, synchronize, reopened]
     paths:
       - 'vlm/**'
       - '.github/workflows/*vlm*.yaml'
+      - 'hail_search/fixtures/*'
 
 jobs:
   vlm:

diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.README.txt.crc b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.README.txt.crc
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.metadata.json.gz.crc
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/README.txt b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/README.txt
@@ -1,3 +1,3 @@
 This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
-  Written with version 0.2.133-4c60fddb171a
-  Created at 2024/12/04 13:07:33
+  Written with version 0.2.128-eead8100a1c1
+  Created at 2024/12/05 16:23:46
diff --git a/...SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.index.crc b/...SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.index.crc
diff --git a/...nnotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.metadata.json.gz.crc b/...nnotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.metadata.json.gz.crc
diff --git a/...Ch37/SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/index b/...Ch37/SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/index
diff --git a/...DEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/metadata.json.gz b/...DEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/metadata.json.gz
diff --git a/...SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.index.crc b/...SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.index.crc
diff --git a/...nnotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.metadata.json.gz.crc b/...nnotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.metadata.json.gz.crc
diff --git a/...Ch37/SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/index b/...Ch37/SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/index
diff --git a/...DEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/metadata.json.gz b/...DEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/metadata.json.gz
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/metadata.json.gz b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/metadata.json.gz
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/metadata.json.gz
diff --git a/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-32c79149-c8dd-4b54-8db9-097b88a68456.crc b/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-32c79149-c8dd-4b54-8db9-097b88a68456.crc
diff --git a/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.crc b/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.crc
diff --git a/...rt-0-32c79149-c8dd-4b54-8db9-097b88a68456 → ...rt-0-8dc59bd2-da29-46c0-badd-a77a850af2d4 b/...rt-0-32c79149-c8dd-4b54-8db9-097b88a68456 → ...rt-0-8dc59bd2-da29-46c0-badd-a77a850af2d4
diff --git a/vlm/liftover_references/grch37_to_grch38.over.chain.gz b/vlm/liftover_references/grch37_to_grch38.over.chain.gz
diff --git a/vlm/liftover_references/grch38_to_grch37.over.chain.gz b/vlm/liftover_references/grch38_to_grch37.over.chain.gz
diff --git a/vlm/match.py b/vlm/match.py
@@ -35,16 +35,23 @@
 
 def get_variant_match(query: dict) -> dict:
     chrom, pos, ref, alt, genome_build = _parse_match_query(query)
-
     locus = hl.locus(chrom, pos, reference_genome=genome_build)
-    interval = hl.eval(hl.interval(locus, locus, includes_start=True, includes_end=True))
-    ht = hl.read_table(
-        f'{VLM_DATA_DIR}/{genome_build}/SNV_INDEL/annotations.ht', _intervals=[interval], _filter_intervals=True,
-    )
-    ht = ht.filter(ht.alleles==hl.array([ref, alt]))
-    counts = ht.aggregate(hl.agg.take(ht.gt_stats, 1))
 
-    return _format_results(counts, genome_build, f'{chrom}-{pos}-{ref}-{alt}')
+    ac, hom = _get_variant_counts(locus, ref, alt, genome_build)
+
+    liftover_genome_build = GENOME_VERSION_GRCh38 if genome_build == GENOME_VERSION_GRCh37 else GENOME_VERSION_GRCh37
+    liftover_locus = hl.liftover(locus, liftover_genome_build)
+    lift_ac, lift_hom = _get_variant_counts(liftover_locus, ref, alt, liftover_genome_build)
+
+    if lift_ac and not ac:
+        lifted = hl.eval(liftover_locus)
+        chrom = lifted.contig
+        pos = lifted.position
+        genome_build = liftover_genome_build
+    genome_build = genome_build.replace('GRCh', '')
+    url = f'{SEQR_BASE_URL}summary_data/variant_lookup?genomeVersion={genome_build}&variantId={chrom}-{pos}-{ref}-{alt}'
+
+    return _format_results(ac+lift_ac, hom+lift_hom, url)
 
 
 def _parse_match_query(query: dict) -> tuple[str, int, str, str, str]:
@@ -72,22 +79,34 @@ def _parse_match_query(query: dict) -> tuple[str, int, str, str, str]:
     return chrom, start, query['referenceBases'], query['alternateBases'], genome_build
 
 
-def _format_results(counts: hl.Struct, genome_build: str, variant_id: str) -> dict:
+def _get_variant_counts(locus: hl.LocusExpression, ref: str, alt: str, genome_build: str) -> hl.Struct:
+    interval = hl.eval(hl.interval(locus, locus, includes_start=True, includes_end=True))
+    ht = hl.read_table(
+        f'{VLM_DATA_DIR}/{genome_build}/SNV_INDEL/annotations.ht', _intervals=[interval], _filter_intervals=True,
+    )
+    ht = ht.filter(ht.alleles == hl.array([ref, alt]))
+
+    counts = ht.aggregate(hl.agg.take(ht.gt_stats, 1))
+    return (counts[0].AC, counts[0].hom) if counts else (0, 0)
+
+
+def _format_results(ac: int, hom: int, url: str) -> dict:
+    total = ac - hom # Homozygotes count twice toward the total AC
     result_sets = [
-        ('Homozygous', counts[0].hom),
-        ('Heterozygous', counts[0].AC - (counts[0].hom * 2)),
-    ] if counts else []
+        ('Homozygous', hom),
+        ('Heterozygous', total - hom),
+    ] if ac else []
     return {
         'beaconHandovers': [
             {
                 'handoverType': BEACON_HANDOVER_TYPE,
-                'url': f'{SEQR_BASE_URL}summary_data/variant_lookup?genomeVersion={genome_build.replace("GRCh", "")}&variantId={variant_id}',
+                'url': url,
             }
         ],
         'meta': BEACON_META,
         'responseSummary': {
-            'exists': bool(counts),
-            'total': (counts[0].AC - counts[0].hom) if counts else 0
+            'exists': bool(ac),
+            'total': total
         },
         'response': {
             'resultSets': [

diff --git a/vlm/test_vlm.py b/vlm/test_vlm.py
@@ -44,32 +44,29 @@ async def test_match(self):
             },
             'responseSummary': {
                 'exists': True,
-                'total': 24,
+                'total': 30,
             },
             'response': {
                 'resultSets': [
                     {
                         'exists': True,
                         'id': 'TestVLM Homozygous',
                         'results': [],
-                        'resultsCount': 4,
+                        'resultsCount': 7,
                         'setType': 'genomicVariant'
                     },
                     {
                         'exists': True,
                         'id': 'TestVLM Heterozygous',
                         'results': [],
-                        'resultsCount': 20,
+                        'resultsCount': 23,
                         'setType': 'genomicVariant'
                     },
                 ],
             }
         })
 
-        async with self.client.request('GET', '/vlm/match?assemblyId=hg19&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
-            self.assertEqual(resp.status, 200)
-            resp_json = await resp.json()
-        self.assertDictEqual(resp_json, {
+        only_37_response = {
             'beaconHandovers': [
                 {
                     'handoverType': {
@@ -111,7 +108,16 @@ async def test_match(self):
                     },
                 ],
             }
-        })
+        }
+        async with self.client.request('GET', '/vlm/match?assemblyId=hg19&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
+            self.assertEqual(resp.status, 200)
+            resp_json = await resp.json()
+        self.assertDictEqual(resp_json, only_37_response)
+
+        async with self.client.request('GET', '/vlm/match?assemblyId=GRCh38&referenceName=chr7&start=143573079&referenceBases=A&alternateBases=G') as resp:
+            self.assertEqual(resp.status, 200)
+            resp_json = await resp.json()
+        self.assertDictEqual(resp_json, only_37_response)
 
         async with self.client.request('GET', '/vlm/match?assemblyId=hg38&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
             self.assertEqual(resp.status, 200)

diff --git a/vlm/web_app.py b/vlm/web_app.py
@@ -4,14 +4,16 @@
 import os
 import traceback
 
-from vlm.match import get_variant_match
+from vlm.match import get_variant_match, GENOME_VERSION_GRCh38, GENOME_VERSION_GRCh37
 
 logger = logging.getLogger(__name__)
 
 JAVA_OPTS_XSS = os.environ.get('JAVA_OPTS_XSS')
 MACHINE_MEM = os.environ.get('MACHINE_MEM')
 JVM_MEMORY_FRACTION = 0.9
 
+LIFTOVER_DIR = f'{os.path.dirname(os.path.abspath(__file__))}/liftover_references'
+
 
 def _handle_exception(e, request):
     logger.error(f'{request.headers.get("From")} "{e}"')
@@ -47,6 +49,13 @@ async def init_web_app():
             {f'spark.{field}.extraJavaOptions': f'-Xss{JAVA_OPTS_XSS}' for field in ['driver', 'executor']})
     hl.init(idempotent=True, spark_conf=spark_conf or None)
 
+    rg37 = hl.get_reference(GENOME_VERSION_GRCh37)
+    rg38 = hl.get_reference(GENOME_VERSION_GRCh38)
+    if not rg37.has_liftover(rg38):
+        rg37.add_liftover(f'{LIFTOVER_DIR}/grch37_to_grch38.over.chain.gz', rg38)
+    if not rg38.has_liftover(rg37):
+        rg38.add_liftover(f'{LIFTOVER_DIR}/grch38_to_grch37.over.chain.gz', rg37)
+
     app = web.Application(middlewares=[error_middleware], client_max_size=(1024 ** 2) * 10)
     app.add_routes([
         web.get('/vlm/match', match),