Skip to content

Commit

Permalink
Merge pull request #4517 from broadinstitute/vlm-multi-build-match
Browse files Browse the repository at this point in the history
Vlm multi-build match
  • Loading branch information
hanars authored Dec 18, 2024
2 parents ce5dbf0 + 761e2f7 commit 1c463cb
Show file tree
Hide file tree
Showing 23 changed files with 62 additions and 26 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/vlm-unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@ on:
paths:
- 'vlm/**'
- '.github/workflows/*vlm*.yaml'
- 'hail_search/fixtures/*'
pull_request:
types: [opened, synchronize, reopened]
paths:
- 'vlm/**'
- '.github/workflows/*vlm*.yaml'
- 'hail_search/fixtures/*'

jobs:
vlm:
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.133-4c60fddb171a
Created at 2024/12/04 13:07:33
Written with version 0.2.128-eead8100a1c1
Created at 2024/12/05 16:23:46
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
49 changes: 34 additions & 15 deletions vlm/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,23 @@

def get_variant_match(query: dict) -> dict:
chrom, pos, ref, alt, genome_build = _parse_match_query(query)

locus = hl.locus(chrom, pos, reference_genome=genome_build)
interval = hl.eval(hl.interval(locus, locus, includes_start=True, includes_end=True))
ht = hl.read_table(
f'{VLM_DATA_DIR}/{genome_build}/SNV_INDEL/annotations.ht', _intervals=[interval], _filter_intervals=True,
)
ht = ht.filter(ht.alleles==hl.array([ref, alt]))
counts = ht.aggregate(hl.agg.take(ht.gt_stats, 1))

return _format_results(counts, genome_build, f'{chrom}-{pos}-{ref}-{alt}')
ac, hom = _get_variant_counts(locus, ref, alt, genome_build)

liftover_genome_build = GENOME_VERSION_GRCh38 if genome_build == GENOME_VERSION_GRCh37 else GENOME_VERSION_GRCh37
liftover_locus = hl.liftover(locus, liftover_genome_build)
lift_ac, lift_hom = _get_variant_counts(liftover_locus, ref, alt, liftover_genome_build)

if lift_ac and not ac:
lifted = hl.eval(liftover_locus)
chrom = lifted.contig
pos = lifted.position
genome_build = liftover_genome_build
genome_build = genome_build.replace('GRCh', '')
url = f'{SEQR_BASE_URL}summary_data/variant_lookup?genomeVersion={genome_build}&variantId={chrom}-{pos}-{ref}-{alt}'

return _format_results(ac+lift_ac, hom+lift_hom, url)


def _parse_match_query(query: dict) -> tuple[str, int, str, str, str]:
Expand Down Expand Up @@ -72,22 +79,34 @@ def _parse_match_query(query: dict) -> tuple[str, int, str, str, str]:
return chrom, start, query['referenceBases'], query['alternateBases'], genome_build


def _format_results(counts: hl.Struct, genome_build: str, variant_id: str) -> dict:
def _get_variant_counts(locus: hl.LocusExpression, ref: str, alt: str, genome_build: str) -> hl.Struct:
interval = hl.eval(hl.interval(locus, locus, includes_start=True, includes_end=True))
ht = hl.read_table(
f'{VLM_DATA_DIR}/{genome_build}/SNV_INDEL/annotations.ht', _intervals=[interval], _filter_intervals=True,
)
ht = ht.filter(ht.alleles == hl.array([ref, alt]))

counts = ht.aggregate(hl.agg.take(ht.gt_stats, 1))
return (counts[0].AC, counts[0].hom) if counts else (0, 0)


def _format_results(ac: int, hom: int, url: str) -> dict:
total = ac - hom # Homozygotes count twice toward the total AC
result_sets = [
('Homozygous', counts[0].hom),
('Heterozygous', counts[0].AC - (counts[0].hom * 2)),
] if counts else []
('Homozygous', hom),
('Heterozygous', total - hom),
] if ac else []
return {
'beaconHandovers': [
{
'handoverType': BEACON_HANDOVER_TYPE,
'url': f'{SEQR_BASE_URL}summary_data/variant_lookup?genomeVersion={genome_build.replace("GRCh", "")}&variantId={variant_id}',
'url': url,
}
],
'meta': BEACON_META,
'responseSummary': {
'exists': bool(counts),
'total': (counts[0].AC - counts[0].hom) if counts else 0
'exists': bool(ac),
'total': total
},
'response': {
'resultSets': [
Expand Down
22 changes: 14 additions & 8 deletions vlm/test_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,32 +44,29 @@ async def test_match(self):
},
'responseSummary': {
'exists': True,
'total': 24,
'total': 30,
},
'response': {
'resultSets': [
{
'exists': True,
'id': 'TestVLM Homozygous',
'results': [],
'resultsCount': 4,
'resultsCount': 7,
'setType': 'genomicVariant'
},
{
'exists': True,
'id': 'TestVLM Heterozygous',
'results': [],
'resultsCount': 20,
'resultsCount': 23,
'setType': 'genomicVariant'
},
],
}
})

async with self.client.request('GET', '/vlm/match?assemblyId=hg19&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 200)
resp_json = await resp.json()
self.assertDictEqual(resp_json, {
only_37_response = {
'beaconHandovers': [
{
'handoverType': {
Expand Down Expand Up @@ -111,7 +108,16 @@ async def test_match(self):
},
],
}
})
}
async with self.client.request('GET', '/vlm/match?assemblyId=hg19&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 200)
resp_json = await resp.json()
self.assertDictEqual(resp_json, only_37_response)

async with self.client.request('GET', '/vlm/match?assemblyId=GRCh38&referenceName=chr7&start=143573079&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 200)
resp_json = await resp.json()
self.assertDictEqual(resp_json, only_37_response)

async with self.client.request('GET', '/vlm/match?assemblyId=hg38&referenceName=chr7&start=143270172&referenceBases=A&alternateBases=G') as resp:
self.assertEqual(resp.status, 200)
Expand Down
11 changes: 10 additions & 1 deletion vlm/web_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@
import os
import traceback

from vlm.match import get_variant_match
from vlm.match import get_variant_match, GENOME_VERSION_GRCh38, GENOME_VERSION_GRCh37

logger = logging.getLogger(__name__)

JAVA_OPTS_XSS = os.environ.get('JAVA_OPTS_XSS')
MACHINE_MEM = os.environ.get('MACHINE_MEM')
JVM_MEMORY_FRACTION = 0.9

LIFTOVER_DIR = f'{os.path.dirname(os.path.abspath(__file__))}/liftover_references'


def _handle_exception(e, request):
logger.error(f'{request.headers.get("From")} "{e}"')
Expand Down Expand Up @@ -47,6 +49,13 @@ async def init_web_app():
{f'spark.{field}.extraJavaOptions': f'-Xss{JAVA_OPTS_XSS}' for field in ['driver', 'executor']})
hl.init(idempotent=True, spark_conf=spark_conf or None)

rg37 = hl.get_reference(GENOME_VERSION_GRCh37)
rg38 = hl.get_reference(GENOME_VERSION_GRCh38)
if not rg37.has_liftover(rg38):
rg37.add_liftover(f'{LIFTOVER_DIR}/grch37_to_grch38.over.chain.gz', rg38)
if not rg38.has_liftover(rg37):
rg38.add_liftover(f'{LIFTOVER_DIR}/grch38_to_grch37.over.chain.gz', rg37)

app = web.Application(middlewares=[error_middleware], client_max_size=(1024 ** 2) * 10)
app.add_routes([
web.get('/vlm/match', match),
Expand Down

0 comments on commit 1c463cb

Please sign in to comment.