Skip to content

Commit

Permalink
Merge pull request #4170 from broadinstitute/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
hanars authored Jun 17, 2024
2 parents 5344b0c + b580f5b commit d0545e7
Show file tree
Hide file tree
Showing 70 changed files with 531 additions and 274 deletions.
1 change: 1 addition & 0 deletions hail_search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
PATHOGENICTY_HGMD_SORT_KEY = 'pathogenicity_hgmd'
ABSENT_PATH_SORT_OFFSET = 12.5
CONSEQUENCE_SORT = 'protein_consequence'
ALPHAMISSENSE_SORT = 'alphamissense'
OMIM_SORT = 'in_omim'

ALT_ALT = 'alt_alt'
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.128-eead8100a1c1
Created at 2024/02/26 15:45:13
Created at 2024/06/10 16:51:30
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.128-eead8100a1c1
Created at 2024/04/18 10:25:58
Created at 2024/06/14 15:14:52
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
6 changes: 5 additions & 1 deletion hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,13 +1027,17 @@ def _sort_order(self, ht):
sort_expressions = self._get_sort_expressions(ht, self._sort) + sort_expressions
return sort_expressions

@staticmethod
def _format_prediction_sort_value(value):
    """Build the sort expression for a single prediction score.

    The score is cast to float64 and negated; when the resulting expression
    is missing, 0 is used instead.
    """
    # NOTE(review): negation presumably makes an ascending sort list higher
    # scores first, with missing scores (0) after any positive score - confirm.
    negated_score = -hl.float64(value)
    return hl.or_else(negated_score, 0)

def _get_sort_expressions(self, ht, sort):
if sort in self.SORTS:
return self.SORTS[sort](ht)

if sort in self.PREDICTION_FIELDS_CONFIG:
prediction_path = self.PREDICTION_FIELDS_CONFIG[sort]
return [hl.or_else(-hl.float64(ht[prediction_path.source][prediction_path.field]), 0)]
return [self._format_prediction_sort_value(ht[prediction_path.source][prediction_path.field])]

if sort == OMIM_SORT:
return self._omim_sort(ht, hl.set(set(self._sort_metadata)))
Expand Down
5 changes: 5 additions & 0 deletions hail_search/queries/ont_snv_indel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from hail_search.queries.base import BaseHailTableQuery
from hail_search.queries.snv_indel import SnvIndelHailTableQuery
from hail_search.queries.snv_indel_37 import SnvIndelHailTableQuery37


class OntSnvIndelHailTableQuery(SnvIndelHailTableQuery):
Expand All @@ -15,3 +16,7 @@ def _get_loaded_filter_ht(self, *args, **kwargs):

def _add_project_lookup_data(self, *args, **kwargs):
    """Always reject the call: variant lookup is not supported for ONT data.

    Raises:
        HTTPBadRequest: unconditionally, regardless of the arguments passed.
    """
    unsupported_reason = 'Variant lookup is not supported for ONT data'
    raise HTTPBadRequest(reason=unsupported_reason)

@staticmethod
def _get_allowed_transcripts_filter(allowed_consequence_ids):
    """Return the transcript filter built by SnvIndelHailTableQuery37.

    Delegates directly to SnvIndelHailTableQuery37 rather than going through
    the immediate parent class's implementation.
    """
    build_filter = SnvIndelHailTableQuery37._get_allowed_transcripts_filter
    return build_filter(allowed_consequence_ids)
30 changes: 29 additions & 1 deletion hail_search/queries/snv_indel.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from collections import OrderedDict
import hail as hl

from hail_search.constants import GENOME_VERSION_GRCh38, SCREEN_KEY, PREFILTER_FREQ_CUTOFF
from hail_search.constants import GENOME_VERSION_GRCh38, SCREEN_KEY, PREFILTER_FREQ_CUTOFF, ALPHAMISSENSE_SORT
from hail_search.queries.base import BaseHailTableQuery, PredictionPath
from hail_search.queries.snv_indel_37 import SnvIndelHailTableQuery37

EXTENDED_SPLICE_REGION_CONSEQUENCE = 'extended_intronic_splice_region_variant'


class SnvIndelHailTableQuery(SnvIndelHailTableQuery37):

Expand All @@ -24,6 +26,32 @@ class SnvIndelHailTableQuery(SnvIndelHailTableQuery37):
('is_gt_5_percent', 0.05),
('is_gt_10_percent', 0.1),
])
SORTS = {
**SnvIndelHailTableQuery37.SORTS,
ALPHAMISSENSE_SORT: lambda r: [
SnvIndelHailTableQuery37._format_prediction_sort_value(
hl.min(r.sorted_transcript_consequences.map(lambda t: t.alphamissense.pathogenicity))
),
SnvIndelHailTableQuery37._format_prediction_sort_value(r.selected_transcript.alphamissense.pathogenicity),
],
}

def _get_allowed_consequence_ids(self, annotations):
    """Collect the allowed consequence ids, including extended splice if requested.

    Starts from the parent implementation's result and adds
    EXTENDED_SPLICE_REGION_CONSEQUENCE when it is listed under the
    'extended_splice_site' key of ``annotations``.
    """
    allowed_ids = super()._get_allowed_consequence_ids(annotations)
    # A missing or falsy 'extended_splice_site' entry is treated as empty.
    requested_extended_splice = annotations.get('extended_splice_site') or []
    if EXTENDED_SPLICE_REGION_CONSEQUENCE in requested_extended_splice:
        allowed_ids.add(EXTENDED_SPLICE_REGION_CONSEQUENCE)
    return allowed_ids

@staticmethod
def _get_allowed_transcripts_filter(allowed_consequence_ids):
    """Build a per-transcript filter for the allowed consequence ids.

    EXTENDED_SPLICE_REGION_CONSEQUENCE is removed from the ids before they are
    handed to SnvIndelHailTableQuery37's filter builder. If it was present, the
    returned filter also accepts transcripts whose
    ``spliceregion.extended_intronic_splice_region_variant`` flag is set.
    """
    extended_splice_only = {EXTENDED_SPLICE_REGION_CONSEQUENCE}
    include_extended_splice = EXTENDED_SPLICE_REGION_CONSEQUENCE in allowed_consequence_ids
    base_filter = SnvIndelHailTableQuery37._get_allowed_transcripts_filter(
        allowed_consequence_ids - extended_splice_only
    )

    if include_extended_splice:
        def _base_or_extended_splice(tc):
            # Accept the transcript if either the base filter or the flag matches.
            return base_filter(tc) | tc.spliceregion.extended_intronic_splice_region_variant

        return _base_or_extended_splice

    return base_filter

def _get_annotation_override_filters(self, ht, annotation_overrides):
annotation_filters = super()._get_annotation_override_filters(ht, annotation_overrides)
Expand Down
2 changes: 2 additions & 0 deletions hail_search/queries/snv_indel_37.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ class SnvIndelHailTableQuery37(MitoHailTableQuery):
PATHOGENICITY_FIELD_MAP = {}
ANNOTATION_OVERRIDE_FIELDS = [SPLICE_AI_FIELD]

CORE_FIELDS = MitoHailTableQuery.CORE_FIELDS + ['CAID']

LIFTOVER_ANNOTATION_FIELDS = {}
BASE_ANNOTATION_FIELDS = {
k: v for k, v in MitoHailTableQuery.BASE_ANNOTATION_FIELDS.items()
Expand Down
63 changes: 40 additions & 23 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@
'transcripts': {},
'mainTranscriptId': None,
'selectedMainTranscriptId': None,
'sortedMotifFeatureConsequences': None,
'sortedRegulatoryFeatureConsequences': None,
'_sort': [1000010146],
'CAID': 'CA520798130',
}

GRCH37_VARIANT = {
Expand Down Expand Up @@ -119,6 +122,7 @@
'mainTranscriptId': 'ENST00000420911',
'selectedMainTranscriptId': None,
'_sort': [7143270172],
'CAID': 'CA4540310',
}

FAMILY_3_VARIANT = deepcopy(VARIANT3)
Expand All @@ -134,11 +138,12 @@
MULTI_FAMILY_VARIANT['familyGuids'] += FAMILY_3_VARIANT['familyGuids']
MULTI_FAMILY_VARIANT['genotypes'].update(FAMILY_3_VARIANT['genotypes'])

SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT = {**MULTI_FAMILY_VARIANT, 'selectedMainTranscriptId': 'ENST00000497611'}
SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT = {**MULTI_FAMILY_VARIANT, 'selectedMainTranscriptId': 'ENST00000426137'}
SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_3 = {**VARIANT3, 'selectedMainTranscriptId': 'ENST00000426137'}
SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2 = {**VARIANT2, 'selectedMainTranscriptId': 'ENST00000641759'}
MULTI_DATA_TYPE_COMP_HET_VARIANT2 = {**VARIANT2, 'selectedMainTranscriptId': 'ENST00000641820'}
SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT = {**MULTI_FAMILY_VARIANT, 'selectedMainTranscriptId': 'ENST00000426137'}
SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT = {**MULTI_FAMILY_VARIANT, 'selectedMainTranscriptId': 'ENST00000497611'}
SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_4 = {**VARIANT4, 'selectedMainTranscriptId': 'ENST00000350997'}
SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_3 = {**VARIANT3, 'selectedMainTranscriptId': 'ENST00000497611'}
SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2 = {**VARIANT2, 'selectedMainTranscriptId': 'ENST00000459627'}
MULTI_DATA_TYPE_COMP_HET_VARIANT2 = {**VARIANT2, 'selectedMainTranscriptId': 'ENST00000450625'}

PROJECT_2_VARIANT1 = deepcopy(VARIANT1)
PROJECT_2_VARIANT1['familyGuids'] = ['F000011_11']
Expand Down Expand Up @@ -848,20 +853,20 @@ async def test_annotations_filter(self):
'structural_consequence': ['INTRONIC', 'LOF'],
}
await self._assert_expected_search(
[VARIANT1, VARIANT2, VARIANT4, MITO_VARIANT2, MITO_VARIANT3], pathogenicity=pathogenicity,
[VARIANT1, VARIANT2, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_4, MITO_VARIANT2, MITO_VARIANT3], pathogenicity=pathogenicity,
annotations=annotations, sample_data=FAMILY_2_ALL_SAMPLE_DATA,
)

await self._assert_expected_search(
[VARIANT2, VARIANT4, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], annotations=annotations,
[VARIANT2, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_4, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], annotations=annotations,
)

await self._assert_expected_search([SV_VARIANT1], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA)

annotations['splice_ai'] = '0.005'
annotations['structural'] = ['gCNV_DUP', 'DEL']
await self._assert_expected_search(
[VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4, GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4],
[VARIANT2, MULTI_FAMILY_VARIANT, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_4, GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4],
annotations=annotations,
)

Expand All @@ -885,22 +890,28 @@ async def test_annotations_filter(self):
pathogenicity=pathogenicity, annotations=annotations, sample_data=FAMILY_2_ALL_SAMPLE_DATA,
)

annotations['extended_splice_site'] = ['extended_intronic_splice_region_variant']
await self._assert_expected_search(
[VARIANT1, VARIANT3, VARIANT4, MITO_VARIANT1, MITO_VARIANT3],
pathogenicity=pathogenicity, annotations=annotations, sample_data=FAMILY_2_ALL_SAMPLE_DATA,
)

async def test_secondary_annotations_filter(self):
annotations_1 = {'missense': ['missense_variant']}
annotations_2 = {'other': ['intron_variant']}

await self._assert_expected_search(
[[VARIANT3, VARIANT4]], inheritance_mode='compound_het', omit_sample_type='SV_WES',
[[VARIANT3, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_4]], inheritance_mode='compound_het', omit_sample_type='SV_WES',
annotations=annotations_1, annotations_secondary=annotations_2,
)

await self._assert_expected_search(
[VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES',
[VARIANT2, [VARIANT3, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_4]], inheritance_mode='recessive', omit_sample_type='SV_WES',
annotations=annotations_1, annotations_secondary=annotations_2,
)

await self._assert_expected_search(
[[VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES',
[[VARIANT3, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_4]], inheritance_mode='recessive', omit_sample_type='SV_WES',
annotations=annotations_2, annotations_secondary=annotations_1,
)

Expand Down Expand Up @@ -935,7 +946,7 @@ async def test_secondary_annotations_filter(self):
)

await self._assert_expected_search(
[VARIANT2, [MULTI_DATA_TYPE_COMP_HET_VARIANT2, GCNV_VARIANT4], [VARIANT3, VARIANT4], GCNV_VARIANT3, [GCNV_VARIANT3, GCNV_VARIANT4]],
[VARIANT2, [MULTI_DATA_TYPE_COMP_HET_VARIANT2, GCNV_VARIANT4], [VARIANT3, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_4], GCNV_VARIANT3, [GCNV_VARIANT3, GCNV_VARIANT4]],
inheritance_mode='recessive',
annotations={**annotations_1, **gcnv_annotations_1}, annotations_secondary={**annotations_2, **gcnv_annotations_2},
)
Expand All @@ -955,7 +966,7 @@ async def test_secondary_annotations_filter(self):

pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting']}
await self._assert_expected_search(
[VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES',
[VARIANT2, [VARIANT3, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_4]], inheritance_mode='recessive', omit_sample_type='SV_WES',
annotations=annotations_2, annotations_secondary=annotations_1, pathogenicity=pathogenicity,
)

Expand Down Expand Up @@ -1069,8 +1080,8 @@ async def test_search_errors(self):

async def test_sort(self):
await self._assert_expected_search(
[_sorted(VARIANT2, [11, 11]), _sorted(VARIANT4, [11, 11]), _sorted(MITO_VARIANT2, [11, 11]),
_sorted(MITO_VARIANT3, [17, 17]), _sorted(MITO_VARIANT1, [22, 22]), _sorted(VARIANT3, [22, 24]),
[_sorted(VARIANT4, [2, 2]), _sorted(MITO_VARIANT2, [11, 11]), _sorted(VARIANT2, [12, 12]),
_sorted(MITO_VARIANT3, [17, 17]), _sorted(MITO_VARIANT1, [22, 22]), _sorted(VARIANT3, [26, 27]),
_sorted(VARIANT1, [None, None])], sample_data=FAMILY_2_ALL_SAMPLE_DATA, sort='protein_consequence',
)

Expand All @@ -1080,9 +1091,9 @@ async def test_sort(self):
)

await self._assert_expected_search(
[_sorted(GCNV_VARIANT2, [4.5, 0]), _sorted(GCNV_VARIANT3, [4.5, 0]), _sorted(GCNV_VARIANT4, [4.5, 0]),
_sorted(GCNV_VARIANT1, [4.5, 3]), _sorted(VARIANT2, [11, 11]), _sorted(VARIANT4, [11, 11]),
_sorted(MULTI_FAMILY_VARIANT, [22, 24]), _sorted(VARIANT1, [None, None])], sort='protein_consequence',
[_sorted(VARIANT4, [2, 2]), _sorted(GCNV_VARIANT2, [4.5, 0]), _sorted(GCNV_VARIANT3, [4.5, 0]), _sorted(GCNV_VARIANT4, [4.5, 0]),
_sorted(GCNV_VARIANT1, [4.5, 3]), _sorted(VARIANT2, [12, 12]),
_sorted(MULTI_FAMILY_VARIANT, [26, 27]), _sorted(VARIANT1, [None, None])], sort='protein_consequence',
)

await self._assert_expected_search(
Expand All @@ -1091,8 +1102,8 @@ async def test_sort(self):
)

await self._assert_expected_search(
[_sorted(VARIANT4, [11, 11]), _sorted(SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [11, 22]),
_sorted(SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT, [22, 22])],
[_sorted(VARIANT4, [2, 2]), _sorted(SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [12, 26]),
_sorted(SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT, [26, 26])],
omit_sample_type='SV_WES', sort='protein_consequence',
annotations={'other': ['non_coding_transcript_exon_variant'], 'splice_ai': '0'},
)
Expand Down Expand Up @@ -1153,6 +1164,11 @@ async def test_sort(self):
_sorted(VARIANT1, [0])], omit_sample_type='SV_WES', sort='splice_ai',
)

await self._assert_expected_search(
[_sorted(VARIANT2, [-0.9977999925613403, -0.9977999925613403]), _sorted(VARIANT1, [0, 0]),
_sorted(MULTI_FAMILY_VARIANT, [0, 0]), _sorted(VARIANT4, [0, 0])], omit_sample_type='SV_WES', sort='alphamissense',
)

sort = 'in_omim'
await self._assert_expected_search(
[_sorted(MULTI_FAMILY_VARIANT, [0, -2]), _sorted(VARIANT2, [0, -1]), _sorted(VARIANT4, [0, -1]), _sorted(VARIANT1, [1, 0])],
Expand Down Expand Up @@ -1225,9 +1241,10 @@ async def test_sort(self):

async def test_multi_data_type_comp_het_sort(self):
await self._assert_expected_search(
[_sorted(GCNV_VARIANT3, [4.5, 0]), [_sorted(GCNV_VARIANT3, [0]), _sorted(GCNV_VARIANT4, [0])],
[_sorted(GCNV_VARIANT4, [4.5, 0]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [11, 11])],
_sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]],
[[_sorted(VARIANT4, [2, 2]), _sorted(VARIANT3, [26, 27])],
_sorted(GCNV_VARIANT3, [4.5, 0]), [_sorted(GCNV_VARIANT3, [0]), _sorted(GCNV_VARIANT4, [0])],
[_sorted(GCNV_VARIANT4, [4.5, 0]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [12, 12])],
_sorted(VARIANT2, [12, 12])],
sort='protein_consequence', inheritance_mode='recessive', **COMP_HET_ALL_PASS_FILTERS,
)

Expand Down
Loading

0 comments on commit d0545e7

Please sign in to comment.