Skip to content

Commit

Permalink
Merge pull request #4157 from broadinstitute/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
hanars authored Jun 11, 2024
2 parents 7b7f1a1 + 163215b commit 5344b0c
Show file tree
Hide file tree
Showing 26 changed files with 674 additions and 365 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## dev

## 6/11/24
* Add "Partial Phenotype Contribution" functional tag (REQUIRES DB MIGRATION)

## 5/24/24
* Adds external_data to Family model (REQUIRES DB MIGRATION)
* Adds post_discovery_mondo_id to Family model (REQUIRES DB MIGRATION)
Expand Down
29 changes: 18 additions & 11 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ class BaseHailTableQuery(object):
'transcripts': {
'response_key': 'transcripts',
'empty_array': True,
'format_value': lambda value: value.rename({k: _to_camel_case(k) for k in value.keys()}),
'format_array_values': lambda values, *args: values.group_by(lambda t: t.geneId),
},
}
Expand Down Expand Up @@ -166,6 +165,10 @@ def _format_enum_response(self, k, enum):
value = lambda r: self._format_enum(r, k, enum, ht_globals=self._globals, **enum_config)
return enum_config.get('response_key', _to_camel_case(k)), value

@staticmethod
def _camelcase_value(value):
return value.rename({k: _to_camel_case(k) for k in value.keys()})

@classmethod
def _format_enum(cls, r, field, enum, empty_array=False, format_array_values=None, **kwargs):
if hasattr(r, f'{field}_id'):
Expand All @@ -175,29 +178,33 @@ def _format_enum(cls, r, field, enum, empty_array=False, format_array_values=Non
if hasattr(value, 'map'):
if empty_array:
value = hl.or_else(value, hl.empty_array(value.dtype.element_type))
value = value.map(lambda x: cls._enum_field(field, x, enum, **kwargs))
value = value.map(lambda x: cls._enum_field(field, x, enum, **kwargs, format_value=cls._camelcase_value))
if format_array_values:
value = format_array_values(value, r)
return value

return cls._enum_field(field, value, enum, **kwargs)

@staticmethod
def _enum_field(field_name, value, enum, ht_globals=None, annotate_value=None, format_value=None, drop_fields=None, enum_keys=None, include_version=False, **kwargs):
@classmethod
def _enum_field(cls, field_name, value, enum, ht_globals=None, annotate_value=None, format_value=None, drop_fields=None, enum_keys=None, include_version=False, **kwargs):
annotations = {}
drop = [] + (drop_fields or [])
value_keys = value.keys()
for field in (enum_keys or enum.keys()):
field_enum = enum[field]
is_nested_struct = field in value_keys
is_array = f'{field}_ids' in value_keys
value_field = f"{field}_id{'s' if is_array else ''}"
drop.append(value_field)

enum_array = hl.array(field_enum)
if is_array:
annotations[f'{field}s'] = value[value_field].map(lambda v: enum_array[v])
if is_nested_struct:
annotations[field] = cls._enum_field(field, value[field], field_enum, format_value=format_value)
else:
annotations[field] = enum_array[value[value_field]]
value_field = f"{field}_id{'s' if is_array else ''}"
drop.append(value_field)
enum_array = hl.array(field_enum)
if is_array:
annotations[f'{field}s'] = value[value_field].map(lambda v: enum_array[v])
else:
annotations[field] = enum_array[value[value_field]]

if include_version:
annotations['version'] = ht_globals['versions'][field_name]
Expand Down Expand Up @@ -642,7 +649,7 @@ def _parse_intervals(self, intervals, gene_ids=None, **kwargs):
return parsed_intervals

def _should_add_chr_prefix(self):
return True
return self.GENOME_VERSION == GENOME_VERSION_GRCh38

def _filter_by_frequency(self, ht, frequencies, pathogenicity):
frequencies = {k: v for k, v in (frequencies or {}).items() if k in self.POPULATIONS}
Expand Down
3 changes: 3 additions & 0 deletions hail_search/queries/mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ class MitoHailTableQuery(BaseHailTableQuery):
**BaseHailTableQuery.ENUM_ANNOTATION_FIELDS['transcripts'],
'annotate_value': lambda transcript, *args: {'major_consequence': transcript.consequence_terms.first()},
'drop_fields': ['consequence_terms'],
'format_array_values': lambda values, *args: BaseHailTableQuery.ENUM_ANNOTATION_FIELDS['transcripts']['format_array_values'](values).map_values(
lambda transcripts: hl.enumerate(transcripts).starmap(lambda i, t: t.annotate(transcriptRank=i))
),
}
}

Expand Down
118 changes: 9 additions & 109 deletions hail_search/queries/snv_indel.py
Original file line number Diff line number Diff line change
@@ -1,135 +1,35 @@
from collections import OrderedDict
import hail as hl

from hail_search.constants import CLINVAR_KEY, CLINVAR_MITO_KEY, HGMD_KEY, HGMD_PATH_RANGES, \
GNOMAD_GENOMES_FIELD, PREFILTER_FREQ_CUTOFF, PATH_FREQ_OVERRIDE_CUTOFF, PATHOGENICTY_SORT_KEY, PATHOGENICTY_HGMD_SORT_KEY, \
SCREEN_KEY, SPLICE_AI_FIELD
from hail_search.queries.base import PredictionPath, QualityFilterFormat
from hail_search.queries.mito import MitoHailTableQuery
from hail_search.constants import GENOME_VERSION_GRCh38, SCREEN_KEY, PREFILTER_FREQ_CUTOFF
from hail_search.queries.base import BaseHailTableQuery, PredictionPath
from hail_search.queries.snv_indel_37 import SnvIndelHailTableQuery37


class SnvIndelHailTableQuery(MitoHailTableQuery):
class SnvIndelHailTableQuery(SnvIndelHailTableQuery37):

DATA_TYPE = 'SNV_INDEL'

GENOTYPE_FIELDS = {f.lower(): f for f in ['DP', 'GQ', 'AB']}
QUALITY_FILTER_FORMAT = {
'AB': QualityFilterFormat(override=lambda gt: ~gt.GT.is_het(), scale=100),
}
POPULATIONS = {
'seqr': {'hom': 'hom', 'hemi': None, 'het': None, 'sort': 'callset_af'},
'topmed': {'hemi': None},
'exac': {
'filter_af': 'AF_POPMAX', 'ac': 'AC_Adj', 'an': 'AN_Adj', 'hom': 'AC_Hom', 'hemi': 'AC_Hemi',
'het': 'AC_Het',
},
'gnomad_exomes': {'filter_af': 'AF_POPMAX_OR_GLOBAL', 'het': None, 'sort': 'gnomad_exomes'},
GNOMAD_GENOMES_FIELD: {'filter_af': 'AF_POPMAX_OR_GLOBAL', 'het': None, 'sort': 'gnomad'},
}
PREDICTION_FIELDS_CONFIG_ALL_BUILDS = {
'cadd': PredictionPath('cadd', 'PHRED'),
'eigen': PredictionPath('eigen', 'Eigen_phred'),
'mpc': PredictionPath('mpc', 'MPC'),
'primate_ai': PredictionPath('primate_ai', 'score'),
SPLICE_AI_FIELD: PredictionPath(SPLICE_AI_FIELD, 'delta_score'),
'splice_ai_consequence': PredictionPath(SPLICE_AI_FIELD, 'splice_consequence'),
'mut_taster': PredictionPath('dbnsfp', 'MutationTaster_pred'),
'polyphen': PredictionPath('dbnsfp', 'Polyphen2_HVAR_score'),
'revel': PredictionPath('dbnsfp', 'REVEL_score'),
'sift': PredictionPath('dbnsfp', 'SIFT_score'),
}
PREDICTION_FIELDS_CONFIG_38 = {
GENOME_VERSION = GENOME_VERSION_GRCh38
PREDICTION_FIELDS_CONFIG = {
**SnvIndelHailTableQuery37.PREDICTION_FIELDS_CONFIG,
'fathmm': PredictionPath('dbnsfp', 'fathmm_MKL_coding_score'),
'mut_pred': PredictionPath('dbnsfp', 'MutPred_score'),
'vest': PredictionPath('dbnsfp', 'VEST4_score'),
'gnomad_noncoding': PredictionPath('gnomad_non_coding_constraint', 'z_score'),
}
PREDICTION_FIELDS_CONFIG = {
**PREDICTION_FIELDS_CONFIG_ALL_BUILDS,
**PREDICTION_FIELDS_CONFIG_38
}
PATHOGENICITY_FILTERS = {
**MitoHailTableQuery.PATHOGENICITY_FILTERS,
HGMD_KEY: ('class', HGMD_PATH_RANGES),
}
PATHOGENICITY_FIELD_MAP = {}
ANNOTATION_OVERRIDE_FIELDS = [SPLICE_AI_FIELD, SCREEN_KEY]

BASE_ANNOTATION_FIELDS = {
k: v for k, v in MitoHailTableQuery.BASE_ANNOTATION_FIELDS.items()
if k not in MitoHailTableQuery.MITO_ANNOTATION_FIELDS
}
ENUM_ANNOTATION_FIELDS = {
**MitoHailTableQuery.ENUM_ANNOTATION_FIELDS,
'screen': {
'response_key': 'screenRegionType',
'format_value': lambda value: value.region_types.first(),
},
}
ENUM_ANNOTATION_FIELDS[CLINVAR_KEY] = ENUM_ANNOTATION_FIELDS.pop(CLINVAR_MITO_KEY)

SORTS = {
**MitoHailTableQuery.SORTS,
PATHOGENICTY_SORT_KEY: lambda r: [MitoHailTableQuery.CLINVAR_SORT(CLINVAR_KEY, r)],
PATHOGENICTY_HGMD_SORT_KEY: lambda r: [MitoHailTableQuery.CLINVAR_SORT(CLINVAR_KEY, r), r.hgmd.class_id],
}

LIFTOVER_ANNOTATION_FIELDS = BaseHailTableQuery.LIFTOVER_ANNOTATION_FIELDS
ANNOTATION_OVERRIDE_FIELDS = SnvIndelHailTableQuery37.ANNOTATION_OVERRIDE_FIELDS + [SCREEN_KEY]
FREQUENCY_PREFILTER_FIELDS = OrderedDict([
(True, PREFILTER_FREQ_CUTOFF),
('is_gt_3_percent', 0.03),
('is_gt_5_percent', 0.05),
('is_gt_10_percent', 0.1),
])

def _prefilter_entries_table(self, ht, *args, **kwargs):
ht = super()._prefilter_entries_table(ht, *args, **kwargs)
if 'variant_ht' not in self._load_table_kwargs and not self._load_table_kwargs.get('_filter_intervals'):
af_ht = self._get_loaded_filter_ht(
GNOMAD_GENOMES_FIELD, 'high_af_variants.ht', self._get_gnomad_af_prefilter, **kwargs)
if af_ht:
ht = ht.filter(hl.is_missing(af_ht[ht.key]))
return ht

def _get_gnomad_af_prefilter(self, frequencies=None, pathogenicity=None, **kwargs):
gnomad_genomes_filter = (frequencies or {}).get(GNOMAD_GENOMES_FIELD, {})
af_cutoff = gnomad_genomes_filter.get('af')
if af_cutoff is None and gnomad_genomes_filter.get('ac') is not None:
af_cutoff = PREFILTER_FREQ_CUTOFF
if af_cutoff is None:
return False

af_cutoff_field = self._get_af_prefilter_field(af_cutoff)
if af_cutoff_field is None:
return False

af_filter = True if af_cutoff_field is True else lambda ht: ht[af_cutoff_field]

if af_cutoff < PATH_FREQ_OVERRIDE_CUTOFF:
clinvar_path_ht = self._get_loaded_clinvar_prefilter_ht(pathogenicity)
if clinvar_path_ht is not False:
path_cutoff_field = self._get_af_prefilter_field(PATH_FREQ_OVERRIDE_CUTOFF)
non_clinvar_filter = lambda ht: hl.is_missing(clinvar_path_ht[ht.key])
if af_filter is not True:
non_clinvar_filter = lambda ht: non_clinvar_filter(ht) & af_filter(ht)
af_filter = lambda ht: ht[path_cutoff_field] | non_clinvar_filter(ht)

return af_filter

def _get_af_prefilter_field(self, af_cutoff):
return next((field for field, cutoff in self.FREQUENCY_PREFILTER_FIELDS.items() if af_cutoff <= cutoff), None)

def _get_annotation_override_filters(self, ht, annotation_overrides):
annotation_filters = super()._get_annotation_override_filters(ht, annotation_overrides)

if annotation_overrides.get(SCREEN_KEY):
allowed_consequences = hl.set(self._get_enum_terms_ids(SCREEN_KEY.lower(), 'region_type', annotation_overrides[SCREEN_KEY]))
annotation_filters.append(allowed_consequences.contains(ht.screen.region_type_ids.first()))
if annotation_overrides.get(SPLICE_AI_FIELD):
score_filter, _ = self._get_in_silico_filter(ht, SPLICE_AI_FIELD, annotation_overrides[SPLICE_AI_FIELD])
annotation_filters.append(score_filter)

return annotation_filters

@staticmethod
def _stat_has_non_ref(s):
return (s.het_samples > 0) | (s.hom_samples > 0)
117 changes: 110 additions & 7 deletions hail_search/queries/snv_indel_37.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,122 @@
from collections import OrderedDict
import hail as hl

from hail_search.constants import GENOME_VERSION_GRCh37, PREFILTER_FREQ_CUTOFF
from hail_search.queries.snv_indel import SnvIndelHailTableQuery
from hail_search.constants import CLINVAR_KEY, CLINVAR_MITO_KEY, HGMD_KEY, HGMD_PATH_RANGES, \
GNOMAD_GENOMES_FIELD, PREFILTER_FREQ_CUTOFF, PATH_FREQ_OVERRIDE_CUTOFF, PATHOGENICTY_SORT_KEY, PATHOGENICTY_HGMD_SORT_KEY, \
SPLICE_AI_FIELD, GENOME_VERSION_GRCh37
from hail_search.queries.base import PredictionPath, QualityFilterFormat
from hail_search.queries.mito import MitoHailTableQuery


class SnvIndelHailTableQuery37(SnvIndelHailTableQuery):
class SnvIndelHailTableQuery37(MitoHailTableQuery):

DATA_TYPE = 'SNV_INDEL'
GENOME_VERSION = GENOME_VERSION_GRCh37
PREDICTION_FIELDS_CONFIG = SnvIndelHailTableQuery.PREDICTION_FIELDS_CONFIG_ALL_BUILDS

GENOTYPE_FIELDS = {f.lower(): f for f in ['DP', 'GQ', 'AB']}
QUALITY_FILTER_FORMAT = {
'AB': QualityFilterFormat(override=lambda gt: ~gt.GT.is_het(), scale=100),
}
POPULATIONS = {
'seqr': {'hom': 'hom', 'hemi': None, 'het': None, 'sort': 'callset_af'},
'topmed': {'hemi': None},
'exac': {
'filter_af': 'AF_POPMAX', 'ac': 'AC_Adj', 'an': 'AN_Adj', 'hom': 'AC_Hom', 'hemi': 'AC_Hemi',
'het': 'AC_Het',
},
'gnomad_exomes': {'filter_af': 'AF_POPMAX_OR_GLOBAL', 'het': None, 'sort': 'gnomad_exomes'},
GNOMAD_GENOMES_FIELD: {'filter_af': 'AF_POPMAX_OR_GLOBAL', 'het': None, 'sort': 'gnomad'},
}
PREDICTION_FIELDS_CONFIG = {
'cadd': PredictionPath('cadd', 'PHRED'),
'eigen': PredictionPath('eigen', 'Eigen_phred'),
'mpc': PredictionPath('mpc', 'MPC'),
'primate_ai': PredictionPath('primate_ai', 'score'),
SPLICE_AI_FIELD: PredictionPath(SPLICE_AI_FIELD, 'delta_score'),
'splice_ai_consequence': PredictionPath(SPLICE_AI_FIELD, 'splice_consequence'),
'mut_taster': PredictionPath('dbnsfp', 'MutationTaster_pred'),
'polyphen': PredictionPath('dbnsfp', 'Polyphen2_HVAR_score'),
'revel': PredictionPath('dbnsfp', 'REVEL_score'),
'sift': PredictionPath('dbnsfp', 'SIFT_score'),
}
PATHOGENICITY_FILTERS = {
**MitoHailTableQuery.PATHOGENICITY_FILTERS,
HGMD_KEY: ('class', HGMD_PATH_RANGES),
}
PATHOGENICITY_FIELD_MAP = {}
ANNOTATION_OVERRIDE_FIELDS = [SPLICE_AI_FIELD]

LIFTOVER_ANNOTATION_FIELDS = {}
ANNOTATION_OVERRIDE_FIELDS = SnvIndelHailTableQuery.ANNOTATION_OVERRIDE_FIELDS[:-1]
BASE_ANNOTATION_FIELDS = {
k: v for k, v in MitoHailTableQuery.BASE_ANNOTATION_FIELDS.items()
if k not in MitoHailTableQuery.MITO_ANNOTATION_FIELDS
}
ENUM_ANNOTATION_FIELDS = {
**MitoHailTableQuery.ENUM_ANNOTATION_FIELDS,
'screen': {
'response_key': 'screenRegionType',
'format_value': lambda value: value.region_types.first(),
},
}
ENUM_ANNOTATION_FIELDS[CLINVAR_KEY] = ENUM_ANNOTATION_FIELDS.pop(CLINVAR_MITO_KEY)

SORTS = {
**MitoHailTableQuery.SORTS,
PATHOGENICTY_SORT_KEY: lambda r: [MitoHailTableQuery.CLINVAR_SORT(CLINVAR_KEY, r)],
PATHOGENICTY_HGMD_SORT_KEY: lambda r: [MitoHailTableQuery.CLINVAR_SORT(CLINVAR_KEY, r), r.hgmd.class_id],
}

FREQUENCY_PREFILTER_FIELDS = OrderedDict([
(True, PREFILTER_FREQ_CUTOFF),
('is_gt_10_percent', 0.1),
])

def _should_add_chr_prefix(self):
return False
def _prefilter_entries_table(self, ht, *args, **kwargs):
ht = super()._prefilter_entries_table(ht, *args, **kwargs)
if 'variant_ht' not in self._load_table_kwargs and not self._load_table_kwargs.get('_filter_intervals'):
af_ht = self._get_loaded_filter_ht(
GNOMAD_GENOMES_FIELD, 'high_af_variants.ht', self._get_gnomad_af_prefilter, **kwargs)
if af_ht:
ht = ht.filter(hl.is_missing(af_ht[ht.key]))
return ht

def _get_gnomad_af_prefilter(self, frequencies=None, pathogenicity=None, **kwargs):
gnomad_genomes_filter = (frequencies or {}).get(GNOMAD_GENOMES_FIELD, {})
af_cutoff = gnomad_genomes_filter.get('af')
if af_cutoff is None and gnomad_genomes_filter.get('ac') is not None:
af_cutoff = PREFILTER_FREQ_CUTOFF
if af_cutoff is None:
return False

af_cutoff_field = self._get_af_prefilter_field(af_cutoff)
if af_cutoff_field is None:
return False

af_filter = True if af_cutoff_field is True else lambda ht: ht[af_cutoff_field]

if af_cutoff < PATH_FREQ_OVERRIDE_CUTOFF:
clinvar_path_ht = self._get_loaded_clinvar_prefilter_ht(pathogenicity)
if clinvar_path_ht is not False:
path_cutoff_field = self._get_af_prefilter_field(PATH_FREQ_OVERRIDE_CUTOFF)
non_clinvar_filter = lambda ht: hl.is_missing(clinvar_path_ht[ht.key])
if af_filter is not True:
non_clinvar_filter = lambda ht: non_clinvar_filter(ht) & af_filter(ht)
af_filter = lambda ht: ht[path_cutoff_field] | non_clinvar_filter(ht)

return af_filter

def _get_af_prefilter_field(self, af_cutoff):
return next((field for field, cutoff in self.FREQUENCY_PREFILTER_FIELDS.items() if af_cutoff <= cutoff), None)

def _get_annotation_override_filters(self, ht, annotation_overrides):
annotation_filters = super()._get_annotation_override_filters(ht, annotation_overrides)

if annotation_overrides.get(SPLICE_AI_FIELD):
score_filter, _ = self._get_in_silico_filter(ht, SPLICE_AI_FIELD, annotation_overrides[SPLICE_AI_FIELD])
annotation_filters.append(score_filter)

return annotation_filters

@staticmethod
def _stat_has_non_ref(s):
return (s.het_samples > 0) | (s.hom_samples > 0)
2 changes: 1 addition & 1 deletion hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
'ENSG00000176227': [
{'aminoAcids': None, 'canonical': 1, 'codons': None, 'geneId': 'ENSG00000176227',
'hgvsc': 'ENST00000447022.1:n.1354A>G', 'hgvsp': None,
'transcriptId': 'ENST00000447022', 'isLofNagnag': None, 'transcriptRank': 1,
'transcriptId': 'ENST00000447022', 'isLofNagnag': None, 'transcriptRank': 0,
'biotype': 'processed_pseudogene', 'lofFilters': None, 'majorConsequence': 'non_coding_transcript_exon_variant'},
],
},
Expand Down
Loading

0 comments on commit 5344b0c

Please sign in to comment.