
Commit e7144d0

Merge pull request #4206 from broadinstitute/dev
Dev
hanars authored Jun 27, 2024
2 parents 3166cf6 + 64b0520 commit e7144d0
Showing 51 changed files with 306 additions and 169 deletions.
@@ -4,20 +4,20 @@ on:
     inputs:
       environment:
         type: choice
-        options:
+        options:
           - dev
           - prod
       reference_genome:
         type: choice
         description: Reference Genome
-        options:
+        options:
           - GRCh37
           - GRCh38
         required: true
       dataset_type:
         type: choice
         description: Dataset Type
-        options:
+        options:
           - SNV_INDEL
           - MITO
           - GCNV
@@ -28,6 +28,9 @@ on:
       volume_handle:
         required: true
 
+env:
+  CHART_NAME: "${{ inputs.environment == 'dev' && 'dev-' || '' }}broad-seqr"
+
 jobs:
   helm_update:
     runs-on: ubuntu-latest
@@ -46,13 +49,13 @@ jobs:
         uses: mikefarah/yq@v4.22.1
         with:
           cmd: >
-            yq -i '.global.hail_search.datasetVersions.${{ inputs.reference_genome }}/${{ inputs.dataset_type }} = "${{ inputs.version }}"' charts/broad-seqr/values-${{ inputs.environment }}.yaml
+            yq -i '.global.hail_search.datasetVersions.${{ inputs.reference_genome }}/${{ inputs.dataset_type }} = "${{ inputs.version }}"' charts/${{ env.CHART_NAME }}/values.yaml
       - name: update volume handle in the broad-seqr chart
         uses: mikefarah/yq@v4.22.1
         with:
           cmd: >
-            yq -i '.hail-search.persistentVolume.volumeHandle = "${{ inputs.volume_handle }}"' charts/broad-seqr/values-${{ inputs.environment }}.yaml
+            yq -i '.hail-search.persistentVolume.volumeHandle = "${{ inputs.volume_handle }}"' charts/${{ env.CHART_NAME }}/values.yaml
       - name: Commit and Push changes
         uses: Andro999b/push@v1.3
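The net effect of the workflow change: instead of a single broad-seqr chart with per-environment values files (values-dev.yaml, values-prod.yaml), each environment now resolves to its own chart directory with a single values.yaml, selected by the new CHART_NAME env var. A minimal Python sketch of how the GitHub Actions expression resolves (the function name is illustrative, not part of the PR):

# Sketch only: mirrors `${{ inputs.environment == 'dev' && 'dev-' || '' }}broad-seqr`.
# GitHub's `cond && a || b` idiom acts like a conditional expression here.
def chart_values_path(environment: str) -> str:
    chart_name = ('dev-' if environment == 'dev' else '') + 'broad-seqr'
    return f'charts/{chart_name}/values.yaml'

assert chart_values_path('dev') == 'charts/dev-broad-seqr/values.yaml'
assert chart_values_path('prod') == 'charts/broad-seqr/values.yaml'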
6 changes: 5 additions & 1 deletion hail_search/constants.py
@@ -15,6 +15,10 @@
 SPLICE_AI_FIELD = 'splice_ai'
 NEW_SV_FIELD = 'new_structural_variants'
 SCREEN_KEY = 'SCREEN'  # uses all caps to match filter provided by the seqr UI
+UTR_ANNOTATOR_KEY = 'UTRAnnotator'
+EXTENDED_SPLICE_KEY = 'extended_splice_site'
+MOTIF_FEATURES_KEY = 'motif_feature'
+REGULATORY_FEATURES_KEY = 'regulatory_feature'
 CLINVAR_KEY = 'clinvar'
 CLINVAR_MITO_KEY = 'clinvar_mito'
 HGMD_KEY = 'hgmd'
@@ -23,7 +27,7 @@
 GENOTYPES_FIELD = 'genotypes'
 
 ANNOTATION_OVERRIDE_FIELDS = [
-    SCREEN_KEY, SPLICE_AI_FIELD, NEW_SV_FIELD, STRUCTURAL_ANNOTATION_FIELD,
+    SCREEN_KEY, SPLICE_AI_FIELD, NEW_SV_FIELD, STRUCTURAL_ANNOTATION_FIELD, MOTIF_FEATURES_KEY, REGULATORY_FEATURES_KEY,
 ]
 ALLOWED_TRANSCRIPTS = 'allowed_transcripts'
 ALLOWED_SECONDARY_TRANSCRIPTS = 'allowed_transcripts_secondary'
2 binary files not shown.
@@ -1,3 +1,3 @@
 This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
-Written with version 0.2.126-ee77707f4fab
-Created at 2024/01/24 11:38:19
+Written with version 0.2.128-eead8100a1c1
+Created at 2024/06/27 14:14:27
8 binary files not shown.
@@ -1,3 +1,3 @@
 This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
 Written with version 0.2.128-eead8100a1c1
-Created at 2024/04/03 17:41:01
+Created at 2024/06/27 14:07:54
6 binary files not shown.
10 changes: 7 additions & 3 deletions hail_search/queries/base.py
@@ -148,16 +148,20 @@ def population_expression(self, r, population):
             for response_key, field in pop_config.items() if field is not None
         })
 
-    def _get_enum_lookup(self, field, subfield):
+    def _get_enum_lookup(self, field, subfield, nested_subfield=None):
         enum_field = self._enums.get(field, {})
         if subfield:
             enum_field = enum_field.get(subfield)
+            if nested_subfield:
+                enum_field = enum_field.get(nested_subfield)
         if enum_field is None:
             return None
         return {v: i for i, v in enumerate(enum_field)}
 
-    def _get_enum_terms_ids(self, field, subfield, terms):
-        enum = self._get_enum_lookup(field, subfield)
+    def _get_enum_terms_ids(self, field, subfield, terms, nested_subfield=None):
+        if not terms:
+            return set()
+        enum = self._get_enum_lookup(field, subfield, nested_subfield=nested_subfield)
         return {enum[t] for t in terms if enum.get(t) is not None}
 
     def _format_enum_response(self, k, enum):
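For context, _get_enum_lookup inverts an enum's ordered value list into a value-to-index map, and the new nested_subfield argument reaches one level deeper into the enum metadata (used below for the UTRAnnotator enums nested under transcripts); _get_enum_terms_ids now also short-circuits on empty terms. A self-contained sketch of the behavior — the enum data here is invented, only the field names mirror the diff:

ENUMS = {
    'sorted_transcript_consequences': {
        'consequence_term': ['missense_variant', 'intron_variant'],
        'utrannotator': {'fiveutr_consequence': ['5_prime_UTR_stop_codon_loss_variant']},
    },
}

def get_enum_lookup(field, subfield, nested_subfield=None):
    enum_field = ENUMS.get(field, {})
    if subfield:
        enum_field = enum_field.get(subfield)
        if nested_subfield:
            enum_field = enum_field.get(nested_subfield)
    if enum_field is None:
        return None
    return {v: i for i, v in enumerate(enum_field)}  # value -> index id

def get_enum_terms_ids(field, subfield, terms, nested_subfield=None):
    if not terms:  # new early return: no terms, no lookup
        return set()
    enum = get_enum_lookup(field, subfield, nested_subfield=nested_subfield)
    return {enum[t] for t in terms if enum.get(t) is not None}

assert get_enum_terms_ids(
    'sorted_transcript_consequences', 'utrannotator',
    ['5_prime_UTR_stop_codon_loss_variant'], nested_subfield='fiveutr_consequence',
) == {0}
assert get_enum_terms_ids('sorted_transcript_consequences', 'consequence_term', []) == set()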
2 changes: 1 addition & 1 deletion hail_search/queries/mito.py
@@ -331,7 +331,7 @@ def _add_project_lookup_data(self, ht, annotation_fields, *args, **kwargs):
 
         annotation_fields.update({
             'familyGenotypes': lambda r: hl.dict(r.family_entries.map(
-                lambda entries: (entries.first().familyGuid, entries.map(self._get_sample_genotype))
+                lambda entries: (entries.first().familyGuid, entries.filter(hl.is_defined).map(self._get_sample_genotype))
             )),
         })
 
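The one-line fix filters undefined entries out of each family's genotype list before _get_sample_genotype runs. A pure-Python analogue with invented data (None standing in for a Hail missing value):

family_entries = [{'familyGuid': 'F000002_2'}, None, {'familyGuid': 'F000002_2'}]

def get_sample_genotype(entry):
    return entry['familyGuid']  # stand-in; would raise TypeError on None

# before: [get_sample_genotype(e) for e in family_entries] -> TypeError
genotypes = [get_sample_genotype(e) for e in family_entries if e is not None]
assert genotypes == ['F000002_2', 'F000002_2']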
11 changes: 10 additions & 1 deletion hail_search/queries/ont_snv_indel.py
@@ -1,5 +1,6 @@
 from aiohttp.web import HTTPBadRequest
 
+from hail_search.constants import EXTENDED_SPLICE_KEY, UTR_ANNOTATOR_KEY, SCREEN_KEY
 from hail_search.queries.base import BaseHailTableQuery
 from hail_search.queries.snv_indel import SnvIndelHailTableQuery
 from hail_search.queries.snv_indel_37 import SnvIndelHailTableQuery37
@@ -10,13 +11,21 @@ class OntSnvIndelHailTableQuery(SnvIndelHailTableQuery):
     DATA_TYPE = 'ONT_SNV_INDEL'
 
     CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS
+    ANNOTATION_OVERRIDE_FIELDS = SnvIndelHailTableQuery37.ANNOTATION_OVERRIDE_FIELDS + [SCREEN_KEY]
 
     def _get_loaded_filter_ht(self, *args, **kwargs):
         return None
 
     def _add_project_lookup_data(self, *args, **kwargs):
         raise HTTPBadRequest(reason='Variant lookup is not supported for ONT data')
 
+    def _get_allowed_consequence_ids(self, annotations):
+        return super()._get_allowed_consequence_ids({
+            k: v for k, v in annotations.items() if k not in {EXTENDED_SPLICE_KEY, UTR_ANNOTATOR_KEY}
+        })
+
     @staticmethod
     def _get_allowed_transcripts_filter(allowed_consequence_ids):
-        return SnvIndelHailTableQuery37._get_allowed_transcripts_filter(allowed_consequence_ids)
+        return SnvIndelHailTableQuery37._get_allowed_transcripts_filter(
+            allowed_consequence_ids.get(SnvIndelHailTableQuery37.TRANSCRIPT_CONSEQUENCE_FIELD)
+        )
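Because _get_allowed_consequence_ids now returns a dict keyed by consequence source (see snv_indel.py below), the ONT subclass strips the keys its data cannot support before delegating to the parent, and unwraps only the transcript-consequence entry when building the transcript filter. A sketch of the key-stripping step with an invented annotations payload:

EXTENDED_SPLICE_KEY = 'extended_splice_site'
UTR_ANNOTATOR_KEY = 'UTRAnnotator'

annotations = {
    'missense': ['missense_variant'],  # invented example input
    UTR_ANNOTATOR_KEY: ['5_prime_UTR_stop_codon_loss_variant'],
    EXTENDED_SPLICE_KEY: ['extended_intronic_splice_region_variant'],
}
ont_annotations = {
    k: v for k, v in annotations.items() if k not in {EXTENDED_SPLICE_KEY, UTR_ANNOTATOR_KEY}
}
assert ont_annotations == {'missense': ['missense_variant']}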
58 changes: 46 additions & 12 deletions hail_search/queries/snv_indel.py
@@ -1,7 +1,8 @@
 from collections import OrderedDict
 import hail as hl
 
-from hail_search.constants import GENOME_VERSION_GRCh38, SCREEN_KEY, PREFILTER_FREQ_CUTOFF, ALPHAMISSENSE_SORT
+from hail_search.constants import GENOME_VERSION_GRCh38, SCREEN_KEY, PREFILTER_FREQ_CUTOFF, ALPHAMISSENSE_SORT, \
+    UTR_ANNOTATOR_KEY, EXTENDED_SPLICE_KEY, MOTIF_FEATURES_KEY, REGULATORY_FEATURES_KEY
 from hail_search.queries.base import BaseHailTableQuery, PredictionPath
 from hail_search.queries.snv_indel_37 import SnvIndelHailTableQuery37
 
@@ -19,7 +20,9 @@ class SnvIndelHailTableQuery(SnvIndelHailTableQuery37):
         'gnomad_noncoding': PredictionPath('gnomad_non_coding_constraint', 'z_score'),
     }
     LIFTOVER_ANNOTATION_FIELDS = BaseHailTableQuery.LIFTOVER_ANNOTATION_FIELDS
-    ANNOTATION_OVERRIDE_FIELDS = SnvIndelHailTableQuery37.ANNOTATION_OVERRIDE_FIELDS + [SCREEN_KEY]
+    ANNOTATION_OVERRIDE_FIELDS = SnvIndelHailTableQuery37.ANNOTATION_OVERRIDE_FIELDS + [
+        SCREEN_KEY, MOTIF_FEATURES_KEY, REGULATORY_FEATURES_KEY,
+    ]
     FREQUENCY_PREFILTER_FIELDS = OrderedDict([
         (True, PREFILTER_FREQ_CUTOFF),
         ('is_gt_3_percent', 0.03),
@@ -37,21 +40,42 @@ class SnvIndelHailTableQuery(SnvIndelHailTableQuery37):
     }
 
     def _get_allowed_consequence_ids(self, annotations):
-        consequence_ids = super()._get_allowed_consequence_ids(annotations)
-        if EXTENDED_SPLICE_REGION_CONSEQUENCE in (annotations.get('extended_splice_site') or []):
-            consequence_ids.add(EXTENDED_SPLICE_REGION_CONSEQUENCE)
-        return consequence_ids
+        parsed_allowed_consequences = {}
+        allowed_consequence_ids = super()._get_allowed_consequence_ids(annotations)
+        if allowed_consequence_ids:
+            parsed_allowed_consequences[self.TRANSCRIPT_CONSEQUENCE_FIELD] = allowed_consequence_ids
+
+        utr_consequence_ids = self._get_enum_terms_ids(
+            self.TRANSCRIPTS_FIELD, subfield='utrannotator', nested_subfield='fiveutr_consequence',
+            terms=(annotations.get(UTR_ANNOTATOR_KEY) or []),
+        )
+        if utr_consequence_ids:
+            parsed_allowed_consequences[UTR_ANNOTATOR_KEY] = utr_consequence_ids
+
+        if EXTENDED_SPLICE_REGION_CONSEQUENCE in (annotations.get(EXTENDED_SPLICE_KEY) or []):
+            parsed_allowed_consequences[EXTENDED_SPLICE_REGION_CONSEQUENCE] = True
+
+        return parsed_allowed_consequences
 
     @staticmethod
     def _get_allowed_transcripts_filter(allowed_consequence_ids):
-        has_extended_splice = EXTENDED_SPLICE_REGION_CONSEQUENCE in allowed_consequence_ids
-        allowed_consequence_ids = allowed_consequence_ids - {EXTENDED_SPLICE_REGION_CONSEQUENCE}
-        allowed_consequence_filter = SnvIndelHailTableQuery37._get_allowed_transcripts_filter(allowed_consequence_ids)
+        allowed_consequence_filters = []
 
-        if not has_extended_splice:
-            return allowed_consequence_filter
+        consequence_ids = allowed_consequence_ids.get(SnvIndelHailTableQuery37.TRANSCRIPT_CONSEQUENCE_FIELD)
+        if consequence_ids:
+            allowed_consequence_filters.append(SnvIndelHailTableQuery37._get_allowed_transcripts_filter(consequence_ids))
 
-        return lambda tc: allowed_consequence_filter(tc) | tc.spliceregion.extended_intronic_splice_region_variant
+        utr_consequences = allowed_consequence_ids.get(UTR_ANNOTATOR_KEY)
+        if utr_consequences:
+            utr_consequences = hl.set(utr_consequences)
+            allowed_consequence_filters.append(lambda tc: utr_consequences.contains(tc.utrannotator.fiveutr_consequence_id))
+
+        if allowed_consequence_ids.get(EXTENDED_SPLICE_REGION_CONSEQUENCE):
+            allowed_consequence_filters.append(lambda tc: tc.spliceregion.extended_intronic_splice_region_variant)
+
+        return allowed_consequence_filters[0] if len(allowed_consequence_filters) == 1 else lambda tc: hl.any([
+            f(tc) for f in allowed_consequence_filters
+        ])
 
     def _get_annotation_override_filters(self, ht, annotation_overrides):
         annotation_filters = super()._get_annotation_override_filters(ht, annotation_overrides)
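The transcript filter now ORs together up to three independent predicates (VEP consequence ids, UTRAnnotator five-prime-UTR consequences, extended intronic splice region), returning the lone predicate unwrapped when only one applies. A pure-Python sketch of that combination step, with plain dicts and any() standing in for Hail struct expressions and hl.any:

def combine_filters(filters):
    # mirrors: filters[0] if len(filters) == 1 else lambda tc: hl.any([f(tc) for f in filters])
    return filters[0] if len(filters) == 1 else lambda tc: any(f(tc) for f in filters)

is_missense = lambda tc: tc['consequence'] == 'missense_variant'
in_extended_splice = lambda tc: tc['extended_splice']

combined = combine_filters([is_missense, in_extended_splice])
assert combined({'consequence': 'intron_variant', 'extended_splice': True})
assert not combined({'consequence': 'intron_variant', 'extended_splice': False})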
@@ -60,4 +84,14 @@ def _get_annotation_override_filters(self, ht, annotation_overrides):
             allowed_consequences = hl.set(self._get_enum_terms_ids(SCREEN_KEY.lower(), 'region_type', annotation_overrides[SCREEN_KEY]))
             annotation_filters.append(allowed_consequences.contains(ht.screen.region_type_ids.first()))
 
+        for feature_key in [MOTIF_FEATURES_KEY, REGULATORY_FEATURES_KEY]:
+            if annotation_overrides.get(feature_key):
+                field = f'sorted_{feature_key}_consequences'
+                allowed_consequences = hl.set(self._get_enum_terms_ids(
+                    field, self.TRANSCRIPT_CONSEQUENCE_FIELD, annotation_overrides[feature_key]),
+                )
+                annotation_filters.append(
+                    ht[field].any(lambda c: c.consequence_term_ids.any(allowed_consequences.contains))
+                )
+
         return annotation_filters
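Each new feature override mirrors the SCREEN filter one level deeper: requested terms are resolved to enum ids, then a row passes if any struct in its sorted_motif_feature_consequences / sorted_regulatory_feature_consequences array carries an allowed term id. A pure-Python sketch with invented ids and data:

allowed_ids = {3}  # invented enum id, e.g. for 'TF_binding_site_variant'
row = {
    'sorted_motif_feature_consequences': [
        {'consequence_term_ids': [1, 3]},
        {'consequence_term_ids': [2]},
    ],
}
# analogue of ht[field].any(lambda c: c.consequence_term_ids.any(allowed_consequences.contains))
passes = any(
    any(term_id in allowed_ids for term_id in c['consequence_term_ids'])
    for c in row['sorted_motif_feature_consequences']
)
assert passes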
13 changes: 8 additions & 5 deletions hail_search/test_search.py
@@ -82,9 +82,6 @@
     'I000004_hg00731': {
         'sampleId': 'HG00731', 'sampleType': 'WGS', 'individualGuid': 'I000004_hg00731',
         'familyGuid': 'F000002_2', 'numAlt': 2, 'dp': 16, 'gq': 48, 'ab': 1,
-    }, 'I000005_hg00732': {
-        'sampleId': 'HG00732', 'sampleType': 'WGS', 'individualGuid': 'I000005_hg00732',
-        'familyGuid': 'F000002_2', 'numAlt': 0, 'dp': 2, 'gq': 6, 'ab': 0,
     }, 'I000006_hg00733': {
         'sampleId': 'HG00733', 'sampleType': 'WGS', 'individualGuid': 'I000006_hg00733',
         'familyGuid': 'F000002_2', 'numAlt': 1, 'dp': 49, 'gq': 99, 'ab': 0.6530612111091614,
@@ -837,9 +834,10 @@ async def test_annotations_filter(self):
         )
 
         pathogenicity['clinvar'] = pathogenicity['clinvar'][:1]
-        annotations = {'SCREEN': ['CTCF-only', 'DNase-only']}
+        annotations = {'SCREEN': ['CTCF-only', 'DNase-only'], 'UTRAnnotator': ['5_prime_UTR_stop_codon_loss_variant']}
+        selected_transcript_variant_2 = {**VARIANT2, 'selectedMainTranscriptId': 'ENST00000408919'}
         await self._assert_expected_search(
-            [VARIANT1, VARIANT4, MITO_VARIANT3], pathogenicity=pathogenicity, annotations=annotations,
+            [VARIANT1, selected_transcript_variant_2, VARIANT4, MITO_VARIANT3], pathogenicity=pathogenicity, annotations=annotations,
             sample_data=FAMILY_2_ALL_SAMPLE_DATA,
         )
 
@@ -896,6 +894,11 @@ async def test_annotations_filter(self):
             pathogenicity=pathogenicity, annotations=annotations, sample_data=FAMILY_2_ALL_SAMPLE_DATA,
         )
 
+        annotations = {'motif_feature': ['TF_binding_site_variant'], 'regulatory_feature': ['regulatory_region_variant']}
+        await self._assert_expected_search(
+            [VARIANT3, VARIANT4], annotations=annotations, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA,
+        )
+
     async def test_secondary_annotations_filter(self):
         annotations_1 = {'missense': ['missense_variant']}
         annotations_2 = {'other': ['intron_variant']}
4 changes: 2 additions & 2 deletions requirements-dev.txt
@@ -53,7 +53,7 @@ pytz==2022.7.1
     # django
 rcssmin==1.1.1
     # via django-compressor
-requests==2.32.0
+requests==2.32.2
     # via
     #   -c requirements.txt
     #   responses
@@ -74,7 +74,7 @@ tomli==2.0.1
     #   pyproject-hooks
 types-toml==0.10.8.5
     # via responses
-urllib3==1.26.18
+urllib3==1.26.19
     # via
     #   -c requirements.txt
     #   requests
2 changes: 1 addition & 1 deletion requirements.txt
@@ -175,7 +175,7 @@ tenacity==8.3.0
     # via -r requirements.in
 tqdm==4.66.3
     # via -r requirements.in
-urllib3==1.26.18
+urllib3==1.26.19
     # via
     #   elasticsearch
     #   requests
1 change: 1 addition & 0 deletions seqr/views/apis/analysis_group_api.py
@@ -55,6 +55,7 @@ def update_analysis_group_handler(request, project_guid, analysis_group_guid=None):
     valid_families = set()
 
     def _validate_families(request_json):
+        request_json.pop('uploadedFamilyIds', None)
         family_guids = request_json.pop('familyGuids')
         families = Family.objects.filter(guid__in=family_guids).only('guid')
         if len(families) != len(family_guids):
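The fix discards the uploadedFamilyIds bookkeeping field the frontend now sends alongside familyGuids, so only validated fields survive into the model update. A minimal sketch of the pop behavior, with the payload shape taken from the test below:

request_json = {
    'name': 'new_analysis_group',
    'familyGuids': ['F000001_1', 'F000002_2'],
    'uploadedFamilyIds': {'info': ['Uploaded 2 families'], 'parsedData': [['F000001_1'], ['F000002_2']]},
}
request_json.pop('uploadedFamilyIds', None)  # discard client-side bookkeeping
family_guids = request_json.pop('familyGuids')  # consumed for validation
assert request_json == {'name': 'new_analysis_group'}
assert family_guids == ['F000001_1', 'F000002_2']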
4 changes: 3 additions & 1 deletion seqr/views/apis/analysis_group_api_tests.py
@@ -30,7 +30,9 @@ def test_create_update_and_delete_analysis_group(self):
 
         # send valid request to create analysis_group
         response = self.client.post(create_analysis_group_url, content_type='application/json', data=json.dumps({
-            'name': 'new_analysis_group', 'familyGuids': ['F000001_1', 'F000002_2']
+            'name': 'new_analysis_group', 'familyGuids': ['F000001_1', 'F000002_2'], 'uploadedFamilyIds': {
+                'info': ["Uploaded 2 families"], 'parsedData': [['F000001_1'], ['F000002_2']],
+            },
         }))
         self.assertEqual(response.status_code, 200)
         new_analysis_group_response = response.json()
2 changes: 1 addition & 1 deletion seqr/views/apis/data_manager_api.py
@@ -300,7 +300,7 @@ def _save_sample_data(sample_key, sample_data):
     )
 
     if sample_guids_to_keys:
-        persist_temp_file(file_name_prefix, request.user, is_directory=True)
+        persist_temp_file(file_name_prefix, request.user)
 
     return create_json_response({
         'info': info,
2 changes: 1 addition & 1 deletion seqr/views/apis/data_manager_api_tests.py
@@ -1525,7 +1525,7 @@ def _get_expected_read_file_subprocess_calls(self, file_name, sample_guid):
 
     @staticmethod
     def _additional_expected_loading_subprocess_calls(file_path):
-        return [f'gsutil mv tmp/temp_uploads/{file_path}/* gs://seqr-scratch-temp/{file_path}']
+        return [f'gsutil mv tmp/temp_uploads/{file_path} gs://seqr-scratch-temp/{file_path}']
 
     def _assert_expected_es_status(self, response):
         self.assertEqual(response.status_code, 400)
22 changes: 17 additions & 5 deletions seqr/views/apis/summary_data_api.py
@@ -182,19 +182,31 @@ def bulk_update_family_external_analysis(request):
 
 def _load_aip_data(data: dict, user: User):
     category_map = data['metadata']['categories']
+    projects = data['metadata'].get('projects')
     results = data['results']
 
+    if not projects:
+        raise ErrorsWarningsException(['No projects specified in the metadata'])
+
-    family_id_map = dict(Individual.objects.filter(
-        family__project__in=get_internal_projects(), individual_id__in=results.keys(),
-    ).values_list('individual_id', 'family_id'))
+    family_id_map = defaultdict(list)
+    for individual_id, family_id in Individual.objects.filter(
+        family__project__in=get_internal_projects().filter(name__in=projects), individual_id__in=results.keys(),
+    ).values_list('individual_id', 'family_id'):
+        family_id_map[individual_id].append(family_id)
+    errors = []
     missing_individuals = set(results.keys()) - set(family_id_map.keys())
     if missing_individuals:
-        raise ErrorsWarningsException([f'Unable to find the following individuals: {", ".join(sorted(missing_individuals))}'])
+        errors.append(f'Unable to find the following individuals: {", ".join(sorted(missing_individuals))}')
+    multi_family_individuals = {individual_id for individual_id, families in family_id_map.items() if len(families) > 1}
+    if multi_family_individuals:
+        errors.append(f'The following individuals are found in multiple families: {", ".join(sorted(multi_family_individuals))}')
+    if errors:
+        raise ErrorsWarningsException(errors)
 
     family_variant_data = {}
     for family_id, variant_pred in results.items():
         family_variant_data.update({
-            (family_id_map[family_id], variant_id): pred for variant_id, pred in variant_pred.items()
+            (family_id_map[family_id][0], variant_id): pred for variant_id, pred in variant_pred.items()
         })
 
     today = datetime.now().strftime('%Y-%m-%d')
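The loader now scopes the individual lookup to the projects named in the upload metadata, collects all validation errors before raising, and rejects individuals whose id resolves to more than one family (previously an ambiguous id silently mapped to a single family). A self-contained sketch of the mapping and error collection, with invented query results:

from collections import defaultdict

# (individual_id, family_id) pairs standing in for the Django values_list query
query_results = [('NA20870', 10), ('NA20870', 11), ('NA20889', 12)]
results = {'NA20870': {}, 'NA20889': {}, 'SAM_123': {}}

family_id_map = defaultdict(list)
for individual_id, family_id in query_results:
    family_id_map[individual_id].append(family_id)

errors = []
missing = set(results) - set(family_id_map)
if missing:
    errors.append(f'Unable to find the following individuals: {", ".join(sorted(missing))}')
multi = {i for i, families in family_id_map.items() if len(families) > 1}
if multi:
    errors.append(f'The following individuals are found in multiple families: {", ".join(sorted(multi))}')

assert errors == [
    'Unable to find the following individuals: SAM_123',
    'The following individuals are found in multiple families: NA20870',
]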
12 changes: 11 additions & 1 deletion seqr/views/apis/summary_data_api_tests.py
@@ -505,9 +505,19 @@ def test_bulk_update_family_external_analysis(self, mock_load_uploaded_file, moc
         body['dataType'] = 'AIP'
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 400)
+        self.assertEqual(response.json()['errors'], ['No projects specified in the metadata'])
+
+        aip_upload['metadata']['projects'] = ['1kg project nåme with uniçøde', 'Test Reprocessed Project']
+        response = self.client.post(url, content_type='application/json', data=json.dumps(body))
+        self.assertEqual(response.status_code, 400)
         self.assertEqual(response.json()['errors'], ['Unable to find the following individuals: SAM_123'])
 
-        aip_upload['results']['NA20889'] = aip_upload['results'].pop('SAM_123')
+        aip_upload['results']['NA20870'] = aip_upload['results'].pop('SAM_123')
+        response = self.client.post(url, content_type='application/json', data=json.dumps(body))
+        self.assertEqual(response.status_code, 400)
+        self.assertEqual(response.json()['errors'], ['The following individuals are found in multiple families: NA20870'])
+
+        aip_upload['results']['NA20889'] = aip_upload['results'].pop('NA20870')
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 400)
         self.assertEqual(response.json()['errors'], [