Merge pull request #4446 from broadinstitute/dev
Dev
hanars authored Oct 22, 2024
2 parents cc21f6e + fd3b005 commit 77e8464
Showing 35 changed files with 330 additions and 296 deletions.
25 changes: 5 additions & 20 deletions hail_search/queries/base.py
@@ -624,43 +624,28 @@ def _annotate_families_inheritance(
                 if genotype:
                     entry_indices_by_gt[genotype][family_index].append(sample_index)
 
-        min_unaffected = None
         if inheritance_mode == COMPOUND_HET:
             family_unaffected_counts = [
                 len(i) for i in entry_indices_by_gt[INHERITANCE_FILTERS[COMPOUND_HET][UNAFFECTED_ID]].values()
             ]
             self.max_unaffected_samples = max(family_unaffected_counts) if family_unaffected_counts else 0
-            if self.max_unaffected_samples > 1:
-                min_unaffected = min(family_unaffected_counts)
 
         for genotype, entry_indices in entry_indices_by_gt.items():
             if not entry_indices:
                 continue
             entry_indices = hl.dict(entry_indices)
             ht = ht.annotate(**{
                 annotation: hl.enumerate(ht[entries_ht_field]).starmap(
-                    lambda i, family_samples: self._valid_genotype_family_entries(
-                        family_samples, entry_indices.get(i), genotype, min_unaffected
-                    )
+                    lambda family_i, family_samples: hl.or_missing(
+                        ~entry_indices.contains(family_i) | entry_indices[family_i].all(
+                            lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
+                        ), family_samples,
+                    ),
                 )
             })
 
         return ht
 
-    @classmethod
-    def _valid_genotype_family_entries(cls, entries: list, genotype_entry_indices, genotype: str, min_unaffected: int):
-        is_valid = hl.is_missing(genotype_entry_indices) | genotype_entry_indices.all(
-            lambda i: cls.GENOTYPE_QUERY_MAP[genotype](entries[i].GT)
-        )
-        if min_unaffected is not None and genotype == HAS_REF:
-            unaffected_filter = genotype_entry_indices.any(
-                lambda i: cls.GENOTYPE_QUERY_MAP[REF_REF](entries[i].GT)
-            )
-            if min_unaffected < 2:
-                unaffected_filter |= genotype_entry_indices.size() < 2
-            is_valid &= unaffected_filter
-        return hl.or_missing(is_valid, entries)
-
     def _get_family_passes_quality_filter(self, quality_filter, ht, **kwargs):
         quality_filter = quality_filter or {}
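
The refactor above drops the _valid_genotype_family_entries helper (and its min_unaffected handling) in favor of an inlined hl.or_missing expression: a family's entries survive only if that family is unconstrained for the genotype or every constrained sample matches. A minimal pure-Python sketch of that keep-or-null semantics, with plain lists and a toy predicate standing in for Hail expressions and GENOTYPE_QUERY_MAP:

    # Each family is a list of sample genotypes; entry_indices maps
    # family index -> sample indices that must satisfy the genotype predicate
    families = [['0/1', '0/0'], ['1/1', '0/1']]
    entry_indices = {0: [0], 1: [0]}
    is_het = lambda gt: gt == '0/1'  # toy stand-in for GENOTYPE_QUERY_MAP[genotype]

    annotated = [
        # keep the family when unconstrained or all constrained samples match,
        # else null it out (the plain-Python analogue of hl.or_missing)
        family if i not in entry_indices or all(is_het(family[s]) for s in entry_indices[i]) else None
        for i, family in enumerate(families)
    ]
    assert annotated == [['0/1', '0/0'], None]
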
5 changes: 4 additions & 1 deletion matchmaker/models.py
@@ -8,7 +8,10 @@
 
 class MatchmakerSubmission(ModelWithGUID):
 
-    SEX_LOOKUP = {Individual.SEX_MALE: 'MALE', Individual.SEX_FEMALE: 'FEMALE'}
+    SEX_LOOKUP = {
+        **{sex: 'MALE' for sex in Individual.MALE_SEXES},
+        **{sex: 'FEMALE' for sex in Individual.FEMALE_SEXES},
+    }
 
     individual = models.OneToOneField(Individual, on_delete=models.PROTECT)
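
The ** unpacking above flattens each sex-code list into one lookup, keeping the Matchmaker payload binary (MALE/FEMALE) while the Individual model now stores karyotype-level codes. Standalone illustration (the list values mirror the new Individual constants shown later in this diff):

    MALE_SEXES = ['M', 'XXY', 'XYY']
    FEMALE_SEXES = ['F', 'XXX', 'X0']
    SEX_LOOKUP = {
        **{sex: 'MALE' for sex in MALE_SEXES},
        **{sex: 'FEMALE' for sex in FEMALE_SEXES},
    }
    assert SEX_LOOKUP == {
        'M': 'MALE', 'XXY': 'MALE', 'XYY': 'MALE',
        'F': 'FEMALE', 'XXX': 'FEMALE', 'X0': 'FEMALE',
    }
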
4 changes: 2 additions & 2 deletions seqr/fixtures/1kg_project.json
@@ -444,7 +444,7 @@
         "individual_id": "NA19675_1",
         "mother_id": 3,
         "father_id": 2,
-        "sex": "M",
+        "sex": "XXY",
         "affected": "A",
         "display_name": "",
         "notes": "",
@@ -536,7 +536,7 @@
         "individual_id": "HG00731",
         "mother_id": 6,
         "father_id": 5,
-        "sex": "F",
+        "sex": "X0",
         "affected": "A",
         "proband_relationship": "S",
         "display_name": "HG00731_a",
8 changes: 4 additions & 4 deletions seqr/fixtures/report_variants.json
@@ -105,9 +105,9 @@
         "last_modified_date": "2018-05-31T16:36:02.805Z",
         "xpos": 19001912632,
         "xpos_end": 19001912632,
-        "ref": "GC",
-        "alt": "TT",
-        "variant_id": "19-1912632-GC-TT",
+        "ref": "G",
+        "alt": "C",
+        "variant_id": "19-1912632-G-C",
         "saved_variant_json": {
             "pos": 1912632,
             "end": 1912632,
@@ -116,7 +116,7 @@
             "genotypes": {
                 "I000004_hg00731": {"numAlt": 1}
             },
-            "variantId": "19-1912632-GC-TT",
+            "variantId": "19-1912632-G-C",
             "chrom": "19",
             "mainTranscriptId": "ENST00000371839",
             "transcripts": {
18 changes: 18 additions & 0 deletions seqr/migrations/0076_alter_individual_sex.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.13 on 2024-10-10 20:51
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('seqr', '0075_alter_individual_primary_biosample'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='individual',
+            name='sex',
+            field=models.CharField(choices=[('M', 'Male'), ('F', 'Female'), ('U', 'Unknown'), ('XXY', 'XXY'), ('XYY', 'XYY'), ('XXX', 'XXX'), ('X0', 'X0')], default='U', max_length=3),
+        ),
+    ]
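
The migration only widens the column and extends its choices, so applying it is typically a cheap metadata change. A hedged sketch of running it from Python rather than the CLI (assumes a configured DJANGO_SETTINGS_MODULE, as in any seqr management context):

    import django
    from django.core.management import call_command

    django.setup()
    # equivalent to `python manage.py migrate seqr 0076_alter_individual_sex`
    call_command('migrate', 'seqr', '0076_alter_individual_sex')
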
8 changes: 7 additions & 1 deletion seqr/models.py
@@ -447,10 +447,16 @@ class Individual(ModelWithGUID):
     SEX_MALE = 'M'
     SEX_FEMALE = 'F'
     SEX_UNKNOWN = 'U'
+    FEMALE_ANEUPLOIDIES = ['XXX', 'X0']
+    MALE_ANEUPLOIDIES = ['XXY', 'XYY']
+    FEMALE_SEXES = [SEX_FEMALE] + FEMALE_ANEUPLOIDIES
+    MALE_SEXES = [SEX_MALE] + MALE_ANEUPLOIDIES
     SEX_CHOICES = (
         (SEX_MALE, 'Male'),
         ('F', 'Female'),
         ('U', 'Unknown'),
+        *[(sex, sex) for sex in MALE_ANEUPLOIDIES],
+        *[(sex, sex) for sex in FEMALE_ANEUPLOIDIES],
     )
 
     AFFECTED_STATUS_AFFECTED = 'A'
@@ -603,7 +609,7 @@ class Individual(ModelWithGUID):
     mother = models.ForeignKey('seqr.Individual', null=True, blank=True, on_delete=models.SET_NULL, related_name='maternal_children')
     father = models.ForeignKey('seqr.Individual', null=True, blank=True, on_delete=models.SET_NULL, related_name='paternal_children')
 
-    sex = models.CharField(max_length=1, choices=SEX_CHOICES, default='U')
+    sex = models.CharField(max_length=3, choices=SEX_CHOICES, default='U')
     affected = models.CharField(max_length=1, choices=AFFECTED_STATUS_CHOICES, default=AFFECTED_STATUS_UNKNOWN)
 
     # TODO once sample and individual ids are fully decoupled no reason to maintain this field
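
Grouping the codes into MALE_SEXES/FEMALE_SEXES lets downstream logic ask "male-like or female-like?" without enumerating karyotypes. A standalone sketch of that idea (constants copied from the diff; the helper itself is hypothetical, not part of seqr):

    SEX_MALE, SEX_FEMALE = 'M', 'F'
    FEMALE_ANEUPLOIDIES = ['XXX', 'X0']
    MALE_ANEUPLOIDIES = ['XXY', 'XYY']
    FEMALE_SEXES = [SEX_FEMALE] + FEMALE_ANEUPLOIDIES
    MALE_SEXES = [SEX_MALE] + MALE_ANEUPLOIDIES

    def sex_group(sex):
        # hypothetical helper: collapse a stored code to the binary group used
        # by inheritance logic; 'U' and unrecognized values map to None
        if sex in MALE_SEXES:
            return 'MALE'
        if sex in FEMALE_SEXES:
            return 'FEMALE'
        return None

    assert sex_group('XYY') == 'MALE' and sex_group('U') is None
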
4 changes: 3 additions & 1 deletion seqr/utils/search/add_data_utils.py
@@ -116,7 +116,7 @@ def format_email(sample_summary, project_link, num_new_samples):
 
 def prepare_data_loading_request(projects: list[Project], sample_type: str, dataset_type: str, genome_version: str,
                                  data_path: str, user: User, pedigree_dir: str, raise_pedigree_error: bool = False,
-                                 individual_ids: list[str] = None):
+                                 individual_ids: list[str] = None, skip_validation: bool = False):
     project_guids = sorted([p.guid for p in projects])
     variables = {
         'projects_to_run': project_guids,
@@ -125,6 +125,8 @@ def prepare_data_loading_request(projects: list[Project], sample_type: str, data
         'dataset_type': _dag_dataset_type(sample_type, dataset_type),
         'reference_genome': GENOME_VERSION_LOOKUP[genome_version],
     }
+    if skip_validation:
+        variables['skip_validation'] = True
     file_path = _get_pedigree_path(pedigree_dir, genome_version, sample_type, dataset_type)
     _upload_data_loading_files(projects, user, file_path, individual_ids, raise_pedigree_error)
     return variables, file_path
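
Note that skip_validation is added to the DAG variables only when truthy, so default runs produce a payload identical to before this change. A standalone mirror of that opt-in pattern (not the seqr function itself):

    def build_variables(skip_validation=False):
        variables = {'projects_to_run': ['R0001_1kg'], 'reference_genome': 'GRCh38'}
        if skip_validation:
            variables['skip_validation'] = True
        return variables

    assert 'skip_validation' not in build_variables()
    assert build_variables(skip_validation=True)['skip_validation'] is True
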
2 changes: 1 addition & 1 deletion seqr/utils/search/hail_search_utils.py
@@ -141,7 +141,7 @@ def _get_sample_data(samples, inheritance_filter=None, inheritance_mode=None, **
         affected=F('individual__affected'),
     )
     if inheritance_mode == X_LINKED_RECESSIVE:
-        sample_values['is_male'] = Case(When(individual__sex=Individual.SEX_MALE, then=True), default=False)
+        sample_values['is_male'] = Case(When(individual__sex__in=Individual.MALE_SEXES, then=True), default=False)
     sample_data = samples.order_by('guid').values('individual__individual_id', 'dataset_type', 'sample_type', **sample_values)
 
     custom_affected = (inheritance_filter or {}).pop('affected', None)
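
Switching the When from an exact 'M' match to __in=Individual.MALE_SEXES means XXY and XYY individuals now count as male in X-linked recessive searches. The membership semantics, mirrored in plain Python (dicts stand in for the annotated queryset rows):

    MALE_SEXES = ['M', 'XXY', 'XYY']  # mirrors Individual.MALE_SEXES from this diff
    samples = [{'sex': 'M'}, {'sex': 'XXY'}, {'sex': 'F'}]
    # plain-Python analogue of Case(When(individual__sex__in=MALE_SEXES, then=True), default=False)
    annotated = [dict(s, is_male=s['sex'] in MALE_SEXES) for s in samples]
    assert [s['is_male'] for s in annotated] == [True, True, False]
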
2 changes: 1 addition & 1 deletion seqr/utils/search/search_utils_tests.py
@@ -550,7 +550,7 @@ def test_get_variant_query_gene_counts(self, mock_call):
     def test_cached_get_variant_query_gene_counts(self):
         super(HailSearchUtilsTests, self).test_cached_get_variant_query_gene_counts()
 
-        self.set_cache({'all_results': PARSED_COMPOUND_HET_VARIANTS_MULTI_PROJECT + [SV_VARIANT1], 'total_results': 3})
+        self.set_cache({'all_results': [PARSED_COMPOUND_HET_VARIANTS_MULTI_PROJECT] + [SV_VARIANT1], 'total_results': 2})
         gene_counts = get_variant_query_gene_counts(self.results_model, self.user)
         self.assertDictEqual(gene_counts, {
             'ENSG00000135953': {'total': 2, 'families': {'F000003_3': 2, 'F000011_11': 2}},
7 changes: 6 additions & 1 deletion seqr/utils/search/utils.py
@@ -314,7 +314,12 @@ def get_variant_query_gene_counts(search_model, user):
 
 def _get_gene_aggs_for_cached_variants(previous_search_results):
     gene_aggs = defaultdict(lambda: {'total': 0, 'families': defaultdict(int)})
-    for var in previous_search_results['all_results']:
+    # ES caches compound hets separately from main results, hail search caches everything together
+    flattened_variants = backend_specific_call(
+        lambda results: results,
+        lambda results: [v for variants in results for v in (variants if isinstance(variants, list) else [variants])],
+    )(previous_search_results['all_results'])
+    for var in flattened_variants:
         # ES only reports breakdown for main transcript gene only, hail backend reports for all genes
         gene_ids = backend_specific_call(
             lambda variant_transcripts: next((
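
The hail branch flattens one level of nesting because a cached compound-het result is itself a list of paired variants, while the ES branch passes results through untouched. The comprehension in isolation (dicts stand in for cached variant payloads):

    cached = [[{'id': 'comphet-a'}, {'id': 'comphet-b'}], {'id': 'single-1'}]
    flattened = [
        v for variants in cached
        for v in (variants if isinstance(variants, list) else [variants])
    ]
    assert [v['id'] for v in flattened] == ['comphet-a', 'comphet-b', 'single-1']
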
2 changes: 1 addition & 1 deletion seqr/views/apis/anvil_workspace_api_tests.py
@@ -730,7 +730,7 @@ def _assert_valid_operation(self, project, test_add_data=True):
         ['R0001_1kg', 'F000001_1', '1', 'NA19675_1', 'NA19678', '', 'F'],
         ['R0001_1kg', 'F000001_1', '1', 'NA19678', '', '', 'M'],
         ['R0001_1kg', 'F000001_1', '1', 'NA19679', '', '', 'F'],
-        ['R0001_1kg', 'F000002_2', '2', 'HG00731', 'HG00732', 'HG00733', 'F'],
+        ['R0001_1kg', 'F000002_2', '2', 'HG00731', 'HG00732', 'HG00733', 'X0'],
         ['R0001_1kg', 'F000002_2', '2', 'HG00732', '', '', 'M'],
         ['R0001_1kg', 'F000002_2', '2', 'HG00733', '', '', 'F'],
         ['R0001_1kg', 'F000003_3', '3', 'NA20870', '', '', 'M'],
11 changes: 7 additions & 4 deletions seqr/views/apis/data_manager_api.py
@@ -448,8 +448,9 @@ def load_phenotype_prioritization_data(request):
 @pm_or_data_manager_required
 def validate_callset(request):
     request_json = json.loads(request.body)
+    dataset_type = request_json.get('datasetType', Sample.DATASET_TYPE_VARIANT_CALLS)
     validate_vcf_exists(
-        _callset_path(request_json), request.user, allowed_exts=DATA_TYPE_FILE_EXTS.get(request_json['datasetType']),
+        _callset_path(request_json), request.user, allowed_exts=DATA_TYPE_FILE_EXTS.get(dataset_type),
         path_name=request_json['filePath'],
     )
     return create_json_response({'success': True})
@@ -516,7 +517,8 @@ def _fetch_airtable_loadable_project_samples(user, dataset_type, sample_type):
 def load_data(request):
     request_json = json.loads(request.body)
     sample_type = request_json['sampleType']
-    dataset_type = request_json['datasetType']
+    dataset_type = request_json.get('datasetType', Sample.DATASET_TYPE_VARIANT_CALLS)
+    skip_validation = request_json.get('skipValidation', False)
     projects = [json.loads(project) for project in request_json['projects']]
     project_samples = {p['projectGuid']: p.get('sampleIds') for p in projects}
 
@@ -528,17 +530,18 @@ def load_data(request):
     loading_args = (
         project_models, sample_type, dataset_type, request_json['genomeVersion'], _callset_path(request_json),
     )
+    loading_kwargs = {'user': request.user, 'skip_validation': skip_validation}
     if AirtableSession.is_airtable_enabled():
         individual_ids = _get_valid_project_samples(project_samples, dataset_type, sample_type, request.user)
         success_message = f'*{request.user.email}* triggered loading internal {sample_type} {dataset_type} data for {len(projects)} projects'
         error_message = f'ERROR triggering internal {sample_type} {dataset_type} loading'
         trigger_airflow_data_loading(
-            *loading_args, user=request.user, success_message=success_message, error_message=error_message,
+            *loading_args, **loading_kwargs, success_message=success_message, error_message=error_message,
             success_slack_channel=SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL, is_internal=True, individual_ids=individual_ids,
         )
     else:
         request_json, _ = prepare_data_loading_request(
-            *loading_args, user=request.user, pedigree_dir=LOADING_DATASETS_DIR, raise_pedigree_error=True,
+            *loading_args, **loading_kwargs, pedigree_dir=LOADING_DATASETS_DIR, raise_pedigree_error=True,
        )
        response = requests.post(f'{PIPELINE_RUNNER_SERVER}/loading_pipeline_enqueue', json=request_json, timeout=60)
        if response.status_code == 409:
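
Both endpoints now treat datasetType as optional, falling back to the default instead of raising KeyError. The pattern in isolation (the literal default value here is an assumption; the code uses the Sample.DATASET_TYPE_VARIANT_CALLS constant):

    DEFAULT_DATASET_TYPE = 'SNV_INDEL'  # stand-in for Sample.DATASET_TYPE_VARIANT_CALLS
    request_json = {'filePath': '/data/callset.vcf'}  # no 'datasetType' supplied
    dataset_type = request_json.get('datasetType', DEFAULT_DATASET_TYPE)
    skip_validation = request_json.get('skipValidation', False)
    assert dataset_type == 'SNV_INDEL' and skip_validation is False
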
26 changes: 17 additions & 9 deletions seqr/views/apis/data_manager_api_tests.py
@@ -390,10 +390,10 @@
 
 PEDIGREE_HEADER = ['Project_GUID', 'Family_GUID', 'Family_ID', 'Individual_ID', 'Paternal_ID', 'Maternal_ID', 'Sex']
 EXPECTED_PEDIGREE_ROWS = [
-    ['R0001_1kg', 'F000001_1', '1', 'NA19675_1', 'NA19678', 'NA19679', 'M'],
+    ['R0001_1kg', 'F000001_1', '1', 'NA19675_1', 'NA19678', 'NA19679', 'XXY'],
     ['R0001_1kg', 'F000001_1', '1', 'NA19678', '', '', 'M'],
     ['R0001_1kg', 'F000001_1', '1', 'NA19679', '', '', 'F'],
-    ['R0001_1kg', 'F000002_2', '2', 'HG00731', 'HG00732', 'HG00733', 'F'],
+    ['R0001_1kg', 'F000002_2', '2', 'HG00731', 'HG00732', 'HG00733', 'X0'],
 ]
 
 PROJECT_OPTION = {
@@ -1433,7 +1433,7 @@ def test_validate_callset(self, mock_subprocess, mock_glob, mock_os_isfile):
         mock_subprocess.return_value.communicate.return_value = (
             b'', b'CommandException: One or more URLs matched no objects.',
         )
-        body = {'filePath': f'{self.CALLSET_DIR}/sharded_vcf/part0*.vcf', 'datasetType': 'SNV_INDEL'}
+        body = {'filePath': f'{self.CALLSET_DIR}/sharded_vcf/part0*.vcf'}
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 400)
         self.assertListEqual(
@@ -1503,7 +1503,7 @@ def test_load_data(self, mock_temp_dir, mock_open, mock_mkdir):
         mock_temp_dir.return_value.__enter__.return_value = '/mock/tmp'
         body = {'filePath': f'{self.CALLSET_DIR}/mito_callset.mt', 'datasetType': 'MITO', 'sampleType': 'WES', 'genomeVersion': '38', 'projects': [
             json.dumps(option) for option in self.PROJECT_OPTIONS + [{'projectGuid': 'R0005_not_project'}]
-        ]}
+        ], 'skipValidation': True}
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 400)
         self.assertDictEqual(response.json(), {'error': 'The following projects are invalid: R0005_not_project'})
@@ -1516,7 +1516,7 @@ def test_load_data(self, mock_temp_dir, mock_open, mock_mkdir):
         )
         self.assertDictEqual(response.json(), {'success': True})
 
-        self._assert_expected_load_data_requests(sample_type='WES')
+        self._assert_expected_load_data_requests(sample_type='WES', skip_validation=True)
         self._has_expected_ped_files(mock_open, mock_mkdir, 'MITO', sample_type='WES')
 
         dag_json = {
@@ -1528,6 +1528,7 @@ def test_load_data(self, mock_temp_dir, mock_open, mock_mkdir):
             'sample_type': 'WES',
             'dataset_type': 'MITO',
             'reference_genome': 'GRCh38',
+            'skip_validation': True,
         }
         self._assert_success_notification(dag_json)
 
@@ -1538,14 +1539,17 @@ def test_load_data(self, mock_temp_dir, mock_open, mock_mkdir):
         responses.calls.reset()
         self.reset_logs()
 
+        del body['skipValidation']
+        del dag_json['skip_validation']
         body.update({'datasetType': 'SV', 'filePath': f'{self.CALLSET_DIR}/sv_callset.vcf'})
         self._trigger_error(url, body, dag_json, mock_open, mock_mkdir)
 
         responses.add(responses.POST, PIPELINE_RUNNER_URL)
         responses.calls.reset()
         mock_open.reset_mock()
         mock_mkdir.reset_mock()
-        body.update({'datasetType': 'SNV_INDEL', 'sampleType': 'WGS', 'projects': [json.dumps(self.PROJECT_OPTION)]})
+        body.update({'sampleType': 'WGS', 'projects': [json.dumps(self.PROJECT_OPTION)]})
+        del body['datasetType']
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self._test_load_single_project(mock_open, mock_mkdir, response, url=url, body=body)
 
@@ -1638,18 +1642,21 @@ def _add_file_iter(self, stdout):
     def _assert_expected_get_projects_requests(self):
         self.assertEqual(len(responses.calls), 0)
 
-    def _assert_expected_load_data_requests(self, dataset_type='MITO', sample_type='WGS', trigger_error=False, skip_project=False):
+    def _assert_expected_load_data_requests(self, dataset_type='MITO', sample_type='WGS', trigger_error=False, skip_project=False, skip_validation=False):
         self.assertEqual(len(responses.calls), 1)
         projects = [PROJECT_GUID, NON_ANALYST_PROJECT_GUID]
         if skip_project:
             projects = projects[1:]
-        self.assertDictEqual(json.loads(responses.calls[0].request.body), {
+        body = {
             'projects_to_run': projects,
             'callset_path': '/local_datasets/sv_callset.vcf' if trigger_error else '/local_datasets/mito_callset.mt',
             'sample_type': sample_type,
            'dataset_type': dataset_type,
            'reference_genome': 'GRCh38',
-        })
+        }
+        if skip_validation:
+            body['skip_validation'] = True
+        self.assertDictEqual(json.loads(responses.calls[0].request.body), body)
 
     @staticmethod
     def _local_pedigree_path(dataset_type, sample_type):
@@ -1786,6 +1793,7 @@ def _get_dag_variable_overrides(*args, **kwargs):
             'sample_source': 'Broad_Internal',
             'sample_type': 'WES',
             'dataset_type': 'MITO',
+            'skip_validation': True,
         }
 
     def _assert_expected_load_data_requests(self, dataset_type='MITO', **kwargs):
8 changes: 1 addition & 7 deletions seqr/views/apis/individual_api.py
@@ -29,12 +29,6 @@
 from seqr.views.utils.individual_utils import delete_individuals, add_or_update_individuals_and_families
 from seqr.views.utils.variant_utils import bulk_create_tagged_variants
 
-_SEX_TO_EXPORTED_VALUE = dict(Individual.SEX_LOOKUP)
-_SEX_TO_EXPORTED_VALUE['U'] = ''
-
-__AFFECTED_TO_EXPORTED_VALUE = dict(Individual.AFFECTED_STATUS_LOOKUP)
-__AFFECTED_TO_EXPORTED_VALUE['U'] = ''
-
 
 @login_and_policies_required
 def update_individual_handler(request, individual_guid):
@@ -875,7 +869,7 @@ def import_gregor_metadata(request, project_guid):
     genes = set()
     for row in _iter_metadata_table(
         metadata_files_path, FINDINGS_TABLE, request.user,
-        lambda r: r['participant_id'] in participant_individual_map and r['variant_type'] == 'SNV/INDEL',
+        lambda r: r['participant_id'] in participant_individual_map and r['variant_type'] in {'SNV/INDEL', 'SNV', 'INDEL'},
     ):
         individual = participant_individual_map[row['participant_id']]
         variant_id = '-'.join([row[col] for col in ['chrom', 'pos', 'ref', 'alt']])
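
The widened findings-table filter accepts rows typed 'SNV' or 'INDEL' in addition to the combined 'SNV/INDEL' label, while still requiring a known participant. In isolation (rows and the participant map are stand-ins):

    accepted = {'SNV/INDEL', 'SNV', 'INDEL'}
    participant_individual_map = {'P1': 'I000001', 'P2': 'I000002'}
    rows = [
        {'participant_id': 'P1', 'variant_type': 'SNV'},
        {'participant_id': 'P2', 'variant_type': 'SV'},
        {'participant_id': 'P3', 'variant_type': 'SNV/INDEL'},  # unknown participant, filtered out
    ]
    kept = [r for r in rows if r['participant_id'] in participant_individual_map and r['variant_type'] in accepted]
    assert [r['participant_id'] for r in kept] == ['P1']
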