Skip to content

Commit

Permalink
Merge branch 'dev' of https://github.com/broadinstitute/seqr into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
hanars committed Dec 4, 2024
2 parents f46b042 + 04d8860 commit dbef705
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 100 deletions.
13 changes: 12 additions & 1 deletion seqr/utils/search/add_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
from seqr.models import Sample, Individual, Project
from seqr.utils.communication_utils import send_project_notification, safe_post_to_slack
from seqr.utils.logging_utils import SeqrLogger
from seqr.utils.middleware import ErrorsWarningsException
from seqr.utils.search.utils import backend_specific_call
from seqr.utils.search.elasticsearch.es_utils import validate_es_index_metadata_and_get_samples
from seqr.views.utils.airtable_utils import AirtableSession, ANVIL_REQUEST_TRACKING_TABLE
from seqr.views.utils.dataset_utils import match_and_update_search_samples, load_mapping_file
from seqr.views.utils.export_utils import write_multiple_files
from seqr.views.utils.pedigree_info_utils import get_no_affected_families
from settings import SEQR_SLACK_DATA_ALERTS_NOTIFICATION_CHANNEL, BASE_URL, ANVIL_UI_URL, \
SEQR_SLACK_ANVIL_DATA_LOADING_CHANNEL

Expand Down Expand Up @@ -144,14 +146,23 @@ def _upload_data_loading_files(projects: list[Project], user: User, file_path: s
'Individual_ID': F('individual_id'),
'Paternal_ID': F('father__individual_id'), 'Maternal_ID': F('mother__individual_id'), 'Sex': F('sex'),
})
annotations = {'project': F('family__project__guid'), **file_annotations}
annotations = {'project': F('family__project__guid'), 'affected_status': F('affected'), **file_annotations}
individual_filter = {'id__in': individual_ids} if individual_ids else {'family__project__in': projects}
data = Individual.objects.filter(**individual_filter).order_by('family_id', 'individual_id').values(
**dict(annotations))

data_by_project = defaultdict(list)
affected_by_family = defaultdict(list)
for row in data:
data_by_project[row.pop('project')].append(row)
affected_by_family[row['Family_GUID']].append(row.pop('affected_status'))

no_affected_families =get_no_affected_families(affected_by_family)
if no_affected_families:
families = ', '.join(sorted(no_affected_families))
raise ErrorsWarningsException(errors=[
f'The following families have no affected individuals and can not be loaded to seqr: {families}',
])

header = list(file_annotations.keys())
files = [(f'{project_guid}_pedigree', header, rows) for project_guid, rows in data_by_project.items()]
Expand Down
21 changes: 5 additions & 16 deletions seqr/views/apis/anvil_workspace_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def create_project_from_workspace(request, namespace, name):
error = 'Field(s) "{}" are required'.format(', '.join(missing_fields))
return create_json_response({'error': error}, status=400, reason=error)

pedigree_records, _ = _parse_uploaded_pedigree(request_json)
pedigree_records = _parse_uploaded_pedigree(request_json)

# Create a new Project in seqr
project_args = {
Expand Down Expand Up @@ -229,7 +229,7 @@ def add_workspace_data(request, project_guid):
error = 'Field(s) "{}" are required'.format(', '.join(missing_fields))
return create_json_response({'error': error}, status=400, reason=error)

pedigree_records, records_by_family = _parse_uploaded_pedigree(request_json, project=project)
pedigree_records = _parse_uploaded_pedigree(request_json, project=project)

previous_samples = get_search_samples([project]).filter(dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS)
sample = previous_samples.first()
Expand All @@ -239,8 +239,9 @@ def add_workspace_data(request, project_guid):
}, status=400)
sample_type = sample.sample_type

families = {record[JsonConstants.FAMILY_ID_COLUMN] for record in pedigree_records}
previous_loaded_individuals = previous_samples.filter(
individual__family__family_id__in=records_by_family,
individual__family__family_id__in=families,
).values_list('individual_id', 'individual__individual_id', 'individual__family__family_id')
missing_samples_by_family = defaultdict(list)
for _, individual_id, family_id in previous_loaded_individuals:
Expand Down Expand Up @@ -279,22 +280,10 @@ def _parse_uploaded_pedigree(request_json, project=None):
errors.append('The following samples are included in the pedigree file but are missing from the VCF: {}'.format(
', '.join(missing_samples)))

records_by_family = defaultdict(list)
for record in pedigree_records:
records_by_family[record[JsonConstants.FAMILY_ID_COLUMN]].append(record)

no_affected_families = [
family_id for family_id, records in records_by_family.items()
if not any(record[JsonConstants.AFFECTED_COLUMN] == Individual.AFFECTED_STATUS_AFFECTED for record in records)
]

if no_affected_families:
errors.append('The following families do not have any affected individuals: {}'.format(', '.join(no_affected_families)))

if errors:
raise ErrorsWarningsException(errors, [])

return pedigree_records, records_by_family
return pedigree_records


def _trigger_add_workspace_data(project, pedigree_records, user, data_path, sample_type, previous_loaded_ids=None, get_pedigree_json=False):
Expand Down
14 changes: 10 additions & 4 deletions seqr/views/apis/anvil_workspace_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@

MISSING_REQUIRED_SAMPLE_DATA = [["21", "HG00736", "", "", "", "", "", "", "", ""]]

LOAD_SAMPLE_DATA_EXTRA_SAMPLE = LOAD_SAMPLE_DATA + [["1", "NA19678", "", "", "", "Male", "Affected", "HP:0011675", "", ""],
["22", "HG00736", "", "", "", "Unknown", "Unknown", "", "", ""]]
LOAD_SAMPLE_DATA_EXTRA_SAMPLE = LOAD_SAMPLE_DATA + [["1", "NA19678", "", "", "", "Male", "Affected", "HP:0011675", "", ""]]

LOAD_SAMPLE_DATA_NO_AFFECTED = LOAD_SAMPLE_DATA + [["22", "HG00736", "", "", "", "Unknown", "Unknown", "", "", ""]]

FILE_DATA = [
'##fileformat=VCFv4.2\n',
Expand Down Expand Up @@ -709,8 +710,13 @@ def _test_errors(self, url, fields, workspace_name):
self.assertEqual(response.status_code, 400)
response_json = response.json()
self.assertEqual(response_json['errors'],
['The following samples are included in the pedigree file but are missing from the VCF: NA19678, HG00736',
'The following families do not have any affected individuals: 22'])
['The following samples are included in the pedigree file but are missing from the VCF: NA19678'])

self.mock_load_file.return_value = LOAD_SAMPLE_DATA_NO_AFFECTED
response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY))
self.assertEqual(response.status_code, 400)
response_json = response.json()
self.assertEqual(response_json['errors'],['The following families do not have any affected individuals: 22'])

def _assert_valid_operation(self, project, test_add_data=True):
genome_version = 'GRCh37' if test_add_data else 'GRCh38'
Expand Down
17 changes: 16 additions & 1 deletion seqr/views/apis/data_manager_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1508,9 +1508,11 @@ def test_load_data(self, mock_temp_dir, mock_open, mock_mkdir):
self.assertEqual(response.status_code, 400)
self.assertDictEqual(response.json(), {'error': 'The following projects are invalid: R0005_not_project'})

body['projects'] = body['projects'][:-1]
self._test_no_affected_family(url, body)

self.reset_logs()
responses.calls.reset()
body['projects'] = body['projects'][:-1]
response = self._assert_expected_pm_access(
lambda: self.client.post(url, content_type='application/json', data=json.dumps(body))
)
Expand Down Expand Up @@ -1605,6 +1607,15 @@ def _test_load_single_project(self, mock_open, mock_mkdir, response, *args, **kw
# Only a DAG trigger, no airtable calls as there is no previously loaded WGS SNV_INDEL data for these samples
self.assertEqual(len(responses.calls), 1)

def _test_no_affected_family(self, url, body):
    """Check that loading is rejected when a family has no affected individuals.

    Posts ``body`` as JSON to ``url`` and asserts a 400 response whose payload
    names family ``F000005_5`` as having no affected individuals.

    Side effect: afterwards the individual ``I000009_na20874`` is updated to
    affected status ``'A'`` (presumably so that later requests in the calling
    test are no longer blocked by this validation -- confirm against caller).
    """
    payload = json.dumps(body)
    rejected = self.client.post(url, content_type='application/json', data=payload)
    self.assertEqual(rejected.status_code, 400)

    expected_error = (
        'The following families have no affected individuals and can not be loaded to seqr: F000005_5'
    )
    self.assertDictEqual(rejected.json(), {'errors': [expected_error], 'warnings': None})

    # Mutates the test database: mark this individual as affected.
    Individual.objects.filter(guid='I000009_na20874').update(affected='A')


class LocalDataManagerAPITest(AuthenticationTestCase, DataManagerAPITest):
fixtures = ['users', '1kg_project', 'reference_data']
Expand Down Expand Up @@ -1907,3 +1918,7 @@ def _has_expected_ped_files(self, mock_open, mock_mkdir, dataset_type, *args, sa
def _assert_write_pedigree_error(self, response):
self.assertEqual(response.status_code, 200)
self.assertEqual(len(responses.calls), 1)

def _test_no_affected_family(self, url, body):
# Sample ID filtering skips the unaffected family
pass
99 changes: 41 additions & 58 deletions seqr/views/apis/family_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
from seqr.utils.gene_utils import get_genes_for_variant_display
from seqr.views.utils.file_utils import save_uploaded_file, load_uploaded_file
from seqr.views.utils.individual_utils import delete_individuals
from seqr.views.utils.json_to_orm_utils import update_family_from_json, update_model_from_json, \
get_or_create_model_from_json, create_model_from_json
from seqr.views.utils.json_to_orm_utils import update_family_from_json, update_model_from_json, create_model_from_json
from seqr.views.utils.json_utils import create_json_response
from seqr.views.utils.note_utils import create_note_handler, update_note_handler, delete_note_handler
from seqr.views.utils.orm_to_json_utils import _get_json_for_model, get_json_for_family_note, get_json_for_samples, \
Expand Down Expand Up @@ -162,49 +161,29 @@ def edit_families_handler(request, project_guid):
return create_json_response(
{}, status=400, reason="'families' not specified")

family_guids = [f['familyGuid'] for f in modified_families if f.get('familyGuid')]
family_models = {}
if family_guids:
family_models.update({f.guid: f for f in Family.objects.filter(project=project, guid__in=family_guids)})
if len(family_models) != len(family_guids):
missing_guids = set(family_guids) - set(family_models.keys())
return create_json_response({'error': 'Invalid family guids: {}'.format(', '.join(missing_guids))}, status=400)

updated_family_ids = {
fields[FAMILY_ID_FIELD]: family_models[fields['familyGuid']].family_id for fields in modified_families
if fields.get('familyGuid') and fields.get(FAMILY_ID_FIELD) and \
fields[FAMILY_ID_FIELD] != family_models[fields['familyGuid']].family_id}
existing_families = {
f.family_id for f in Family.objects.filter(project=project, family_id__in=updated_family_ids.keys())
}
if existing_families:
return create_json_response({
'error': 'Cannot update the following family ID(s) as they are already in use: {}'.format(', '.join([
'{} -> {}'.format(old_id, new_id) for new_id, old_id in updated_family_ids.items()
if new_id in existing_families
]))}, status=400)

no_guid_families = [f for f in modified_families if not f.get('familyGuid')]
if no_guid_families:
prev_ids = [f[PREVIOUS_FAMILY_ID_FIELD] for f in no_guid_families if f.get(PREVIOUS_FAMILY_ID_FIELD)]
prev_id_models = {f.family_id: f for f in Family.objects.filter(project=project, family_id__in=prev_ids)}
if len(prev_id_models) != len(prev_ids):
missing_ids = set(prev_ids) - set(prev_id_models.keys())
return create_json_response(
{'error': 'Invalid previous family ids: {}'.format(', '.join(missing_ids))}, status=400)
family_models.update(prev_id_models)
family_guids = [f.get('familyGuid') for f in modified_families]
family_models = {f.guid: f for f in Family.objects.filter(project=project, guid__in=family_guids)}
if len(family_models) != len(family_guids):
missing_guids = set(family_guids) - set(family_models.keys())
return create_json_response({'error': 'Invalid family guids: {}'.format(', '.join(missing_guids))}, status=400)

updated_family_ids = {
fields[FAMILY_ID_FIELD]: family_models[fields['familyGuid']].family_id for fields in modified_families
if fields.get('familyGuid') and fields.get(FAMILY_ID_FIELD) and \
fields[FAMILY_ID_FIELD] != family_models[fields['familyGuid']].family_id}
existing_families = {
f.family_id for f in Family.objects.filter(project=project, family_id__in=updated_family_ids.keys())
}
if existing_families:
return create_json_response({
'error': 'Cannot update the following family ID(s) as they are already in use: {}'.format(', '.join([
'{} -> {}'.format(old_id, new_id) for new_id, old_id in updated_family_ids.items()
if new_id in existing_families
]))}, status=400)

updated_family_ids = []
for fields in modified_families:
if fields.get('familyGuid'):
family = family_models[fields['familyGuid']]
elif fields.get(PREVIOUS_FAMILY_ID_FIELD):
family = family_models[fields[PREVIOUS_FAMILY_ID_FIELD]]
else:
family, _ = get_or_create_model_from_json(
Family, {'project': project, 'family_id': fields[FAMILY_ID_FIELD]},
update_json=None, user=request.user)

family = family_models[fields['familyGuid']]
update_family_from_json(family, fields, user=request.user, allow_unknown_keys=True)
updated_family_ids.append(family.id)

Expand Down Expand Up @@ -433,33 +412,37 @@ def _process_records(records, filename=''):
if FAMILY_ID_FIELD not in column_map:
raise ValueError('Invalid header, missing family id column')

return [{column: PARSE_FAMILY_TABLE_FIELDS.get(column, lambda v: v)(row[index])
parsed_records = [{column: PARSE_FAMILY_TABLE_FIELDS.get(column, lambda v: v)(row[index])
for column, index in column_map.items()} for row in records[1:]]
family_ids = [r.get(PREVIOUS_FAMILY_ID_FIELD) or r[FAMILY_ID_FIELD] for r in parsed_records]
family_guid_map = dict(
Family.objects.filter(family_id__in=family_ids, project=project).values_list('family_id', 'guid')
)
return [{
'familyGuid': family_guid_map.get(r.get(PREVIOUS_FAMILY_ID_FIELD) or r[FAMILY_ID_FIELD]),
**r,
} for r in parsed_records]

try:
uploaded_file_id, filename, json_records = save_uploaded_file(request, process_records=_process_records)
except Exception as e:
return create_json_response({'errors': [str(e)], 'warnings': []}, status=400, reason=str(e))

prev_fam_ids = {r[PREVIOUS_FAMILY_ID_FIELD] for r in json_records if r.get(PREVIOUS_FAMILY_ID_FIELD)}
existing_prev_fam_ids = {f.family_id for f in Family.objects.filter(family_id__in=prev_fam_ids, project=project).only('family_id')}
if len(prev_fam_ids) != len(existing_prev_fam_ids):
missing_prev_ids = [family_id for family_id in prev_fam_ids if family_id not in existing_prev_fam_ids]
missing_guid_records = [r for r in json_records if not r['familyGuid']]
if missing_guid_records:
missing_prev_ids = [r[PREVIOUS_FAMILY_ID_FIELD] for r in missing_guid_records if r.get(PREVIOUS_FAMILY_ID_FIELD)]
missing_curr_ids = [r[FAMILY_ID_FIELD] for r in missing_guid_records if not r.get(PREVIOUS_FAMILY_ID_FIELD)]
errors = []
if missing_prev_ids:
errors.append('Could not find families with the following previous IDs: {}'.format(', '.join(missing_prev_ids)))
if missing_curr_ids:
errors.append('Could not find families with the following current IDs: {}'.format(', '.join(missing_curr_ids)))
return create_json_response(
{'errors': [
'Could not find families with the following previous IDs: {}'.format(', '.join(missing_prev_ids))
], 'warnings': []},
{'errors': errors, 'warnings': []},
status=400, reason='Invalid input')

fam_ids = {r[FAMILY_ID_FIELD] for r in json_records if not r.get(PREVIOUS_FAMILY_ID_FIELD)}
num_families_to_update = len(prev_fam_ids) + Family.objects.filter(family_id__in=fam_ids, project=project).count()

num_families = len(json_records)
num_families_to_create = num_families - num_families_to_update

info = [
"{num_families} families parsed from {filename}".format(num_families=num_families, filename=filename),
"{} new families will be added, {} existing families will be updated".format(num_families_to_create, num_families_to_update),
f"{len(json_records)} exisitng families parsed from {filename}",
]

return create_json_response({
Expand Down
17 changes: 8 additions & 9 deletions seqr/views/apis/family_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,7 @@ def test_edit_families_handler(self, mock_pm_group):
req_values = {
'families': [
{'familyGuid': FAMILY_GUID, 'description': 'Test description 1'},
{PREVIOUS_FAMILY_ID_FIELD: '2', FAMILY_ID_FIELD: '22', 'description': 'Test description 2'},
{FAMILY_ID_FIELD: 'new_family', 'description': 'Test descriptions for a new family'}
{'familyGuid': FAMILY_GUID2, PREVIOUS_FAMILY_ID_FIELD: '2', FAMILY_ID_FIELD: '22', 'description': 'Test description 2'},
]
}
response = self.client.post(url, content_type='application/json',
Expand All @@ -243,9 +242,7 @@ def test_edit_families_handler(self, mock_pm_group):
self.assertEqual(response_json['familiesByGuid'][FAMILY_GUID]['description'], 'Test description 1')
self.assertEqual(response_json['familiesByGuid']['F000002_2'][FAMILY_ID_FIELD], '22')
self.assertEqual(response_json['familiesByGuid']['F000002_2']['description'], 'Test description 2')
new_guids = set(response_json['familiesByGuid'].keys()) - set([FAMILY_GUID, 'F000002_2'])
new_guid = new_guids.pop()
self.assertEqual(response_json['familiesByGuid'][new_guid]['description'], 'Test descriptions for a new family')
self.assertSetEqual(set(response_json['familiesByGuid'].keys()), set([FAMILY_GUID, 'F000002_2']))

# Test PM permission
url = reverse(edit_families_handler, args=[PM_REQUIRED_PROJECT_GUID])
Expand Down Expand Up @@ -474,12 +471,13 @@ def test_receive_families_table_handler(self, mock_pm_group):

data = b'Family ID Previous Family ID Display Name Description Coded Phenotype\n\
"1_renamed" "1_old" "1" "family one description" ""\n\
"2" "" "2" "family two description" ""'
"22" "" "2" "family two description" ""'
response = self.client.post(url, {'f': SimpleUploadedFile("1000_genomes demo_families.tsv", data)})
self.assertEqual(response.status_code, 400)
self.assertEqual(response.reason_phrase, 'Invalid input')
self.assertDictEqual(response.json(), {
'errors': ['Could not find families with the following previous IDs: 1_old'], 'warnings': []})
'errors': ['Could not find families with the following previous IDs: 1_old',
'Could not find families with the following current IDs: 22'], 'warnings': []})

# send valid request
data = b'Family ID Previous Family ID Display Name Description Phenotype Description MONDO ID\n\
Expand Down Expand Up @@ -511,8 +509,8 @@ def test_receive_families_table_handler(self, mock_pm_group):
self.assertEqual(family_2['familyId'], '2')

internal_field_data = b'Family ID External Data\n\
"11" ""\n\
"12" "ONT lrGS; BioNano"'
"3" ""\n\
"2" "ONT lrGS; BioNano"'
response = self.client.post(url, {'f': SimpleUploadedFile('families.tsv', internal_field_data)})
self.assertEqual(response.status_code, 200)
response = self.client.post(
Expand All @@ -532,6 +530,7 @@ def test_receive_families_table_handler(self, mock_pm_group):
mock_pm_group.resolve_expression.return_value = 'project-managers'
mock_pm_group.__eq__.side_effect = lambda s: s == 'project-managers'

internal_field_data = internal_field_data.replace(b'3', b'11').replace(b'2', b'12')
response = self.client.post(url, {'f': SimpleUploadedFile('families.tsv', internal_field_data)})
self.assertEqual(response.status_code, 200)
response = self.client.post(
Expand Down
Loading

0 comments on commit dbef705

Please sign in to comment.