Merge branch 'dev' of https://github.com/broadinstitute/seqr into dev

broadinstitute · Dec 4, 2024 · dbef705
2 parents f46b042 + 04d8860
commit dbef705
Show file tree

Hide file tree

Showing 9 changed files with 124 additions and 100 deletions.
diff --git a/seqr/utils/search/add_data_utils.py b/seqr/utils/search/add_data_utils.py
@@ -6,11 +6,13 @@
 from seqr.models import Sample, Individual, Project
 from seqr.utils.communication_utils import send_project_notification, safe_post_to_slack
 from seqr.utils.logging_utils import SeqrLogger
+from seqr.utils.middleware import ErrorsWarningsException
 from seqr.utils.search.utils import backend_specific_call
 from seqr.utils.search.elasticsearch.es_utils import validate_es_index_metadata_and_get_samples
 from seqr.views.utils.airtable_utils import AirtableSession, ANVIL_REQUEST_TRACKING_TABLE
 from seqr.views.utils.dataset_utils import match_and_update_search_samples, load_mapping_file
 from seqr.views.utils.export_utils import write_multiple_files
+from seqr.views.utils.pedigree_info_utils import get_no_affected_families
 from settings import SEQR_SLACK_DATA_ALERTS_NOTIFICATION_CHANNEL, BASE_URL, ANVIL_UI_URL, \
     SEQR_SLACK_ANVIL_DATA_LOADING_CHANNEL
 
@@ -144,14 +146,23 @@ def _upload_data_loading_files(projects: list[Project], user: User, file_path: s
         'Individual_ID': F('individual_id'),
         'Paternal_ID': F('father__individual_id'), 'Maternal_ID': F('mother__individual_id'), 'Sex': F('sex'),
     })
-    annotations = {'project': F('family__project__guid'), **file_annotations}
+    annotations = {'project': F('family__project__guid'), 'affected_status': F('affected'), **file_annotations}
     individual_filter = {'id__in': individual_ids} if individual_ids else {'family__project__in': projects}
     data = Individual.objects.filter(**individual_filter).order_by('family_id', 'individual_id').values(
         **dict(annotations))
 
     data_by_project = defaultdict(list)
+    affected_by_family = defaultdict(list)
     for row in data:
         data_by_project[row.pop('project')].append(row)
+        affected_by_family[row['Family_GUID']].append(row.pop('affected_status'))
+
+    no_affected_families =get_no_affected_families(affected_by_family)
+    if no_affected_families:
+        families = ', '.join(sorted(no_affected_families))
+        raise ErrorsWarningsException(errors=[
+            f'The following families have no affected individuals and can not be loaded to seqr: {families}',
+        ])
 
     header = list(file_annotations.keys())
     files = [(f'{project_guid}_pedigree', header, rows) for project_guid, rows in data_by_project.items()]

diff --git a/seqr/views/apis/anvil_workspace_api.py b/seqr/views/apis/anvil_workspace_api.py
@@ -188,7 +188,7 @@ def create_project_from_workspace(request, namespace, name):
         error = 'Field(s) "{}" are required'.format(', '.join(missing_fields))
         return create_json_response({'error': error}, status=400, reason=error)
 
-    pedigree_records, _ = _parse_uploaded_pedigree(request_json)
+    pedigree_records = _parse_uploaded_pedigree(request_json)
 
     # Create a new Project in seqr
     project_args = {
@@ -229,7 +229,7 @@ def add_workspace_data(request, project_guid):
         error = 'Field(s) "{}" are required'.format(', '.join(missing_fields))
         return create_json_response({'error': error}, status=400, reason=error)
 
-    pedigree_records, records_by_family = _parse_uploaded_pedigree(request_json, project=project)
+    pedigree_records = _parse_uploaded_pedigree(request_json, project=project)
 
     previous_samples = get_search_samples([project]).filter(dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS)
     sample = previous_samples.first()
@@ -239,8 +239,9 @@ def add_workspace_data(request, project_guid):
         }, status=400)
     sample_type = sample.sample_type
 
+    families = {record[JsonConstants.FAMILY_ID_COLUMN] for record in pedigree_records}
     previous_loaded_individuals = previous_samples.filter(
-        individual__family__family_id__in=records_by_family,
+        individual__family__family_id__in=families,
     ).values_list('individual_id', 'individual__individual_id', 'individual__family__family_id')
     missing_samples_by_family = defaultdict(list)
     for _, individual_id, family_id in previous_loaded_individuals:
@@ -279,22 +280,10 @@ def _parse_uploaded_pedigree(request_json, project=None):
         errors.append('The following samples are included in the pedigree file but are missing from the VCF: {}'.format(
                 ', '.join(missing_samples)))
 
-    records_by_family = defaultdict(list)
-    for record in pedigree_records:
-        records_by_family[record[JsonConstants.FAMILY_ID_COLUMN]].append(record)
-
-    no_affected_families = [
-        family_id for family_id, records in records_by_family.items()
-        if not any(record[JsonConstants.AFFECTED_COLUMN] == Individual.AFFECTED_STATUS_AFFECTED for record in records)
-    ]
-
-    if no_affected_families:
-        errors.append('The following families do not have any affected individuals: {}'.format(', '.join(no_affected_families)))
-
     if errors:
         raise ErrorsWarningsException(errors, [])
 
-    return pedigree_records, records_by_family
+    return pedigree_records
 
 
 def _trigger_add_workspace_data(project, pedigree_records, user, data_path, sample_type, previous_loaded_ids=None, get_pedigree_json=False):

diff --git a/seqr/views/apis/anvil_workspace_api_tests.py b/seqr/views/apis/anvil_workspace_api_tests.py
@@ -26,8 +26,9 @@
 
 MISSING_REQUIRED_SAMPLE_DATA = [["21", "HG00736", "", "", "", "", "", "", "", ""]]
 
-LOAD_SAMPLE_DATA_EXTRA_SAMPLE = LOAD_SAMPLE_DATA + [["1", "NA19678", "", "", "", "Male", "Affected", "HP:0011675", "", ""],
-                                                    ["22", "HG00736", "", "", "", "Unknown", "Unknown", "", "", ""]]
+LOAD_SAMPLE_DATA_EXTRA_SAMPLE = LOAD_SAMPLE_DATA + [["1", "NA19678", "", "", "", "Male", "Affected", "HP:0011675", "", ""]]
+
+LOAD_SAMPLE_DATA_NO_AFFECTED = LOAD_SAMPLE_DATA + [["22", "HG00736", "", "", "", "Unknown", "Unknown", "", "", ""]]
 
 FILE_DATA = [
     '##fileformat=VCFv4.2\n',
@@ -709,8 +710,13 @@ def _test_errors(self, url, fields, workspace_name):
         self.assertEqual(response.status_code, 400)
         response_json = response.json()
         self.assertEqual(response_json['errors'],
-                         ['The following samples are included in the pedigree file but are missing from the VCF: NA19678, HG00736',
-                          'The following families do not have any affected individuals: 22'])
+                         ['The following samples are included in the pedigree file but are missing from the VCF: NA19678'])
+
+        self.mock_load_file.return_value = LOAD_SAMPLE_DATA_NO_AFFECTED
+        response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY))
+        self.assertEqual(response.status_code, 400)
+        response_json = response.json()
+        self.assertEqual(response_json['errors'],['The following families do not have any affected individuals: 22'])
 
     def _assert_valid_operation(self, project, test_add_data=True):
         genome_version = 'GRCh37' if test_add_data else 'GRCh38'

diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py
@@ -1508,9 +1508,11 @@ def test_load_data(self, mock_temp_dir, mock_open, mock_mkdir):
         self.assertEqual(response.status_code, 400)
         self.assertDictEqual(response.json(), {'error': 'The following projects are invalid: R0005_not_project'})
 
+        body['projects'] = body['projects'][:-1]
+        self._test_no_affected_family(url, body)
+
         self.reset_logs()
         responses.calls.reset()
-        body['projects'] = body['projects'][:-1]
         response = self._assert_expected_pm_access(
             lambda: self.client.post(url, content_type='application/json', data=json.dumps(body))
         )
@@ -1605,6 +1607,15 @@ def _test_load_single_project(self, mock_open, mock_mkdir, response, *args, **kw
         # Only a DAG trigger, no airtable calls as there is no previously loaded WGS SNV_INDEL data for these samples
         self.assertEqual(len(responses.calls), 1)
 
+    def _test_no_affected_family(self, url, body):
+        response = self.client.post(url, content_type='application/json', data=json.dumps(body))
+        self.assertEqual(response.status_code, 400)
+        self.assertDictEqual(response.json(), {
+            'errors': ['The following families have no affected individuals and can not be loaded to seqr: F000005_5'],
+            'warnings': None,
+        })
+        Individual.objects.filter(guid='I000009_na20874').update(affected='A')
+
 
 class LocalDataManagerAPITest(AuthenticationTestCase, DataManagerAPITest):
     fixtures = ['users', '1kg_project', 'reference_data']
@@ -1907,3 +1918,7 @@ def _has_expected_ped_files(self, mock_open, mock_mkdir, dataset_type, *args, sa
     def _assert_write_pedigree_error(self, response):
         self.assertEqual(response.status_code, 200)
         self.assertEqual(len(responses.calls), 1)
+
+    def _test_no_affected_family(self, url, body):
+        # Sample ID filtering skips the unaffected family
+        pass
diff --git a/seqr/views/apis/family_api.py b/seqr/views/apis/family_api.py
@@ -14,8 +14,7 @@
 from seqr.utils.gene_utils import get_genes_for_variant_display
 from seqr.views.utils.file_utils import save_uploaded_file, load_uploaded_file
 from seqr.views.utils.individual_utils import delete_individuals
-from seqr.views.utils.json_to_orm_utils import update_family_from_json, update_model_from_json, \
-    get_or_create_model_from_json, create_model_from_json
+from seqr.views.utils.json_to_orm_utils import update_family_from_json, update_model_from_json, create_model_from_json
 from seqr.views.utils.json_utils import create_json_response
 from seqr.views.utils.note_utils import create_note_handler, update_note_handler, delete_note_handler
 from seqr.views.utils.orm_to_json_utils import _get_json_for_model,  get_json_for_family_note, get_json_for_samples, \
@@ -162,49 +161,29 @@ def edit_families_handler(request, project_guid):
         return create_json_response(
             {}, status=400, reason="'families' not specified")
 
-    family_guids = [f['familyGuid'] for f in modified_families if f.get('familyGuid')]
-    family_models = {}
-    if family_guids:
-        family_models.update({f.guid: f for f in Family.objects.filter(project=project, guid__in=family_guids)})
-        if len(family_models) != len(family_guids):
-            missing_guids = set(family_guids) - set(family_models.keys())
-            return create_json_response({'error': 'Invalid family guids: {}'.format(', '.join(missing_guids))}, status=400)
-
-        updated_family_ids = {
-            fields[FAMILY_ID_FIELD]: family_models[fields['familyGuid']].family_id for fields in modified_families
-            if fields.get('familyGuid') and fields.get(FAMILY_ID_FIELD) and \
-                fields[FAMILY_ID_FIELD] != family_models[fields['familyGuid']].family_id}
-        existing_families = {
-            f.family_id for f in Family.objects.filter(project=project, family_id__in=updated_family_ids.keys())
-        }
-        if existing_families:
-            return create_json_response({
-                'error': 'Cannot update the following family ID(s) as they are already in use: {}'.format(', '.join([
-                    '{} -> {}'.format(old_id, new_id) for new_id, old_id in updated_family_ids.items()
-                    if new_id in existing_families
-                ]))}, status=400)
-
-    no_guid_families = [f for f in modified_families if not f.get('familyGuid')]
-    if no_guid_families:
-        prev_ids = [f[PREVIOUS_FAMILY_ID_FIELD] for f in no_guid_families if f.get(PREVIOUS_FAMILY_ID_FIELD)]
-        prev_id_models = {f.family_id: f for f in Family.objects.filter(project=project, family_id__in=prev_ids)}
-        if len(prev_id_models) != len(prev_ids):
-            missing_ids = set(prev_ids) - set(prev_id_models.keys())
-            return create_json_response(
-                {'error': 'Invalid previous family ids: {}'.format(', '.join(missing_ids))}, status=400)
-        family_models.update(prev_id_models)
+    family_guids = [f.get('familyGuid') for f in modified_families]
+    family_models = {f.guid: f for f in Family.objects.filter(project=project, guid__in=family_guids)}
+    if len(family_models) != len(family_guids):
+        missing_guids = set(family_guids) - set(family_models.keys())
+        return create_json_response({'error': 'Invalid family guids: {}'.format(', '.join(missing_guids))}, status=400)
+
+    updated_family_ids = {
+        fields[FAMILY_ID_FIELD]: family_models[fields['familyGuid']].family_id for fields in modified_families
+        if fields.get('familyGuid') and fields.get(FAMILY_ID_FIELD) and \
+            fields[FAMILY_ID_FIELD] != family_models[fields['familyGuid']].family_id}
+    existing_families = {
+        f.family_id for f in Family.objects.filter(project=project, family_id__in=updated_family_ids.keys())
+    }
+    if existing_families:
+        return create_json_response({
+            'error': 'Cannot update the following family ID(s) as they are already in use: {}'.format(', '.join([
+                '{} -> {}'.format(old_id, new_id) for new_id, old_id in updated_family_ids.items()
+                if new_id in existing_families
+            ]))}, status=400)
 
     updated_family_ids = []
     for fields in modified_families:
-        if fields.get('familyGuid'):
-            family = family_models[fields['familyGuid']]
-        elif fields.get(PREVIOUS_FAMILY_ID_FIELD):
-            family = family_models[fields[PREVIOUS_FAMILY_ID_FIELD]]
-        else:
-            family, _ = get_or_create_model_from_json(
-                Family, {'project': project, 'family_id': fields[FAMILY_ID_FIELD]},
-                update_json=None, user=request.user)
-
+        family = family_models[fields['familyGuid']]
         update_family_from_json(family, fields, user=request.user, allow_unknown_keys=True)
         updated_family_ids.append(family.id)
 
@@ -433,33 +412,37 @@ def _process_records(records, filename=''):
         if FAMILY_ID_FIELD not in column_map:
             raise ValueError('Invalid header, missing family id column')
 
-        return [{column: PARSE_FAMILY_TABLE_FIELDS.get(column, lambda v: v)(row[index])
+        parsed_records = [{column: PARSE_FAMILY_TABLE_FIELDS.get(column, lambda v: v)(row[index])
                 for column, index in column_map.items()} for row in records[1:]]
+        family_ids = [r.get(PREVIOUS_FAMILY_ID_FIELD) or r[FAMILY_ID_FIELD] for r in parsed_records]
+        family_guid_map = dict(
+            Family.objects.filter(family_id__in=family_ids, project=project).values_list('family_id', 'guid')
+        )
+        return [{
+            'familyGuid': family_guid_map.get(r.get(PREVIOUS_FAMILY_ID_FIELD) or r[FAMILY_ID_FIELD]),
+            **r,
+        } for r in parsed_records]
 
     try:
         uploaded_file_id, filename, json_records = save_uploaded_file(request, process_records=_process_records)
     except Exception as e:
         return create_json_response({'errors': [str(e)], 'warnings': []}, status=400, reason=str(e))
 
-    prev_fam_ids = {r[PREVIOUS_FAMILY_ID_FIELD] for r in json_records if r.get(PREVIOUS_FAMILY_ID_FIELD)}
-    existing_prev_fam_ids = {f.family_id for f in Family.objects.filter(family_id__in=prev_fam_ids, project=project).only('family_id')}
-    if len(prev_fam_ids) != len(existing_prev_fam_ids):
-        missing_prev_ids = [family_id for family_id in prev_fam_ids if family_id not in existing_prev_fam_ids]
+    missing_guid_records = [r for r in json_records if not r['familyGuid']]
+    if missing_guid_records:
+        missing_prev_ids = [r[PREVIOUS_FAMILY_ID_FIELD] for r in missing_guid_records if r.get(PREVIOUS_FAMILY_ID_FIELD)]
+        missing_curr_ids = [r[FAMILY_ID_FIELD] for r in missing_guid_records if not r.get(PREVIOUS_FAMILY_ID_FIELD)]
+        errors = []
+        if missing_prev_ids:
+            errors.append('Could not find families with the following previous IDs: {}'.format(', '.join(missing_prev_ids)))
+        if missing_curr_ids:
+            errors.append('Could not find families with the following current IDs: {}'.format(', '.join(missing_curr_ids)))
         return create_json_response(
-            {'errors': [
-                'Could not find families with the following previous IDs: {}'.format(', '.join(missing_prev_ids))
-            ], 'warnings': []},
+            {'errors': errors, 'warnings': []},
             status=400, reason='Invalid input')
 
-    fam_ids = {r[FAMILY_ID_FIELD] for r in json_records if not r.get(PREVIOUS_FAMILY_ID_FIELD)}
-    num_families_to_update = len(prev_fam_ids) + Family.objects.filter(family_id__in=fam_ids, project=project).count()
-
-    num_families = len(json_records)
-    num_families_to_create = num_families - num_families_to_update
-
     info = [
-        "{num_families} families parsed from {filename}".format(num_families=num_families, filename=filename),
-        "{} new families will be added, {} existing families will be updated".format(num_families_to_create, num_families_to_update),
+       f"{len(json_records)} exisitng families parsed from {filename}",
     ]
 
     return create_json_response({

diff --git a/seqr/views/apis/family_api_tests.py b/seqr/views/apis/family_api_tests.py
@@ -230,8 +230,7 @@ def test_edit_families_handler(self, mock_pm_group):
         req_values = {
             'families': [
                 {'familyGuid': FAMILY_GUID, 'description': 'Test description 1'},
-                {PREVIOUS_FAMILY_ID_FIELD: '2', FAMILY_ID_FIELD: '22', 'description': 'Test description 2'},
-                {FAMILY_ID_FIELD: 'new_family', 'description': 'Test descriptions for a new family'}
+                {'familyGuid': FAMILY_GUID2, PREVIOUS_FAMILY_ID_FIELD: '2', FAMILY_ID_FIELD: '22', 'description': 'Test description 2'},
             ]
         }
         response = self.client.post(url, content_type='application/json',
@@ -243,9 +242,7 @@ def test_edit_families_handler(self, mock_pm_group):
         self.assertEqual(response_json['familiesByGuid'][FAMILY_GUID]['description'], 'Test description 1')
         self.assertEqual(response_json['familiesByGuid']['F000002_2'][FAMILY_ID_FIELD], '22')
         self.assertEqual(response_json['familiesByGuid']['F000002_2']['description'], 'Test description 2')
-        new_guids = set(response_json['familiesByGuid'].keys()) - set([FAMILY_GUID, 'F000002_2'])
-        new_guid = new_guids.pop()
-        self.assertEqual(response_json['familiesByGuid'][new_guid]['description'], 'Test descriptions for a new family')
+        self.assertSetEqual(set(response_json['familiesByGuid'].keys()), set([FAMILY_GUID, 'F000002_2']))
 
         # Test PM permission
         url = reverse(edit_families_handler, args=[PM_REQUIRED_PROJECT_GUID])
@@ -474,12 +471,13 @@ def test_receive_families_table_handler(self, mock_pm_group):
 
         data = b'Family ID	Previous Family ID	Display Name	Description	Coded Phenotype\n\
         "1_renamed"	"1_old"	"1"	"family one description"	""\n\
-        "2"	""	"2"	"family two description"	""'
+        "22"	""	"2"	"family two description"	""'
         response = self.client.post(url, {'f': SimpleUploadedFile("1000_genomes demo_families.tsv", data)})
         self.assertEqual(response.status_code, 400)
         self.assertEqual(response.reason_phrase, 'Invalid input')
         self.assertDictEqual(response.json(), {
-            'errors': ['Could not find families with the following previous IDs: 1_old'], 'warnings': []})
+            'errors': ['Could not find families with the following previous IDs: 1_old',
+                       'Could not find families with the following current IDs: 22'], 'warnings': []})
 
         # send valid request
         data = b'Family ID	Previous Family ID	Display Name	Description	Phenotype Description	MONDO ID\n\
@@ -511,8 +509,8 @@ def test_receive_families_table_handler(self, mock_pm_group):
         self.assertEqual(family_2['familyId'], '2')
 
         internal_field_data = b'Family ID	External Data\n\
-"11"	""\n\
-"12"	"ONT lrGS; BioNano"'
+"3"	""\n\
+"2"	"ONT lrGS; BioNano"'
         response = self.client.post(url,  {'f': SimpleUploadedFile('families.tsv', internal_field_data)})
         self.assertEqual(response.status_code, 200)
         response = self.client.post(
@@ -532,6 +530,7 @@ def test_receive_families_table_handler(self, mock_pm_group):
         mock_pm_group.resolve_expression.return_value = 'project-managers'
         mock_pm_group.__eq__.side_effect = lambda s: s == 'project-managers'
 
+        internal_field_data = internal_field_data.replace(b'3', b'11').replace(b'2', b'12')
         response = self.client.post(url,  {'f': SimpleUploadedFile('families.tsv', internal_field_data)})
         self.assertEqual(response.status_code, 200)
         response = self.client.post(