Merge pull request #4487 from broadinstitute/gregor-airtable-warnings

Better gregor missing airtable warnings
broadinstitute · Nov 19, 2024 · 51e0ec0 · 51e0ec0
2 parents b81fd5f + a2a327b
commit 51e0ec0
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 15 deletions.
diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
@@ -416,21 +416,57 @@ def _add_row(row, family_id, row_type):
     airtable_rows = {table: [] for table in AIRTABLE_TABLE_COLUMNS.keys()}
     experiment_lookup_rows = []
     experiment_ids_by_participant = {}
+    missing_participant_ids = []
+    missing_airtable = []
+    missing_airtable_data_types = defaultdict(list)
+    missing_seqr_data_types = defaultdict(list)
     for participant in participant_rows:
         phenotype_rows += _parse_participant_phenotype_rows(participant)
         analyte = {k: participant.pop(k) for k in [SMID_FIELD, *ANALYTE_TABLE_COLUMNS[2:]]}
         analyte['participant_id'] = participant['participant_id']
 
         if not participant[PARTICIPANT_ID_FIELD]:
+            missing_participant_ids.append(participant['participant_id'])
             continue
 
-        airtable_metadata = airtable_metadata_by_participant.get(participant.pop(PARTICIPANT_ID_FIELD)) or {}
-        data_types = grouped_data_type_individuals[participant['participant_id']]
+        airtable_participant_id = participant.pop(PARTICIPANT_ID_FIELD)
+        airtable_metadata = airtable_metadata_by_participant.get(airtable_participant_id)
+        if not airtable_metadata:
+            missing_airtable.append(airtable_participant_id)
+            continue
+
+        seqr_data_types = set(grouped_data_type_individuals[participant['participant_id']].keys())
+        airtable_data_types = {dt.upper() for dt in GREGOR_DATA_TYPES if dt.upper() in airtable_metadata}
+        for data_type in seqr_data_types - airtable_data_types:
+            missing_airtable_data_types[data_type].append(airtable_participant_id)
+        for data_type in airtable_data_types - seqr_data_types:
+            missing_seqr_data_types[data_type].append(airtable_participant_id)
         _parse_participant_airtable_rows(
-            analyte, airtable_metadata, data_types, experiment_ids_by_participant,
+            analyte, airtable_metadata, seqr_data_types.intersection(airtable_data_types), experiment_ids_by_participant,
             analyte_rows, airtable_rows, experiment_lookup_rows,
         )
 
+    errors = []
+    if missing_participant_ids:
+        errors.append(
+            f'The following participants are missing {PARTICIPANT_ID_FIELD} for the airtable Sample: '
+            f'{", ".join(sorted(missing_participant_ids))}'
+        )
+    if missing_airtable:
+        errors.append(
+            f'The following entries are missing airtable metadata: '
+            f'{", ".join(sorted(missing_airtable))}'
+        )
+    warnings = [
+        f'The following entries are missing {data_type} airtable data: {", ".join(participants)}'
+        for data_type, participants in sorted(missing_airtable_data_types.items())
+    ]
+    warnings += [
+        f'The following entries have {data_type} airtable data but do not have equivalent loaded data in seqr, so airtable data is omitted: '
+        f'{", ".join(sorted(participants))}'
+        for data_type, participants in sorted(missing_seqr_data_types.items())
+    ]
+
     # Add experiment IDs
     for variant in genetic_findings_rows:
         variant['experiment_id'] = experiment_ids_by_participant.get(variant['participant_id'])
@@ -445,7 +481,7 @@ def _add_row(row, family_id, row_type):
         (FINDINGS_TABLE, genetic_findings_rows),
     ]
 
-    files, warnings, errors = _populate_gregor_files(file_data)
+    files = _populate_gregor_files(file_data, errors, warnings)
 
     if errors and not request_json.get('overrideValidation'):
         raise ErrorsWarningsException(errors, warnings)
@@ -494,8 +530,6 @@ def _parse_participant_airtable_rows(analyte, airtable_metadata, data_types, exp
     smids = analyte.pop(SMID_FIELD)
     # airtable data
     for data_type in data_types:
-        if data_type not in airtable_metadata:
-            continue
         is_rna, row = _get_airtable_row(data_type, airtable_metadata)
         smids = None
         analyte_rows.append({**analyte, **{k: row[k] for k in ANALYTE_TABLE_COLUMNS if k in row}})
@@ -658,9 +692,7 @@ def _validate_enumeration(val, validator):
 DATA_TYPE_FORMATTERS['float'] = DATA_TYPE_FORMATTERS['integer']
 
 
-def _populate_gregor_files(file_data):
-    errors = []
-    warnings = []
+def _populate_gregor_files(file_data, errors, warnings):
     try:
         table_configs, required_tables = _load_data_model_validators()
     except Exception as e:
@@ -715,7 +747,7 @@ def _populate_gregor_files(file_data):
         for column, config in table_config.items():
             _validate_column_data(column, file_name, data, column_validator=config, warnings=warnings, errors=errors)
 
-    return files, warnings, errors
+    return files
 
 
 def _load_data_model_validators():

diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
@@ -850,6 +850,9 @@ def _test_gregor_export(self, url, mock_subprocess, mock_temp_dir, mock_open, mo
         self.assertEqual(response.status_code, 400)
 
         recommended_warnings = [
+            'The following entries are missing RNA airtable data: NA19675',
+            'The following entries are missing WES airtable data: NA19675, NA19679',
+            'The following entries have WGS airtable data but do not have equivalent loaded data in seqr, so airtable data is omitted: NA19675, NA20888, VCGS_FAM203_621',
             'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881',
             'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
@@ -862,9 +865,11 @@ def _test_gregor_export(self, url, mock_subprocess, mock_temp_dir, mock_open, mo
             'The following columns are computed for the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, missing_variant_case, pmid_id',
         ] + recommended_warnings
         self.assertListEqual(response.json()['warnings'], validation_warnings)
+        missing_participant_error = 'The following participants are missing CollaboratorParticipantID for the airtable Sample: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881'
         validation_errors = [
             f'No data model found for "{file}" table' for file in reversed(EXPECTED_GREGOR_FILES) if file not in INVALID_MODEL_TABLES
         ] + [
+            missing_participant_error,
             'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set',
         ] + [
             'The following entries are missing required "prior_testing" in the "participant" table: Broad_HG00731, Broad_HG00732',
@@ -939,9 +944,10 @@ def _test_gregor_export(self, url, mock_subprocess, mock_temp_dir, mock_open, mo
         mock_subprocess.reset_mock()
         mock_open.reset_mock()
         responses.add(responses.GET, MOCK_DATA_MODEL_URL, body=MOCK_DATA_MODEL_RESPONSE, status=200)
+        body['overrideValidation'] = True
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 200)
-        expected_response['warnings'] = recommended_warnings
+        expected_response['warnings'] = [missing_participant_error] + recommended_warnings
         self.assertDictEqual(response.json(), expected_response)
         self._assert_expected_gregor_files(mock_open, mock_subprocess)
         self._test_expected_gregor_airtable_calls()
@@ -972,10 +978,12 @@ def _test_gregor_export(self, url, mock_subprocess, mock_temp_dir, mock_open, mo
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 200)
         expected_response['info'][0] = expected_response['info'][0].replace('9', '10')
-        expected_response['warnings'][0] = expected_response['warnings'][0] + ', Broad_NA20885, Broad_NA20889'
-        expected_response['warnings'][1] = expected_response['warnings'][1].replace(', Broad_NA20888', '')
-        expected_response['warnings'][2] = expected_response['warnings'][2].replace('Broad_NA20888', 'Broad_NA20885, Broad_NA20888, Broad_NA20889')
-        expected_response['warnings'][3] = expected_response['warnings'][3].replace('Broad_NA20888', 'Broad_NA20885, Broad_NA20888, Broad_NA20889')
+        expected_response['warnings'][0] = expected_response['warnings'][0].replace('Broad_NA20881', 'Broad_NA20881, Broad_NA20885, Broad_NA20889')
+        expected_response['warnings'][3] = expected_response['warnings'][3].replace(', NA20888', '')
+        expected_response['warnings'][4] = expected_response['warnings'][4] + ', Broad_NA20885, Broad_NA20889'
+        expected_response['warnings'][5] = expected_response['warnings'][5].replace(', Broad_NA20888', '')
+        expected_response['warnings'][6] = expected_response['warnings'][6].replace('Broad_NA20888', 'Broad_NA20885, Broad_NA20888, Broad_NA20889')
+        expected_response['warnings'][7] = expected_response['warnings'][7].replace('Broad_NA20888', 'Broad_NA20885, Broad_NA20888, Broad_NA20889')
         self.assertDictEqual(response.json(), expected_response)
         self._assert_expected_gregor_files(mock_open, mock_subprocess, has_second_project=True)
         self._test_expected_gregor_airtable_calls(additional_samples=['NA20885', 'NA20889'], additional_mondo_ids=['0008788'])