Skip to content

Commit

Permalink
Merge pull request #4487 from broadinstitute/gregor-airtable-warnings
Browse files: browse the repository at this point in the history
Better gregor missing airtable warnings
  • Loading branch information
hanars authored Nov 19, 2024
2 parents b81fd5f + a2a327b commit 51e0ec0
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 15 deletions.
52 changes: 42 additions & 10 deletions seqr/views/apis/report_api.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -416,21 +416,57 @@ def _add_row(row, family_id, row_type):
airtable_rows = {table: [] for table in AIRTABLE_TABLE_COLUMNS.keys()}
experiment_lookup_rows = []
experiment_ids_by_participant = {}
missing_participant_ids = []
missing_airtable = []
missing_airtable_data_types = defaultdict(list)
missing_seqr_data_types = defaultdict(list)
for participant in participant_rows:
phenotype_rows += _parse_participant_phenotype_rows(participant)
analyte = {k: participant.pop(k) for k in [SMID_FIELD, *ANALYTE_TABLE_COLUMNS[2:]]}
analyte['participant_id'] = participant['participant_id']

if not participant[PARTICIPANT_ID_FIELD]:
missing_participant_ids.append(participant['participant_id'])
continue

airtable_metadata = airtable_metadata_by_participant.get(participant.pop(PARTICIPANT_ID_FIELD)) or {}
data_types = grouped_data_type_individuals[participant['participant_id']]
airtable_participant_id = participant.pop(PARTICIPANT_ID_FIELD)
airtable_metadata = airtable_metadata_by_participant.get(airtable_participant_id)
if not airtable_metadata:
missing_airtable.append(airtable_participant_id)
continue

seqr_data_types = set(grouped_data_type_individuals[participant['participant_id']].keys())
airtable_data_types = {dt.upper() for dt in GREGOR_DATA_TYPES if dt.upper() in airtable_metadata}
for data_type in seqr_data_types - airtable_data_types:
missing_airtable_data_types[data_type].append(airtable_participant_id)
for data_type in airtable_data_types - seqr_data_types:
missing_seqr_data_types[data_type].append(airtable_participant_id)
_parse_participant_airtable_rows(
analyte, airtable_metadata, data_types, experiment_ids_by_participant,
analyte, airtable_metadata, seqr_data_types.intersection(airtable_data_types), experiment_ids_by_participant,
analyte_rows, airtable_rows, experiment_lookup_rows,
)

errors = []
if missing_participant_ids:
errors.append(
f'The following participants are missing {PARTICIPANT_ID_FIELD} for the airtable Sample: '
f'{", ".join(sorted(missing_participant_ids))}'
)
if missing_airtable:
errors.append(
f'The following entries are missing airtable metadata: '
f'{", ".join(sorted(missing_airtable))}'
)
warnings = [
f'The following entries are missing {data_type} airtable data: {", ".join(participants)}'
for data_type, participants in sorted(missing_airtable_data_types.items())
]
warnings += [
f'The following entries have {data_type} airtable data but do not have equivalent loaded data in seqr, so airtable data is omitted: '
f'{", ".join(sorted(participants))}'
for data_type, participants in sorted(missing_seqr_data_types.items())
]

# Add experiment IDs
for variant in genetic_findings_rows:
variant['experiment_id'] = experiment_ids_by_participant.get(variant['participant_id'])
Expand All @@ -445,7 +481,7 @@ def _add_row(row, family_id, row_type):
(FINDINGS_TABLE, genetic_findings_rows),
]

files, warnings, errors = _populate_gregor_files(file_data)
files = _populate_gregor_files(file_data, errors, warnings)

if errors and not request_json.get('overrideValidation'):
raise ErrorsWarningsException(errors, warnings)
Expand Down Expand Up @@ -494,8 +530,6 @@ def _parse_participant_airtable_rows(analyte, airtable_metadata, data_types, exp
smids = analyte.pop(SMID_FIELD)
# airtable data
for data_type in data_types:
if data_type not in airtable_metadata:
continue
is_rna, row = _get_airtable_row(data_type, airtable_metadata)
smids = None
analyte_rows.append({**analyte, **{k: row[k] for k in ANALYTE_TABLE_COLUMNS if k in row}})
Expand Down Expand Up @@ -658,9 +692,7 @@ def _validate_enumeration(val, validator):
DATA_TYPE_FORMATTERS['float'] = DATA_TYPE_FORMATTERS['integer']


def _populate_gregor_files(file_data):
errors = []
warnings = []
def _populate_gregor_files(file_data, errors, warnings):
try:
table_configs, required_tables = _load_data_model_validators()
except Exception as e:
Expand Down Expand Up @@ -715,7 +747,7 @@ def _populate_gregor_files(file_data):
for column, config in table_config.items():
_validate_column_data(column, file_name, data, column_validator=config, warnings=warnings, errors=errors)

return files, warnings, errors
return files


def _load_data_model_validators():
Expand Down
18 changes: 13 additions & 5 deletions seqr/views/apis/report_api_tests.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -850,6 +850,9 @@ def _test_gregor_export(self, url, mock_subprocess, mock_temp_dir, mock_open, mo
self.assertEqual(response.status_code, 400)

recommended_warnings = [
'The following entries are missing RNA airtable data: NA19675',
'The following entries are missing WES airtable data: NA19675, NA19679',
'The following entries have WGS airtable data but do not have equivalent loaded data in seqr, so airtable data is omitted: NA19675, NA20888, VCGS_FAM203_621',
'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881',
'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
Expand All @@ -862,9 +865,11 @@ def _test_gregor_export(self, url, mock_subprocess, mock_temp_dir, mock_open, mo
'The following columns are computed for the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, missing_variant_case, pmid_id',
] + recommended_warnings
self.assertListEqual(response.json()['warnings'], validation_warnings)
missing_participant_error = 'The following participants are missing CollaboratorParticipantID for the airtable Sample: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881'
validation_errors = [
f'No data model found for "{file}" table' for file in reversed(EXPECTED_GREGOR_FILES) if file not in INVALID_MODEL_TABLES
] + [
missing_participant_error,
'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set',
] + [
'The following entries are missing required "prior_testing" in the "participant" table: Broad_HG00731, Broad_HG00732',
Expand Down Expand Up @@ -939,9 +944,10 @@ def _test_gregor_export(self, url, mock_subprocess, mock_temp_dir, mock_open, mo
mock_subprocess.reset_mock()
mock_open.reset_mock()
responses.add(responses.GET, MOCK_DATA_MODEL_URL, body=MOCK_DATA_MODEL_RESPONSE, status=200)
body['overrideValidation'] = True
response = self.client.post(url, content_type='application/json', data=json.dumps(body))
self.assertEqual(response.status_code, 200)
expected_response['warnings'] = recommended_warnings
expected_response['warnings'] = [missing_participant_error] + recommended_warnings
self.assertDictEqual(response.json(), expected_response)
self._assert_expected_gregor_files(mock_open, mock_subprocess)
self._test_expected_gregor_airtable_calls()
Expand Down Expand Up @@ -972,10 +978,12 @@ def _test_gregor_export(self, url, mock_subprocess, mock_temp_dir, mock_open, mo
response = self.client.post(url, content_type='application/json', data=json.dumps(body))
self.assertEqual(response.status_code, 200)
expected_response['info'][0] = expected_response['info'][0].replace('9', '10')
expected_response['warnings'][0] = expected_response['warnings'][0] + ', Broad_NA20885, Broad_NA20889'
expected_response['warnings'][1] = expected_response['warnings'][1].replace(', Broad_NA20888', '')
expected_response['warnings'][2] = expected_response['warnings'][2].replace('Broad_NA20888', 'Broad_NA20885, Broad_NA20888, Broad_NA20889')
expected_response['warnings'][3] = expected_response['warnings'][3].replace('Broad_NA20888', 'Broad_NA20885, Broad_NA20888, Broad_NA20889')
expected_response['warnings'][0] = expected_response['warnings'][0].replace('Broad_NA20881', 'Broad_NA20881, Broad_NA20885, Broad_NA20889')
expected_response['warnings'][3] = expected_response['warnings'][3].replace(', NA20888', '')
expected_response['warnings'][4] = expected_response['warnings'][4] + ', Broad_NA20885, Broad_NA20889'
expected_response['warnings'][5] = expected_response['warnings'][5].replace(', Broad_NA20888', '')
expected_response['warnings'][6] = expected_response['warnings'][6].replace('Broad_NA20888', 'Broad_NA20885, Broad_NA20888, Broad_NA20889')
expected_response['warnings'][7] = expected_response['warnings'][7].replace('Broad_NA20888', 'Broad_NA20885, Broad_NA20888, Broad_NA20889')
self.assertDictEqual(response.json(), expected_response)
self._assert_expected_gregor_files(mock_open, mock_subprocess, has_second_project=True)
self._test_expected_gregor_airtable_calls(additional_samples=['NA20885', 'NA20889'], additional_mondo_ids=['0008788'])
Expand Down

0 comments on commit 51e0ec0

Please sign in to comment.