From 3530f06ac635d2c89d0a67ec00126a7407f99ed8 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 21 Jan 2025 13:11:21 -0500 Subject: [PATCH] group lookup calls by chromosome --- .../check_for_new_samples_from_pipeline.py | 20 ++++++++++++------- ...eck_for_new_samples_from_pipeline_tests.py | 2 +- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index a05c36cee6..531aec0f76 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -338,13 +338,19 @@ def _reload_shared_variant_annotations(data_type, genome_version, updated_varian } fetch_variant_ids = set(variants_by_id.keys()) - set(updated_variants_by_id.keys()) if fetch_variant_ids: - if not is_sv: - fetch_variant_ids = [parse_valid_variant_id(variant_id) for variant_id in fetch_variant_ids] - fetch_variant_ids.sort() - for i in range(0, len(fetch_variant_ids), MAX_LOOKUP_VARIANTS): - updated_variants = hail_variant_multi_lookup(USER_EMAIL, fetch_variant_ids[i:i+MAX_LOOKUP_VARIANTS], data_type, genome_version) - logger.info(f'Fetched {len(updated_variants)} additional variants') - updated_variants_by_id.update({variant['variantId']: variant for variant in updated_variants}) + if is_sv: + variant_ids_by_chrom = {'all': fetch_variant_ids} + else: + variant_ids_by_chrom = defaultdict(list) + for variant_id in fetch_variant_ids: + parsed_id = parse_valid_variant_id(variant_id) + variant_ids_by_chrom[parsed_id[0]].append(parsed_id) + for chrom, variant_ids in variant_ids_by_chrom.items(): + variant_ids = sorted(variant_ids) + for i in range(0, len(variant_ids), MAX_LOOKUP_VARIANTS): + updated_variants = hail_variant_multi_lookup(USER_EMAIL, variant_ids[i:i+MAX_LOOKUP_VARIANTS], data_type, genome_version) + logger.info(f'Fetched {len(updated_variants)} additional variants in chromosome {chrom}') + updated_variants_by_id.update({variant['variantId']: variant for variant in updated_variants}) updated_variant_models = [] for variant_id, variant in updated_variants_by_id.items(): diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 5d92f0f28c..60a336f82b 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -423,7 +423,7 @@ def test_command(self, mock_email, mock_airtable_utils, mock_open_write_file, mo {'individual_guid': 'I000017_na20889', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20889', 'sample_type': 'WES'}, ]}}, ], reload_annotations_logs=[ - 'Reloading shared annotations for 3 SNV_INDEL GRCh38 saved variants (3 unique)', 'Fetched 1 additional variants', 'Fetched 1 additional variants', 'Updated 2 SNV_INDEL GRCh38 saved variants', + 'Reloading shared annotations for 3 SNV_INDEL GRCh38 saved variants (3 unique)', 'Fetched 1 additional variants in chromosome 1', 'Fetched 1 additional variants in chromosome 1', 'Updated 2 SNV_INDEL GRCh38 saved variants', 'No additional SV_WES GRCh38 saved variants to update', ], run_loading_logs={ 'GRCh38/SNV_INDEL': 'Loading 4 WES SNV_INDEL samples in 2 projects',