
Commit e7144d0

Merge pull request #4206 from broadinstitute/dev
Dev
hanars authored Jun 27, 2024
2 parents 3166cf6 + 64b0520 commit e7144d0
Showing 51 changed files with 306 additions and 169 deletions.
@@ -4,20 +4,20 @@ on:
     inputs:
       environment:
         type: choice
-        options:
+        options:
           - dev
           - prod
       reference_genome:
         type: choice
         description: Reference Genome
-        options:
+        options:
           - GRCh37
           - GRCh38
         required: true
       dataset_type:
         type: choice
         description: Dataset Type
-        options:
+        options:
           - SNV_INDEL
           - MITO
           - GCNV
@@ -28,6 +28,9 @@ on:
       volume_handle:
         required: true
 
+env:
+  CHART_NAME: "${{ inputs.environment == 'dev' && 'dev-' || '' }}broad-seqr"
+
 jobs:
   helm_update:
     runs-on: ubuntu-latest
@@ -46,13 +49,13 @@ jobs:
         uses: mikefarah/yq@v4.22.1
         with:
           cmd: >
-            yq -i '.global.hail_search.datasetVersions.${{ inputs.reference_genome }}/${{ inputs.dataset_type }} = "${{ inputs.version }}"' charts/broad-seqr/values-${{ inputs.environment }}.yaml
+            yq -i '.global.hail_search.datasetVersions.${{ inputs.reference_genome }}/${{ inputs.dataset_type }} = "${{ inputs.version }}"' charts/${{ env.CHART_NAME }}/values.yaml
       - name: update volume handle in the broad-seqr chart
         uses: mikefarah/yq@v4.22.1
         with:
           cmd: >
-            yq -i '.hail-search.persistentVolume.volumeHandle = "${{ inputs.volume_handle }}"' charts/broad-seqr/values-${{ inputs.environment }}.yaml
+            yq -i '.hail-search.persistentVolume.volumeHandle = "${{ inputs.volume_handle }}"' charts/${{ env.CHART_NAME }}/values.yaml
       - name: Commit and Push changes
         uses: Andro999b/push@v1.3
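The net effect of the workflow change: instead of a single broad-seqr chart with per-environment values files (values-dev.yaml, values-prod.yaml), each environment now resolves to its own chart directory with a single values.yaml, selected by the new CHART_NAME env var. A minimal Python sketch of how the GitHub Actions expression resolves (the function name is illustrative, not part of the PR):

# Sketch only: mirrors `${{ inputs.environment == 'dev' && 'dev-' || '' }}broad-seqr`.
# GitHub's `cond && a || b` idiom acts like a conditional expression here.
def chart_values_path(environment: str) -> str:
    chart_name = ('dev-' if environment == 'dev' else '') + 'broad-seqr'
    return f'charts/{chart_name}/values.yaml'

assert chart_values_path('dev') == 'charts/dev-broad-seqr/values.yaml'
assert chart_values_path('prod') == 'charts/broad-seqr/values.yaml'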
6 changes: 5 additions & 1 deletion hail_search/constants.py
@@ -15,6 +15,10 @@
 SPLICE_AI_FIELD = 'splice_ai'
 NEW_SV_FIELD = 'new_structural_variants'
 SCREEN_KEY = 'SCREEN'  # uses all caps to match filter provided by the seqr UI
+UTR_ANNOTATOR_KEY = 'UTRAnnotator'
+EXTENDED_SPLICE_KEY = 'extended_splice_site'
+MOTIF_FEATURES_KEY = 'motif_feature'
+REGULATORY_FEATURES_KEY = 'regulatory_feature'
 CLINVAR_KEY = 'clinvar'
 CLINVAR_MITO_KEY = 'clinvar_mito'
 HGMD_KEY = 'hgmd'
@@ -23,7 +27,7 @@
 GENOTYPES_FIELD = 'genotypes'
 
 ANNOTATION_OVERRIDE_FIELDS = [
-    SCREEN_KEY, SPLICE_AI_FIELD, NEW_SV_FIELD, STRUCTURAL_ANNOTATION_FIELD,
+    SCREEN_KEY, SPLICE_AI_FIELD, NEW_SV_FIELD, STRUCTURAL_ANNOTATION_FIELD, MOTIF_FEATURES_KEY, REGULATORY_FEATURES_KEY,
 ]
 ALLOWED_TRANSCRIPTS = 'allowed_transcripts'
 ALLOWED_SECONDARY_TRANSCRIPTS = 'allowed_transcripts_secondary'
2 binary files not shown.
@@ -1,3 +1,3 @@
 This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
-Written with version 0.2.126-ee77707f4fab
-Created at 2024/01/24 11:38:19
+Written with version 0.2.128-eead8100a1c1
+Created at 2024/06/27 14:14:27
8 binary files not shown.
@@ -1,3 +1,3 @@
 This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
 Written with version 0.2.128-eead8100a1c1
-Created at 2024/04/03 17:41:01
+Created at 2024/06/27 14:07:54
6 binary files not shown.
10 changes: 7 additions & 3 deletions hail_search/queries/base.py
@@ -148,16 +148,20 @@ def population_expression(self, r, population):
             for response_key, field in pop_config.items() if field is not None
         })
 
-    def _get_enum_lookup(self, field, subfield):
+    def _get_enum_lookup(self, field, subfield, nested_subfield=None):
         enum_field = self._enums.get(field, {})
         if subfield:
             enum_field = enum_field.get(subfield)
+            if nested_subfield:
+                enum_field = enum_field.get(nested_subfield)
         if enum_field is None:
             return None
         return {v: i for i, v in enumerate(enum_field)}
 
-    def _get_enum_terms_ids(self, field, subfield, terms):
-        enum = self._get_enum_lookup(field, subfield)
+    def _get_enum_terms_ids(self, field, subfield, terms, nested_subfield=None):
+        if not terms:
+            return set()
+        enum = self._get_enum_lookup(field, subfield, nested_subfield=nested_subfield)
         return {enum[t] for t in terms if enum.get(t) is not None}
 
     def _format_enum_response(self, k, enum):
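For context, _get_enum_lookup inverts an enum's ordered value list into a value-to-index map, and the new nested_subfield argument reaches one level deeper into the enum metadata (used below for the UTRAnnotator enums nested under transcripts); _get_enum_terms_ids now also short-circuits on empty terms. A self-contained sketch of the behavior — the enum data here is invented, only the field names mirror the diff:

ENUMS = {
    'sorted_transcript_consequences': {
        'consequence_term': ['missense_variant', 'intron_variant'],
        'utrannotator': {'fiveutr_consequence': ['5_prime_UTR_stop_codon_loss_variant']},
    },
}

def get_enum_lookup(field, subfield, nested_subfield=None):
    enum_field = ENUMS.get(field, {})
    if subfield:
        enum_field = enum_field.get(subfield)
        if nested_subfield:
            enum_field = enum_field.get(nested_subfield)
    if enum_field is None:
        return None
    return {v: i for i, v in enumerate(enum_field)}  # value -> index id

def get_enum_terms_ids(field, subfield, terms, nested_subfield=None):
    if not terms:  # new early return: no terms, no lookup
        return set()
    enum = get_enum_lookup(field, subfield, nested_subfield=nested_subfield)
    return {enum[t] for t in terms if enum.get(t) is not None}

assert get_enum_terms_ids(
    'sorted_transcript_consequences', 'utrannotator',
    ['5_prime_UTR_stop_codon_loss_variant'], nested_subfield='fiveutr_consequence',
) == {0}
assert get_enum_terms_ids('sorted_transcript_consequences', 'consequence_term', []) == set()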
2 changes: 1 addition & 1 deletion hail_search/queries/mito.py
@@ -331,7 +331,7 @@ def _add_project_lookup_data(self, ht, annotation_fields, *args, **kwargs):
 
         annotation_fields.update({
             'familyGenotypes': lambda r: hl.dict(r.family_entries.map(
-                lambda entries: (entries.first().familyGuid, entries.map(self._get_sample_genotype))
+                lambda entries: (entries.first().familyGuid, entries.filter(hl.is_defined).map(self._get_sample_genotype))
             )),
         })
 
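The one-line fix filters undefined entries out of each family's genotype list before _get_sample_genotype runs. A pure-Python analogue with invented data (None standing in for a Hail missing value):

family_entries = [{'familyGuid': 'F000002_2'}, None, {'familyGuid': 'F000002_2'}]

def get_sample_genotype(entry):
    return entry['familyGuid']  # stand-in; would raise TypeError on None

# before: [get_sample_genotype(e) for e in family_entries] -> TypeError
genotypes = [get_sample_genotype(e) for e in family_entries if e is not None]
assert genotypes == ['F000002_2', 'F000002_2']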
11 changes: 10 additions & 1 deletion hail_search/queries/ont_snv_indel.py
@@ -1,5 +1,6 @@
 from aiohttp.web import HTTPBadRequest
 
+from hail_search.constants import EXTENDED_SPLICE_KEY, UTR_ANNOTATOR_KEY, SCREEN_KEY
 from hail_search.queries.base import BaseHailTableQuery
 from hail_search.queries.snv_indel import SnvIndelHailTableQuery
 from hail_search.queries.snv_indel_37 import SnvIndelHailTableQuery37
@@ -10,13 +11,21 @@ class OntSnvIndelHailTableQuery(SnvIndelHailTableQuery):
     DATA_TYPE = 'ONT_SNV_INDEL'
 
     CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS
+    ANNOTATION_OVERRIDE_FIELDS = SnvIndelHailTableQuery37.ANNOTATION_OVERRIDE_FIELDS + [SCREEN_KEY]
 
     def _get_loaded_filter_ht(self, *args, **kwargs):
         return None
 
     def _add_project_lookup_data(self, *args, **kwargs):
         raise HTTPBadRequest(reason='Variant lookup is not supported for ONT data')
 
+    def _get_allowed_consequence_ids(self, annotations):
+        return super()._get_allowed_consequence_ids({
+            k: v for k, v in annotations.items() if k not in {EXTENDED_SPLICE_KEY, UTR_ANNOTATOR_KEY}
+        })
+
     @staticmethod
     def _get_allowed_transcripts_filter(allowed_consequence_ids):
-        return SnvIndelHailTableQuery37._get_allowed_transcripts_filter(allowed_consequence_ids)
+        return SnvIndelHailTableQuery37._get_allowed_transcripts_filter(
+            allowed_consequence_ids.get(SnvIndelHailTableQuery37.TRANSCRIPT_CONSEQUENCE_FIELD)
+        )
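Because _get_allowed_consequence_ids now returns a dict keyed by consequence source (see snv_indel.py below), the ONT subclass strips the keys its data cannot support before delegating to the parent, and unwraps only the transcript-consequence entry when building the transcript filter. A sketch of the key-stripping step with an invented annotations payload:

EXTENDED_SPLICE_KEY = 'extended_splice_site'
UTR_ANNOTATOR_KEY = 'UTRAnnotator'

annotations = {
    'missense': ['missense_variant'],  # invented example input
    UTR_ANNOTATOR_KEY: ['5_prime_UTR_stop_codon_loss_variant'],
    EXTENDED_SPLICE_KEY: ['extended_intronic_splice_region_variant'],
}
ont_annotations = {
    k: v for k, v in annotations.items() if k not in {EXTENDED_SPLICE_KEY, UTR_ANNOTATOR_KEY}
}
assert ont_annotations == {'missense': ['missense_variant']}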
58 changes: 46 additions & 12 deletions hail_search/queries/snv_indel.py
@@ -1,7 +1,8 @@
 from collections import OrderedDict
 import hail as hl
 
-from hail_search.constants import GENOME_VERSION_GRCh38, SCREEN_KEY, PREFILTER_FREQ_CUTOFF, ALPHAMISSENSE_SORT
+from hail_search.constants import GENOME_VERSION_GRCh38, SCREEN_KEY, PREFILTER_FREQ_CUTOFF, ALPHAMISSENSE_SORT, \
+    UTR_ANNOTATOR_KEY, EXTENDED_SPLICE_KEY, MOTIF_FEATURES_KEY, REGULATORY_FEATURES_KEY
 from hail_search.queries.base import BaseHailTableQuery, PredictionPath
 from hail_search.queries.snv_indel_37 import SnvIndelHailTableQuery37
 
@@ -19,7 +20,9 @@ class SnvIndelHailTableQuery(SnvIndelHailTableQuery37):
         'gnomad_noncoding': PredictionPath('gnomad_non_coding_constraint', 'z_score'),
     }
     LIFTOVER_ANNOTATION_FIELDS = BaseHailTableQuery.LIFTOVER_ANNOTATION_FIELDS
-    ANNOTATION_OVERRIDE_FIELDS = SnvIndelHailTableQuery37.ANNOTATION_OVERRIDE_FIELDS + [SCREEN_KEY]
+    ANNOTATION_OVERRIDE_FIELDS = SnvIndelHailTableQuery37.ANNOTATION_OVERRIDE_FIELDS + [
+        SCREEN_KEY, MOTIF_FEATURES_KEY, REGULATORY_FEATURES_KEY,
+    ]
     FREQUENCY_PREFILTER_FIELDS = OrderedDict([
         (True, PREFILTER_FREQ_CUTOFF),
         ('is_gt_3_percent', 0.03),
@@ -37,21 +40,42 @@ class SnvIndelHailTableQuery(SnvIndelHailTableQuery37):
     }
 
     def _get_allowed_consequence_ids(self, annotations):
-        consequence_ids = super()._get_allowed_consequence_ids(annotations)
-        if EXTENDED_SPLICE_REGION_CONSEQUENCE in (annotations.get('extended_splice_site') or []):
-            consequence_ids.add(EXTENDED_SPLICE_REGION_CONSEQUENCE)
-        return consequence_ids
+        parsed_allowed_consequences = {}
+        allowed_consequence_ids = super()._get_allowed_consequence_ids(annotations)
+        if allowed_consequence_ids:
+            parsed_allowed_consequences[self.TRANSCRIPT_CONSEQUENCE_FIELD] = allowed_consequence_ids
+
+        utr_consequence_ids = self._get_enum_terms_ids(
+            self.TRANSCRIPTS_FIELD, subfield='utrannotator', nested_subfield='fiveutr_consequence',
+            terms=(annotations.get(UTR_ANNOTATOR_KEY) or []),
+        )
+        if utr_consequence_ids:
+            parsed_allowed_consequences[UTR_ANNOTATOR_KEY] = utr_consequence_ids
+
+        if EXTENDED_SPLICE_REGION_CONSEQUENCE in (annotations.get(EXTENDED_SPLICE_KEY) or []):
+            parsed_allowed_consequences[EXTENDED_SPLICE_REGION_CONSEQUENCE] = True
+
+        return parsed_allowed_consequences
 
     @staticmethod
     def _get_allowed_transcripts_filter(allowed_consequence_ids):
-        has_extended_splice = EXTENDED_SPLICE_REGION_CONSEQUENCE in allowed_consequence_ids
-        allowed_consequence_ids = allowed_consequence_ids - {EXTENDED_SPLICE_REGION_CONSEQUENCE}
-        allowed_consequence_filter = SnvIndelHailTableQuery37._get_allowed_transcripts_filter(allowed_consequence_ids)
+        allowed_consequence_filters = []
 
-        if not has_extended_splice:
-            return allowed_consequence_filter
+        consequence_ids = allowed_consequence_ids.get(SnvIndelHailTableQuery37.TRANSCRIPT_CONSEQUENCE_FIELD)
+        if consequence_ids:
+            allowed_consequence_filters.append(SnvIndelHailTableQuery37._get_allowed_transcripts_filter(consequence_ids))
 
-        return lambda tc: allowed_consequence_filter(tc) | tc.spliceregion.extended_intronic_splice_region_variant
+        utr_consequences = allowed_consequence_ids.get(UTR_ANNOTATOR_KEY)
+        if utr_consequences:
+            utr_consequences = hl.set(utr_consequences)
+            allowed_consequence_filters.append(lambda tc: utr_consequences.contains(tc.utrannotator.fiveutr_consequence_id))
+
+        if allowed_consequence_ids.get(EXTENDED_SPLICE_REGION_CONSEQUENCE):
+            allowed_consequence_filters.append(lambda tc: tc.spliceregion.extended_intronic_splice_region_variant)
+
+        return allowed_consequence_filters[0] if len(allowed_consequence_filters) == 1 else lambda tc: hl.any([
+            f(tc) for f in allowed_consequence_filters
+        ])
 
     def _get_annotation_override_filters(self, ht, annotation_overrides):
         annotation_filters = super()._get_annotation_override_filters(ht, annotation_overrides)
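The transcript filter now ORs together up to three independent predicates (VEP consequence ids, UTRAnnotator five-prime-UTR consequences, extended intronic splice region), returning the lone predicate unwrapped when only one applies. A pure-Python sketch of that combination step, with plain dicts and any() standing in for Hail struct expressions and hl.any:

def combine_filters(filters):
    # mirrors: filters[0] if len(filters) == 1 else lambda tc: hl.any([f(tc) for f in filters])
    return filters[0] if len(filters) == 1 else lambda tc: any(f(tc) for f in filters)

is_missense = lambda tc: tc['consequence'] == 'missense_variant'
in_extended_splice = lambda tc: tc['extended_splice']

combined = combine_filters([is_missense, in_extended_splice])
assert combined({'consequence': 'intron_variant', 'extended_splice': True})
assert not combined({'consequence': 'intron_variant', 'extended_splice': False})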
@@ -60,4 +84,14 @@ def _get_annotation_override_filters(self, ht, annotation_overrides):
             allowed_consequences = hl.set(self._get_enum_terms_ids(SCREEN_KEY.lower(), 'region_type', annotation_overrides[SCREEN_KEY]))
             annotation_filters.append(allowed_consequences.contains(ht.screen.region_type_ids.first()))
 
+        for feature_key in [MOTIF_FEATURES_KEY, REGULATORY_FEATURES_KEY]:
+            if annotation_overrides.get(feature_key):
+                field = f'sorted_{feature_key}_consequences'
+                allowed_consequences = hl.set(self._get_enum_terms_ids(
+                    field, self.TRANSCRIPT_CONSEQUENCE_FIELD, annotation_overrides[feature_key]),
+                )
+                annotation_filters.append(
+                    ht[field].any(lambda c: c.consequence_term_ids.any(allowed_consequences.contains))
+                )
+
         return annotation_filters
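Each new feature override mirrors the SCREEN filter one level deeper: requested terms are resolved to enum ids, then a row passes if any struct in its sorted_motif_feature_consequences / sorted_regulatory_feature_consequences array carries an allowed term id. A pure-Python sketch with invented ids and data:

allowed_ids = {3}  # invented enum id, e.g. for 'TF_binding_site_variant'
row = {
    'sorted_motif_feature_consequences': [
        {'consequence_term_ids': [1, 3]},
        {'consequence_term_ids': [2]},
    ],
}
# analogue of ht[field].any(lambda c: c.consequence_term_ids.any(allowed_consequences.contains))
passes = any(
    any(term_id in allowed_ids for term_id in c['consequence_term_ids'])
    for c in row['sorted_motif_feature_consequences']
)
assert passes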
13 changes: 8 additions & 5 deletions hail_search/test_search.py
@@ -82,9 +82,6 @@
     'I000004_hg00731': {
         'sampleId': 'HG00731', 'sampleType': 'WGS', 'individualGuid': 'I000004_hg00731',
         'familyGuid': 'F000002_2', 'numAlt': 2, 'dp': 16, 'gq': 48, 'ab': 1,
-    }, 'I000005_hg00732': {
-        'sampleId': 'HG00732', 'sampleType': 'WGS', 'individualGuid': 'I000005_hg00732',
-        'familyGuid': 'F000002_2', 'numAlt': 0, 'dp': 2, 'gq': 6, 'ab': 0,
     }, 'I000006_hg00733': {
         'sampleId': 'HG00733', 'sampleType': 'WGS', 'individualGuid': 'I000006_hg00733',
         'familyGuid': 'F000002_2', 'numAlt': 1, 'dp': 49, 'gq': 99, 'ab': 0.6530612111091614,
@@ -837,9 +834,10 @@ async def test_annotations_filter(self):
         )
 
         pathogenicity['clinvar'] = pathogenicity['clinvar'][:1]
-        annotations = {'SCREEN': ['CTCF-only', 'DNase-only']}
+        annotations = {'SCREEN': ['CTCF-only', 'DNase-only'], 'UTRAnnotator': ['5_prime_UTR_stop_codon_loss_variant']}
+        selected_transcript_variant_2 = {**VARIANT2, 'selectedMainTranscriptId': 'ENST00000408919'}
         await self._assert_expected_search(
-            [VARIANT1, VARIANT4, MITO_VARIANT3], pathogenicity=pathogenicity, annotations=annotations,
+            [VARIANT1, selected_transcript_variant_2, VARIANT4, MITO_VARIANT3], pathogenicity=pathogenicity, annotations=annotations,
             sample_data=FAMILY_2_ALL_SAMPLE_DATA,
         )
 
@@ -896,6 +894,11 @@ async def test_annotations_filter(self):
             pathogenicity=pathogenicity, annotations=annotations, sample_data=FAMILY_2_ALL_SAMPLE_DATA,
         )
 
+        annotations = {'motif_feature': ['TF_binding_site_variant'], 'regulatory_feature': ['regulatory_region_variant']}
+        await self._assert_expected_search(
+            [VARIANT3, VARIANT4], annotations=annotations, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA,
+        )
+
     async def test_secondary_annotations_filter(self):
         annotations_1 = {'missense': ['missense_variant']}
         annotations_2 = {'other': ['intron_variant']}
4 changes: 2 additions & 2 deletions requirements-dev.txt
@@ -53,7 +53,7 @@ pytz==2022.7.1
     # django
 rcssmin==1.1.1
     # via django-compressor
-requests==2.32.0
+requests==2.32.2
     # via
     #   -c requirements.txt
     #   responses
@@ -74,7 +74,7 @@ tomli==2.0.1
     #   pyproject-hooks
 types-toml==0.10.8.5
     # via responses
-urllib3==1.26.18
+urllib3==1.26.19
     # via
     #   -c requirements.txt
     #   requests
2 changes: 1 addition & 1 deletion requirements.txt
@@ -175,7 +175,7 @@ tenacity==8.3.0
     # via -r requirements.in
 tqdm==4.66.3
     # via -r requirements.in
-urllib3==1.26.18
+urllib3==1.26.19
     # via
     #   elasticsearch
     #   requests
1 change: 1 addition & 0 deletions seqr/views/apis/analysis_group_api.py
@@ -55,6 +55,7 @@ def update_analysis_group_handler(request, project_guid, analysis_group_guid=None):
     valid_families = set()
 
     def _validate_families(request_json):
+        request_json.pop('uploadedFamilyIds', None)
         family_guids = request_json.pop('familyGuids')
         families = Family.objects.filter(guid__in=family_guids).only('guid')
         if len(families) != len(family_guids):
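The fix discards the uploadedFamilyIds bookkeeping field the frontend now sends alongside familyGuids, so only validated fields survive into the model update. A minimal sketch of the pop behavior, with the payload shape taken from the test below:

request_json = {
    'name': 'new_analysis_group',
    'familyGuids': ['F000001_1', 'F000002_2'],
    'uploadedFamilyIds': {'info': ['Uploaded 2 families'], 'parsedData': [['F000001_1'], ['F000002_2']]},
}
request_json.pop('uploadedFamilyIds', None)  # discard client-side bookkeeping
family_guids = request_json.pop('familyGuids')  # consumed for validation
assert request_json == {'name': 'new_analysis_group'}
assert family_guids == ['F000001_1', 'F000002_2']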
4 changes: 3 additions & 1 deletion seqr/views/apis/analysis_group_api_tests.py
@@ -30,7 +30,9 @@ def test_create_update_and_delete_analysis_group(self):
 
         # send valid request to create analysis_group
         response = self.client.post(create_analysis_group_url, content_type='application/json', data=json.dumps({
-            'name': 'new_analysis_group', 'familyGuids': ['F000001_1', 'F000002_2']
+            'name': 'new_analysis_group', 'familyGuids': ['F000001_1', 'F000002_2'], 'uploadedFamilyIds': {
+                'info': ["Uploaded 2 families"], 'parsedData': [['F000001_1'], ['F000002_2']],
+            },
         }))
         self.assertEqual(response.status_code, 200)
         new_analysis_group_response = response.json()
2 changes: 1 addition & 1 deletion seqr/views/apis/data_manager_api.py
@@ -300,7 +300,7 @@ def _save_sample_data(sample_key, sample_data):
     )
 
     if sample_guids_to_keys:
-        persist_temp_file(file_name_prefix, request.user, is_directory=True)
+        persist_temp_file(file_name_prefix, request.user)
 
     return create_json_response({
         'info': info,
2 changes: 1 addition & 1 deletion seqr/views/apis/data_manager_api_tests.py
@@ -1525,7 +1525,7 @@ def _get_expected_read_file_subprocess_calls(self, file_name, sample_guid):
 
     @staticmethod
     def _additional_expected_loading_subprocess_calls(file_path):
-        return [f'gsutil mv tmp/temp_uploads/{file_path}/* gs://seqr-scratch-temp/{file_path}']
+        return [f'gsutil mv tmp/temp_uploads/{file_path} gs://seqr-scratch-temp/{file_path}']
 
     def _assert_expected_es_status(self, response):
         self.assertEqual(response.status_code, 400)
22 changes: 17 additions & 5 deletions seqr/views/apis/summary_data_api.py
@@ -182,19 +182,31 @@ def bulk_update_family_external_analysis(request):
 
 def _load_aip_data(data: dict, user: User):
     category_map = data['metadata']['categories']
+    projects = data['metadata'].get('projects')
     results = data['results']
 
+    if not projects:
+        raise ErrorsWarningsException(['No projects specified in the metadata'])
+
-    family_id_map = dict(Individual.objects.filter(
-        family__project__in=get_internal_projects(), individual_id__in=results.keys(),
-    ).values_list('individual_id', 'family_id'))
+    family_id_map = defaultdict(list)
+    for individual_id, family_id in Individual.objects.filter(
+        family__project__in=get_internal_projects().filter(name__in=projects), individual_id__in=results.keys(),
+    ).values_list('individual_id', 'family_id'):
+        family_id_map[individual_id].append(family_id)
+    errors = []
     missing_individuals = set(results.keys()) - set(family_id_map.keys())
     if missing_individuals:
-        raise ErrorsWarningsException([f'Unable to find the following individuals: {", ".join(sorted(missing_individuals))}'])
+        errors.append(f'Unable to find the following individuals: {", ".join(sorted(missing_individuals))}')
+    multi_family_individuals = {individual_id for individual_id, families in family_id_map.items() if len(families) > 1}
+    if multi_family_individuals:
+        errors.append(f'The following individuals are found in multiple families: {", ".join(sorted(multi_family_individuals))}')
+    if errors:
+        raise ErrorsWarningsException(errors)
 
     family_variant_data = {}
     for family_id, variant_pred in results.items():
         family_variant_data.update({
-            (family_id_map[family_id], variant_id): pred for variant_id, pred in variant_pred.items()
+            (family_id_map[family_id][0], variant_id): pred for variant_id, pred in variant_pred.items()
         })
 
     today = datetime.now().strftime('%Y-%m-%d')
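The loader now scopes the individual lookup to the projects named in the upload metadata, collects all validation errors before raising, and rejects individuals whose id resolves to more than one family (previously an ambiguous id silently mapped to a single family). A self-contained sketch of the mapping and error collection, with invented query results:

from collections import defaultdict

# (individual_id, family_id) pairs standing in for the Django values_list query
query_results = [('NA20870', 10), ('NA20870', 11), ('NA20889', 12)]
results = {'NA20870': {}, 'NA20889': {}, 'SAM_123': {}}

family_id_map = defaultdict(list)
for individual_id, family_id in query_results:
    family_id_map[individual_id].append(family_id)

errors = []
missing = set(results) - set(family_id_map)
if missing:
    errors.append(f'Unable to find the following individuals: {", ".join(sorted(missing))}')
multi = {i for i, families in family_id_map.items() if len(families) > 1}
if multi:
    errors.append(f'The following individuals are found in multiple families: {", ".join(sorted(multi))}')

assert errors == [
    'Unable to find the following individuals: SAM_123',
    'The following individuals are found in multiple families: NA20870',
]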
12 changes: 11 additions & 1 deletion seqr/views/apis/summary_data_api_tests.py
@@ -505,9 +505,19 @@ def test_bulk_update_family_external_analysis(self, mock_load_uploaded_file, moc
         body['dataType'] = 'AIP'
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 400)
+        self.assertEqual(response.json()['errors'], ['No projects specified in the metadata'])
+
+        aip_upload['metadata']['projects'] = ['1kg project nåme with uniçøde', 'Test Reprocessed Project']
+        response = self.client.post(url, content_type='application/json', data=json.dumps(body))
+        self.assertEqual(response.status_code, 400)
         self.assertEqual(response.json()['errors'], ['Unable to find the following individuals: SAM_123'])
 
-        aip_upload['results']['NA20889'] = aip_upload['results'].pop('SAM_123')
+        aip_upload['results']['NA20870'] = aip_upload['results'].pop('SAM_123')
+        response = self.client.post(url, content_type='application/json', data=json.dumps(body))
+        self.assertEqual(response.status_code, 400)
+        self.assertEqual(response.json()['errors'], ['The following individuals are found in multiple families: NA20870'])
+
+        aip_upload['results']['NA20889'] = aip_upload['results'].pop('NA20870')
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 400)
         self.assertEqual(response.json()['errors'], [