Merge pull request #4494 from broadinstitute/dev

Dev
broadinstitute · Nov 21, 2024 · ef13b05 · ef13b05
2 parents d4df2c2 + 95d0bfd
commit ef13b05
Show file tree

Hide file tree

Showing 58 changed files with 610 additions and 409 deletions.
diff --git a/.codacy.yaml b/.codacy.yaml
@@ -0,0 +1,3 @@
+---
+exclude_paths:
+  - "deploy/postgres/initdb.sql"
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ## dev
 
+## 11/21/24
+* Migrate "Submit to Clinvar" to generic report flag for Variant Notes (REQUIRES DB MIGRATION)
+
 ## 10/28/24
 * Update RNA Tissue Type choices (REQUIRES DB MIGRATION)
 

diff --git a/deploy/LOCAL_INSTALL.md b/deploy/LOCAL_INSTALL.md
@@ -34,6 +34,7 @@ wget https://raw.githubusercontent.com/broadinstitute/seqr/master/docker-compose
 docker compose up -d seqr   # start up the seqr docker image in the background after also starting other components it depends on (postgres, redis, elasticsearch). This may take 10+ minutes.
 docker compose logs -f seqr  # (optional) continuously print seqr logs to see when it is done starting up or if there are any errors. Type Ctrl-C to exit from the logs. 
 
+docker compose exec seqr python manage.py update_all_reference_data --use-cached-omim  # Initialize reference data
 docker compose exec seqr python manage.py createsuperuser  # create a seqr Admin user 
 
 open http://localhost     # open the seqr landing page in your browser. Log in to seqr using the email and password from the previous step

diff --git a/deploy/docker/seqr/entrypoint.sh b/deploy/docker/seqr/entrypoint.sh
@@ -39,20 +39,11 @@ do
     fi
 done
 
-# init and populate seqrdb unless it already exists
-if ! psql --host "$POSTGRES_SERVICE_HOSTNAME" -U "$POSTGRES_USERNAME" -l | grep seqrdb; then
-    psql --host "$POSTGRES_SERVICE_HOSTNAME" -U "$POSTGRES_USERNAME" -c 'CREATE DATABASE reference_data_db';
-    psql --host "$POSTGRES_SERVICE_HOSTNAME" -U "$POSTGRES_USERNAME" -c 'CREATE DATABASE seqrdb';
-    python -u manage.py migrate
-    python -u manage.py migrate --database=reference_data
-    python -u manage.py loaddata variant_tag_types
-    python -u manage.py update_all_reference_data --use-cached-omim
-else
-    # run any pending migrations if the database already exists
-    python -u manage.py migrate
-    python -u manage.py migrate --database=reference_data
-fi
+# run any pending migrations and load missing data
+python -u manage.py migrate
+python -u manage.py migrate --database=reference_data
 python -u manage.py loaddata variant_searches
+python -u manage.py loaddata variant_tag_types
 
 python -u manage.py check
 

diff --git a/deploy/kubectl_helpers/shell.sh b/deploy/kubectl_helpers/shell.sh
@@ -8,9 +8,4 @@ COMPONENT=$2
 
 POD_NAME=$("${DIR}"/utils/get_pod_name.sh "$@")
 
-case ${COMPONENT} in
-  seqr) CONTAINER=seqr-pod ;;
-  *) CONTAINER=${COMPONENT} ;;
-esac
-
-kubectl exec -it "${POD_NAME}" -c "${CONTAINER}" -- /bin/bash
+kubectl exec -it "${POD_NAME}" -c "${COMPONENT}" -- /bin/bash
diff --git a/deploy/postgres/initdb.sql b/deploy/postgres/initdb.sql
@@ -0,0 +1,2 @@
+CREATE DATABASE seqrdb;
+CREATE DATABASE reference_data_db;
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -8,6 +8,7 @@ services:
       - PGPORT=5433
       - POSTGRES_PASSWORD=docker-compose-postgres-password
     volumes:
+      - ./deploy/postgres/initdb.sql:/docker-entrypoint-initdb.d/initdb.sql
       - ./data/postgres:/var/lib/postgresql/data
     healthcheck:
       test: pg_isready -h postgres -U postgres

diff --git a/hail_search/fixtures/GRCh38/SNV_INDEL/families/WGS/F000002_2.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/SNV_INDEL/families/WGS/F000002_2.ht/.README.txt.crc
diff --git a/hail_search/fixtures/GRCh38/SNV_INDEL/families/WGS/F000002_2.ht/README.txt b/hail_search/fixtures/GRCh38/SNV_INDEL/families/WGS/F000002_2.ht/README.txt
@@ -1,3 +1,3 @@
 This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
   Written with version 0.2.130-bea04d9c79b5
-  Created at 2024/10/02 14:46:35
+  Created at 2024/11/04 13:45:23
diff --git a/...01a-4640-a3e4-aef656269368.idx/.index.crc → ...a18-42d9-9d62-d4fc646610ac.idx/.index.crc b/...01a-4640-a3e4-aef656269368.idx/.index.crc → ...a18-42d9-9d62-d4fc646610ac.idx/.index.crc
diff --git a/...e4-aef656269368.idx/.metadata.json.gz.crc → ...62-d4fc646610ac.idx/.metadata.json.gz.crc b/...e4-aef656269368.idx/.metadata.json.gz.crc → ...62-d4fc646610ac.idx/.metadata.json.gz.crc
diff --git a/...f66-a01a-4640-a3e4-aef656269368.idx/index → ...2a3-0a18-42d9-9d62-d4fc646610ac.idx/index b/...f66-a01a-4640-a3e4-aef656269368.idx/index → ...2a3-0a18-42d9-9d62-d4fc646610ac.idx/index
diff --git a/...40-a3e4-aef656269368.idx/metadata.json.gz → ...d9-9d62-d4fc646610ac.idx/metadata.json.gz b/...40-a3e4-aef656269368.idx/metadata.json.gz → ...d9-9d62-d4fc646610ac.idx/metadata.json.gz
diff --git a/hail_search/fixtures/GRCh38/SNV_INDEL/families/WGS/F000002_2.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SNV_INDEL/families/WGS/F000002_2.ht/rows/.metadata.json.gz.crc
diff --git a/hail_search/fixtures/GRCh38/SNV_INDEL/families/WGS/F000002_2.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh38/SNV_INDEL/families/WGS/F000002_2.ht/rows/metadata.json.gz
diff --git a/...DEL/families/WGS/F000002_2.ht/rows/parts/.part-0-5efaaf66-a01a-4640-a3e4-aef656269368.crc b/...DEL/families/WGS/F000002_2.ht/rows/parts/.part-0-5efaaf66-a01a-4640-a3e4-aef656269368.crc
diff --git a/...DEL/families/WGS/F000002_2.ht/rows/parts/.part-0-d68dd2a3-0a18-42d9-9d62-d4fc646610ac.crc b/...DEL/families/WGS/F000002_2.ht/rows/parts/.part-0-d68dd2a3-0a18-42d9-9d62-d4fc646610ac.crc
diff --git a/...NV_INDEL/families/WGS/F000002_2.ht/rows/parts/part-0-5efaaf66-a01a-4640-a3e4-aef656269368 b/...NV_INDEL/families/WGS/F000002_2.ht/rows/parts/part-0-5efaaf66-a01a-4640-a3e4-aef656269368
diff --git a/...NV_INDEL/families/WGS/F000002_2.ht/rows/parts/part-0-d68dd2a3-0a18-42d9-9d62-d4fc646610ac b/...NV_INDEL/families/WGS/F000002_2.ht/rows/parts/part-0-d68dd2a3-0a18-42d9-9d62-d4fc646610ac
diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py
@@ -472,9 +472,7 @@ def _apply_entry_filters(ht):
     def _filter_single_entries_table(self, ht, project_families, inheritance_filter=None, quality_filter=None, is_merged_ht=False, **kwargs):
         ht, sorted_family_sample_data = self._add_entry_sample_families(ht, project_families, is_merged_ht)
         ht = self._filter_quality(ht, quality_filter, **kwargs)
-        ht, ch_ht = self._filter_inheritance(
-            ht, None, inheritance_filter, sorted_family_sample_data,
-        )
+        ht, ch_ht = self._filter_inheritance(ht, None, inheritance_filter, sorted_family_sample_data)
         ht = self._apply_entry_filters(ht)
         ch_ht = self._apply_entry_filters(ch_ht)
 
@@ -574,7 +572,7 @@ def _get_sample_type(cls, family_index, ht_globals):
 
     def _filter_inheritance(
         self, ht, comp_het_ht, inheritance_filter, sorted_family_sample_data,
-        annotation='family_entries', entries_ht_field='family_entries'
+        annotation='family_entries', entries_ht_field='family_entries', **kwargs
     ):
         any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)
 
@@ -584,14 +582,14 @@ def _filter_inheritance(
             any_valid_entry = lambda x: prev_any_valid_entry(x) & (x.affected_id == AFFECTED_ID)
 
         ht = ht.annotate(**{
-            annotation: ht[entries_ht_field].map(
+            entries_ht_field: ht[entries_ht_field].map(
                 lambda entries: hl.or_missing(entries.any(any_valid_entry), entries)
             )})
 
         if self._has_comp_het_search:
             comp_het_ht = self._annotate_families_inheritance(
                 comp_het_ht if comp_het_ht is not None else ht, COMPOUND_HET, inheritance_filter,
-                sorted_family_sample_data, annotation, entries_ht_field
+                sorted_family_sample_data, annotation, entries_ht_field, **kwargs
             )
 
         if is_any_affected or not (inheritance_filter or self._inheritance_mode):
@@ -600,15 +598,43 @@ def _filter_inheritance(
 
         ht = None if self._inheritance_mode == COMPOUND_HET else self._annotate_families_inheritance(
             ht, self._inheritance_mode, inheritance_filter, sorted_family_sample_data,
-            annotation, entries_ht_field
+            annotation, entries_ht_field, **kwargs
         )
 
         return ht, comp_het_ht
 
     def _annotate_families_inheritance(
         self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data,
-        annotation, entries_ht_field,
+        annotation, entries_ht_field, family_passes_inheritance_filter = None
     ):
+        if not family_passes_inheritance_filter:
+            family_passes_inheritance_filter = self._get_family_passes_inheritance_filter
+
+        entry_indices_by_gt = self._get_entry_indices_by_gt_map(
+            inheritance_filter, inheritance_mode, sorted_family_sample_data
+        )
+
+        for genotype, entry_indices in entry_indices_by_gt.items():
+            if not entry_indices:
+                continue
+            entry_indices = hl.dict(entry_indices)
+            ht = ht.annotate(**{
+                annotation: hl.enumerate(ht[entries_ht_field]).starmap(
+                    lambda family_idx, family_samples: family_passes_inheritance_filter(
+                        entry_indices, family_idx, genotype, family_samples, ht, annotation
+                    )
+                )
+            })
+
+        return ht
+
+    def _get_family_passes_inheritance_filter(self, entry_indices, family_idx, genotype, family_samples, *args):
+        return hl.or_missing(
+            ~entry_indices.contains(family_idx) | entry_indices[family_idx].all(
+                lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
+        ), family_samples)
+
+    def _get_entry_indices_by_gt_map(self, inheritance_filter, inheritance_mode, sorted_family_sample_data):
         individual_genotype_filter = (inheritance_filter or {}).get('genotype')
 
         # Create a mapping of genotypes to check against a list of samples for a family
@@ -630,21 +656,7 @@ def _annotate_families_inheritance(
             ]
             self.max_unaffected_samples = max(family_unaffected_counts) if family_unaffected_counts else 0
 
-        for genotype, entry_indices in entry_indices_by_gt.items():
-            if not entry_indices:
-                continue
-            entry_indices = hl.dict(entry_indices)
-            ht = ht.annotate(**{
-                annotation: hl.enumerate(ht[entries_ht_field]).starmap(
-                    lambda family_i, family_samples: hl.or_missing(
-                        ~entry_indices.contains(family_i) | entry_indices[family_i].all(
-                            lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
-                        ), family_samples,
-                    ),
-                )
-            })
-
-        return ht
+        return entry_indices_by_gt
 
     def _get_family_passes_quality_filter(self, quality_filter, ht, **kwargs):
         quality_filter = quality_filter or {}

diff --git a/hail_search/queries/mito.py b/hail_search/queries/mito.py
@@ -71,9 +71,6 @@ class MitoHailTableQuery(BaseHailTableQuery):
     CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS + ['rsid']
     MITO_ANNOTATION_FIELDS = {
         'commonLowHeteroplasmy': lambda r: r.common_low_heteroplasmy,
-        'highConstraintRegion': (
-            lambda r: r.high_constraint_region if hasattr(r, 'high_constraint_region') else r.high_constraint_region_mito
-        ),
         'mitomapPathogenic': lambda r: r.mitomap.pathogenic,
     }
     BASE_ANNOTATION_FIELDS = {
@@ -205,37 +202,62 @@ def _filter_entries_ht_both_sample_types(
             )
 
         ch_ht = None
-        family_guid_idx_map = defaultdict(dict)
+        family_idx_map = defaultdict(dict)
         for sample_type, sorted_family_sample_data in sample_types:
+            ht = self._annotate_initial_passes_inheritance(ht, sample_type)
+            ch_ht = self._annotate_initial_passes_inheritance(ch_ht, sample_type)
             ht, ch_ht = self._filter_inheritance(
                 ht, ch_ht, inheritance_filter, sorted_family_sample_data,
-                annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field
+                annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field,
+                family_passes_inheritance_filter=self._get_family_passes_inheritance_filter_both_sample_types
             )
             for family_idx, samples in enumerate(sorted_family_sample_data):
                 family_guid = samples[0]['familyGuid']
-                family_guid_idx_map[family_guid][sample_type.value] = family_idx
+                family_idx_map[family_guid][sample_type.value] = family_idx
 
-        family_idx_map = hl.dict(family_guid_idx_map)
+        family_idx_map = hl.dict(family_idx_map)
         ht = self._apply_multi_sample_type_entry_filters(ht, family_idx_map)
         ch_ht = self._apply_multi_sample_type_entry_filters(ch_ht, family_idx_map)
         return ht, ch_ht
 
+    @staticmethod
+    def _annotate_initial_passes_inheritance(ht, sample_type):
+        if ht is None:
+            return ht
+
+        return ht.annotate(**{
+            sample_type.passes_inheritance_field: ht[sample_type.family_entries_field].map(
+                lambda family_entries: hl.array(
+                    hl.range(0, hl.len(family_entries)).map(lambda _: True)
+                )
+            )})
+
+    def _get_family_passes_inheritance_filter_both_sample_types(
+        self, entry_indices, family_idx, genotype, family_samples, ht, annotation
+    ):
+        return hl.enumerate(ht[annotation][family_idx]).starmap(
+            lambda sample_idx, passes: (hl.case()
+                .when(~entry_indices.get(family_idx).contains(sample_idx), passes)
+                .when(~self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_idx].GT), False)
+                .default(passes))
+        )
+
     def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
         if ht is None:
             return ht
 
-        # Keep family from both sample types if either passes quality AND inheritance
         for sample_type in SampleType:
             ht = ht.annotate(**{
                 sample_type.family_entries_field: hl.enumerate(ht[sample_type.family_entries_field]).starmap(
-                    lambda i, family_samples: hl.or_missing(
-                        hl.bind(
-                            lambda other_sample_type_idx: (
-                                self._family_has_valid_sample_type_entries(ht, sample_type, i) |
-                                self._family_has_valid_sample_type_entries(ht, sample_type.other_sample_type, other_sample_type_idx)
-                            ),
-                            family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
-                       ), family_samples)
+                    lambda family_idx, family_samples: hl.or_missing(
+                        hl.bind(lambda other_sample_type_family_idx: ((
+                            self._family_has_valid_quality(ht, sample_type, family_idx) |
+                            self._family_has_valid_quality(ht, sample_type.other_sample_type, other_sample_type_family_idx)
+                             ) &
+                            self._family_has_valid_inheritance(ht, sample_type, family_idx, other_sample_type_family_idx) &
+                            self._family_has_valid_inheritance(ht, sample_type.other_sample_type, other_sample_type_family_idx, family_idx)
+                        ), family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
+                    ), family_samples)
                 )})
 
         # Merge family entries and filters from both sample types
@@ -253,13 +275,29 @@ def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
         return ht.filter(ht.family_entries.any(hl.is_defined))
 
     @staticmethod
-    def _family_has_valid_sample_type_entries(ht, sample_type, sample_type_family_idx):
-        # Note: This logic does not sufficiently handle case 2 here https://docs.google.com/presentation/d/1hqDV8ulhviUcR5C4PtNUqkCLXKDsc6pccgFVlFmWUAU/edit?usp=sharing
-        # and will need to be changed to support it - https://github.com/broadinstitute/seqr/issues/4403
+    def _family_has_valid_quality(ht, sample_type, sample_type_family_idx):
         return (
             hl.is_defined(sample_type_family_idx) &
-            hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx]) &
-            hl.is_defined(ht[sample_type.passes_inheritance_field][sample_type_family_idx])
+            hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx])
+        )
+
+    @staticmethod
+    def _family_has_valid_inheritance(ht, sample_type, family_idx, other_sample_type_family_idx):
+        return hl.bind(
+            lambda sample_type_fail_samples, other_sample_type_pass_samples: (
+                sample_type_fail_samples.all(other_sample_type_pass_samples.contains)
+            ), hl.enumerate(ht[sample_type.family_entries_field][family_idx]).starmap(
+                lambda sample_idx, sample: hl.or_missing(
+                    ~ht[sample_type.passes_inheritance_field][family_idx][sample_idx],
+                    sample['sampleId'],
+                )
+            ).filter(hl.is_defined),
+            hl.enumerate(ht[sample_type.other_sample_type.family_entries_field][other_sample_type_family_idx]).starmap(
+                lambda sample_idx, sample: hl.or_missing(
+                    ht[sample_type.other_sample_type.passes_inheritance_field][other_sample_type_family_idx][sample_idx],
+                    sample['sampleId'],
+                )
+            ).filter(hl.is_defined),
         )
 
     def _get_sample_genotype(self, samples, r=None, include_genotype_overrides=False, select_fields=None, **kwargs):

diff --git a/hail_search/queries/multi_data_types.py b/hail_search/queries/multi_data_types.py
@@ -71,7 +71,13 @@ def _filter_data_type_comp_hets(self, variant_query, variant_families, sv_query)
                 for s in variant_samples_by_family[f]
             ] for f in overlapped_families])
             sv_ht = sv_ht.annotate(family_entries=hl.enumerate(sv_sample_indices).starmap(
-                lambda family_i, indices: indices.map(lambda sample_i: sv_ht.family_entries[family_i][sample_i])
+                lambda family_i, indices: hl.bind(
+                    lambda family_entry: hl.or_missing(
+                        hl.is_defined(family_entry),
+                        indices.map(lambda sample_i: family_entry[sample_i]),
+                    ),
+                    sv_ht.family_entries[family_i],
+                )
             ))
 
         variant_ch_ht = variant_ht.group_by('gene_ids').aggregate(v1=hl.agg.collect(variant_ht.row))

diff --git a/hail_search/requirements-test.txt b/hail_search/requirements-test.txt
@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.11
     # via pytest-aiohttp
 aiosignal==1.3.1
     # via aiohttp
@@ -34,6 +34,8 @@ packaging==23.1
     # via pytest
 pluggy==1.2.0
     # via pytest
+propcache==0.2.0
+    # via yarl
 pytest==7.4.0
     # via
     #   pytest-aiohttp
@@ -44,5 +46,5 @@ pytest-asyncio==0.21.0
     # via pytest-aiohttp
 tomli==2.0.1
     # via pytest
-yarl==1.9.2
+yarl==1.17.2
     # via aiohttp
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		CREATE DATABASE seqrdb;
		CREATE DATABASE reference_data_db;