Merge pull request #4561 from broadinstitute/dev

Dev
broadinstitute · Dec 20, 2024 · 2bc28c2 · 2bc28c2
2 parents 8383240 + 32327f0
commit 2bc28c2
Show file tree

Hide file tree

Showing 48 changed files with 472 additions and 119 deletions.
diff --git a/.github/workflows/vlm-unit-tests.yaml b/.github/workflows/vlm-unit-tests.yaml
@@ -10,11 +10,13 @@ on:
     paths:
       - 'vlm/**'
       - '.github/workflows/*vlm*.yaml'
+      - 'hail_search/fixtures/*'
   pull_request:
     types: [opened, synchronize, reopened]
     paths:
       - 'vlm/**'
       - '.github/workflows/*vlm*.yaml'
+      - 'hail_search/fixtures/*'
 
 jobs:
   vlm:
@@ -29,5 +31,10 @@ jobs:
           pip install -r hail_search/requirements-test.txt
       - name: Run coverage tests
         run: |
+          export VLM_DATA_DIR=./hail_search/fixtures
+          export SEQR_BASE_URL=https://test-seqr.org/
+          export NODE_ID=TestVLM
+          export MACHINE_MEM=24
+          export JAVA_OPTS_XSS=16M
           coverage run --source="./vlm" --omit="./vlm/__main__.py" -m pytest vlm/
-          coverage report --fail-under=90
+          coverage report --fail-under=95
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.README.txt.crc b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.README.txt.crc
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/.metadata.json.gz.crc
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/README.txt b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/README.txt
@@ -1,3 +1,3 @@
 This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
-  Written with version 0.2.133-4c60fddb171a
-  Created at 2024/12/04 13:07:33
+  Written with version 0.2.128-eead8100a1c1
+  Created at 2024/12/05 16:23:46
diff --git a/...SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.index.crc b/...SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.index.crc
diff --git a/...nnotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.metadata.json.gz.crc b/...nnotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/.metadata.json.gz.crc
diff --git a/...Ch37/SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/index b/...Ch37/SNV_INDEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/index
diff --git a/...DEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/metadata.json.gz b/...DEL/annotations.ht/index/part-0-32c79149-c8dd-4b54-8db9-097b88a68456.idx/metadata.json.gz
diff --git a/...SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.index.crc b/...SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.index.crc
diff --git a/...nnotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.metadata.json.gz.crc b/...nnotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/.metadata.json.gz.crc
diff --git a/...Ch37/SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/index b/...Ch37/SNV_INDEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/index
diff --git a/...DEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/metadata.json.gz b/...DEL/annotations.ht/index/part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.idx/metadata.json.gz
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/metadata.json.gz b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/metadata.json.gz
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/.metadata.json.gz.crc
diff --git a/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh37/SNV_INDEL/annotations.ht/rows/metadata.json.gz
diff --git a/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-32c79149-c8dd-4b54-8db9-097b88a68456.crc b/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-32c79149-c8dd-4b54-8db9-097b88a68456.crc
diff --git a/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.crc b/...Ch37/SNV_INDEL/annotations.ht/rows/parts/.part-0-8dc59bd2-da29-46c0-badd-a77a850af2d4.crc
diff --git a/...rt-0-32c79149-c8dd-4b54-8db9-097b88a68456 → ...rt-0-8dc59bd2-da29-46c0-badd-a77a850af2d4 b/...rt-0-32c79149-c8dd-4b54-8db9-097b88a68456 → ...rt-0-8dc59bd2-da29-46c0-badd-a77a850af2d4
diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py
@@ -697,7 +697,7 @@ def passes_quality_field(gt):
     def _passes_vcf_filters(gt):
         return hl.is_missing(gt.filters) | (gt.filters.length() < 1)
 
-    def _parse_variant_keys(self, variant_keys=None, **kwargs):
+    def _parse_variant_keys(self, variant_keys):
         return [hl.struct(**{self.KEY_FIELD[0]: key}) for key in (variant_keys or [])]
 
     def _prefilter_entries_table(self, ht, **kwargs):
@@ -728,12 +728,15 @@ def _filter_rs_ids(self, ht, rs_ids):
         rs_id_set = hl.set(rs_ids)
         return ht.filter(rs_id_set.contains(ht.rsid))
 
-    def _parse_intervals(self, intervals, gene_ids=None, **kwargs):
-        parsed_variant_keys = self._parse_variant_keys(**kwargs)
+    def _parse_intervals(self, intervals, gene_ids=None, variant_keys=None, variant_ids=None, **kwargs):
+        parsed_variant_keys = self._parse_variant_keys(variant_keys)
         if parsed_variant_keys:
             self._load_table_kwargs['variant_ht'] = hl.Table.parallelize(parsed_variant_keys).key_by(*self.KEY_FIELD)
             return intervals
 
+        if variant_ids:
+            intervals = [(chrom, pos, pos+1) for chrom, pos, _, _ in variant_ids]
+
         is_x_linked = self._inheritance_mode == X_LINKED_RECESSIVE
         if not (intervals or is_x_linked):
             return intervals
@@ -1216,34 +1219,36 @@ def gene_counts(self):
             ht.gene_ids, hl.struct(total=hl.agg.count(), families=hl.agg.counter(ht.families))
         ))
 
-    def lookup_variants(self, variant_ids, include_project_data=False, **kwargs):
+    def _filter_variant_ids(self, ht, variant_ids):
+        return ht
+
+    def lookup_variants(self, variant_ids):
         self._parse_intervals(intervals=None, variant_ids=variant_ids, variant_keys=variant_ids)
         ht = self._read_table('annotations.ht', drop_globals=['versions'])
+        ht = self._filter_variant_ids(ht, variant_ids)
         ht = ht.filter(hl.is_defined(ht[XPOS]))
 
-        annotation_fields = self.annotation_fields(include_genotype_overrides=False)
-        include_sample_annotations = False
-        if include_project_data:
-            ht, include_sample_annotations = self._add_project_lookup_data(ht, annotation_fields, **kwargs)
-        if not include_sample_annotations:
-            annotation_fields = {
-                k: v for k, v in annotation_fields.items()
-                if k not in {FAMILY_GUID_FIELD, GENOTYPES_FIELD}
-            }
-
+        annotation_fields = {
+            k: v for k, v in self.annotation_fields(include_genotype_overrides=False).items()
+            if k not in {FAMILY_GUID_FIELD, GENOTYPES_FIELD}
+        }
         formatted = self._format_results(ht.key_by(), annotation_fields=annotation_fields, include_genotype_overrides=False)
 
         return formatted.aggregate(hl.agg.take(formatted.row, len(variant_ids)))
 
-    def _add_project_lookup_data(self, ht, annotation_fields, include_sample_annotations=False, project_samples=None, **kwargs):
-        if project_samples:
-            projects_ht, _ = self._import_and_filter_multiple_project_hts(project_samples, n_partitions=1)
-            ht = ht.annotate(**projects_ht[ht.key])
+    def _import_variant_projects_ht(self, variant_id, project_samples=None, **kwargs):
+        projects_ht, _ = self._import_and_filter_multiple_project_hts(project_samples, n_partitions=1)
+        return self._filter_variant_ids(projects_ht, [variant_id]).key_by()
 
-        return ht, include_sample_annotations
+    def _get_variant_project_data(self, variant_id, **kwargs):
+        projects_ht = self._import_variant_projects_ht(variant_id, **kwargs)
+        project_data = projects_ht.aggregate(hl.agg.take(projects_ht.row, 1))
+        return project_data[0] if project_data else {}
 
     def lookup_variant(self, variant_id, **kwargs):
-        variants = self.lookup_variants([variant_id], include_project_data=True, **kwargs)
+        variants = self.lookup_variants([variant_id])
         if not variants:
             raise HTTPNotFound()
-        return dict(variants[0])
+        variant = dict(variants[0])
+        variant.update(self._get_variant_project_data(variant_id, **kwargs))
+        return variant
diff --git a/hail_search/queries/mito.py b/hail_search/queries/mito.py
@@ -394,30 +394,25 @@ def _filter_variant_ids(self, ht, variant_ids):
             variant_id_q = ht.alleles == [variant_ids[0][2], variant_ids[0][3]]
         else:
             variant_id_q = hl.any([
-                (ht.locus == hl.locus(chrom, pos, reference_genome=self.GENOME_VERSION)) &
+                (ht.locus == hl.locus(f'chr{chrom}' if self._should_add_chr_prefix() else chrom, pos, reference_genome=self.GENOME_VERSION)) &
                 (ht.alleles == [ref, alt])
                 for chrom, pos, ref, alt in variant_ids
             ])
         return ht.filter(variant_id_q)
 
-    def _parse_variant_keys(self, variant_ids=None, **kwargs):
-        if not variant_ids:
-            return variant_ids
+    def _parse_variant_keys(self, variant_keys):
+        return None
 
-        return [
-            hl.struct(
-                locus=hl.locus(f'chr{chrom}' if self._should_add_chr_prefix() else chrom, pos, reference_genome=self.GENOME_VERSION),
-                alleles=[ref, alt],
-            ) for chrom, pos, ref, alt in variant_ids
-        ]
-
-    def _prefilter_entries_table(self, ht, parsed_intervals=None, exclude_intervals=False, **kwargs):
+    def _prefilter_entries_table(self, ht, parsed_intervals=None, exclude_intervals=False, variant_ids=None, **kwargs):
         num_intervals = len(parsed_intervals or [])
         if exclude_intervals and parsed_intervals:
             ht = hl.filter_intervals(ht, parsed_intervals, keep=False)
         elif num_intervals >= MAX_LOAD_INTERVALS:
             ht = hl.filter_intervals(ht, parsed_intervals)
 
+        if variant_ids:
+            ht = self._filter_variant_ids(ht, variant_ids)
+
         if '_n_partitions' not in self._load_table_kwargs and num_intervals > self._n_partitions:
             ht = ht.naive_coalesce(self._n_partitions)
 
@@ -513,10 +508,11 @@ def _omim_sort(cls, r, omim_gene_set):
     def _gene_rank_sort(cls, r, gene_ranks):
         return [gene_ranks.get(r.selected_transcript.gene_id)] + super()._gene_rank_sort(r, gene_ranks)
 
-    def _add_project_lookup_data(self, ht, annotation_fields, *args, **kwargs):
+    def _import_variant_projects_ht(self, variant_id, *args, **kwargs):
         # Get all the project-families for the looked up variant formatted as a dict of dicts:
         # {<project_guid>: {<sample_type>: {<family_guid>: True}, <sample_type_2>: {<family_guid_2>: True}}, <project_guid_2>: ...}
         lookup_ht = self._read_table('lookup.ht', skip_missing_field='project_stats')
+        lookup_ht = self._filter_variant_ids(lookup_ht, [variant_id])
         if lookup_ht is None:
             raise HTTPNotFound()
         variant_projects = lookup_ht.aggregate(hl.agg.take(
@@ -536,22 +532,20 @@ def _add_project_lookup_data(self, ht, annotation_fields, *args, **kwargs):
                 lambda project_data: hl.dict(project_data.starmap(
                     lambda project_key, families: (project_key[1], families)
             )))), 1)
-        )[0]
+        )
 
         # Variant can be present in the lookup table with only ref calls, so is still not present in any projects
-        if not variant_projects:
+        if not (variant_projects and variant_projects[0]):
             raise HTTPNotFound()
+        variant_projects = variant_projects[0]
 
         self._has_both_sample_types = True
-        annotation_fields.update({
-            'familyGenotypes': lambda r: hl.dict(r.family_entries.map(
-                lambda entries: (entries.first().familyGuid, self._get_sample_genotype(entries.filter(hl.is_defined)))
-            )),
-        })
-
         logger.info(f'Looking up {self.DATA_TYPE} variant in {len(variant_projects)} projects')
 
-        return super()._add_project_lookup_data(ht, annotation_fields, project_samples=variant_projects, **kwargs)
+        projects_ht = super()._import_variant_projects_ht(variant_id, project_samples=variant_projects)
+        return projects_ht.select(familyGenotypes=hl.dict(projects_ht.family_entries.map(
+            lambda entries: (entries.first().familyGuid, self._get_sample_genotype(entries.filter(hl.is_defined)))
+        )))
 
     @staticmethod
     def _stat_has_non_ref(s):

diff --git a/hail_search/queries/sv.py b/hail_search/queries/sv.py
@@ -1,7 +1,8 @@
 import hail as hl
 
 
-from hail_search.constants import CONSEQUENCE_SORT, NEW_SV_FIELD, STRUCTURAL_ANNOTATION_FIELD
+from hail_search.constants import CONSEQUENCE_SORT, NEW_SV_FIELD, STRUCTURAL_ANNOTATION_FIELD, FAMILY_GUID_FIELD, \
+    GENOTYPES_FIELD
 from hail_search.queries.base import BaseHailTableQuery, PredictionPath
 
 
@@ -132,8 +133,9 @@ def _additional_annotation_fields(self):
             )),
         }
 
-    def _add_project_lookup_data(self, *args, sample_data=None, **kwargs):
+    def _import_variant_projects_ht(self, variant_id, sample_data=None, **kwargs):
         project_samples, _ = self._parse_sample_data(sample_data)
-        return super()._add_project_lookup_data(
-            *args, include_sample_annotations=True, project_samples=project_samples, **kwargs,
-        )
+        projects_ht = super()._import_variant_projects_ht(variant_id, project_samples=project_samples)
+
+        annotation_fields = self.annotation_fields(include_genotype_overrides=False)
+        return projects_ht.select(**{k: annotation_fields[k](projects_ht) for k in [FAMILY_GUID_FIELD, GENOTYPES_FIELD]})
diff --git a/hail_search/test_search.py b/hail_search/test_search.py
@@ -634,7 +634,7 @@ async def test_location_search(self):
 
         await self._assert_expected_search(
             [SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT],  omit_data_type='SV_WES',
-            intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][:1]
+            intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][1:]
         )
 
         await self._assert_expected_search(
@@ -900,7 +900,7 @@ async def test_annotations_filter(self):
 
         await self._assert_expected_search(
             [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT],
-            gene_ids=LOCATION_SEARCH['gene_ids'][:1], annotations=annotations, omit_data_type='SV_WES',
+            gene_ids=LOCATION_SEARCH['gene_ids'][1:], annotations=annotations, omit_data_type='SV_WES',
         )
 
         annotations['other'] = annotations['other'][:1]

diff --git a/hail_search/test_utils.py b/hail_search/test_utils.py
@@ -1034,7 +1034,7 @@
 }
 
 LOCATION_SEARCH = {
-    'gene_ids': ['ENSG00000177000', 'ENSG00000097046'],
+    'gene_ids': ['ENSG00000097046', 'ENSG00000177000'],
     'intervals': [['2', 1234, 5678], ['7', 1, 11100], ['1', 11785723, 11806455], ['1', 91500851, 91525764]],
 }
 EXCLUDE_LOCATION_SEARCH = {'intervals': LOCATION_SEARCH['intervals'], 'exclude_intervals': True}

diff --git a/reference_data/management/commands/utils/download_utils.py b/reference_data/management/commands/utils/download_utils.py
@@ -44,4 +44,3 @@ def _get_remote_file_size(url):
     except Exception:
         # file size not yet implemented for FTP and other protocols, and HEAD not supported for all http requests
         return 0
-
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -22,7 +22,7 @@ click==8.1.3
     # via pip-tools
 coverage==5.1
     # via -r requirements-dev.in
-django==4.2.16
+django==4.2.17
     # via
     #   -c requirements.txt
     #   django-appconf
@@ -83,4 +83,3 @@ wheel==0.38.4
 # The following packages are considered to be unsafe in a requirements file:
 # pip
 # setuptools
-zipp>=3.19.1 # not directly required, pinned by Snyk to avoid a vulnerability
diff --git a/requirements.txt b/requirements.txt
@@ -26,7 +26,7 @@ defusedxml==0.7.1
     # via
     #   python3-openid
     #   social-auth-core
-django==4.2.16
+django==4.2.17
     # via
     #   -r requirements.in
     #   django-anymail
@@ -182,3 +182,4 @@ urllib3==1.26.19
     #   requests
 whitenoise==6.3.0
     # via -r requirements.in
+zipp>=3.19.1 # not directly required, pinned by Snyk to avoid a vulnerability
diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py
@@ -166,28 +166,28 @@ def _load_new_samples(cls, metadata_path, genome_version, dataset_type, run_vers
         failed_families_by_guid = {f['guid']: f for f in Family.objects.filter(
             guid__in={family for families in failed_family_samples.values() for family in families}
         ).values('guid', 'family_id', 'project__name')}
+        failures_by_project_check = defaultdict(lambda: defaultdict(list))
         for check, check_failures in failed_family_samples.items():
-            failures_by_project = defaultdict(list)
             for family_guid, failure_data in check_failures.items():
                 family = failed_families_by_guid[family_guid]
-                failures_by_project[family['project__name']].append(
+                failures_by_project_check[family['project__name']][check].append(
                     f'- {family["family_id"]}: {"; ".join(failure_data["reasons"])}'
                 )
-            for project, failures in failures_by_project.items():
+        for project, failures_by_check in failures_by_project_check.items():
+            messages = [f'Encountered the following errors loading {project}:']
+            for check, failures in failures_by_check.items():
                 summary = '\n'.join(sorted(failures))
-                split_pdos = split_project_pdos.get(project)
-                if split_pdos:
-                    summary += f'\n\nSkipped samples in this project have been moved to {", ".join(split_pdos)}'
-
-                relatedness_check_message = (
-                    f'\nRelatedness check results: {relatedness_check_file_path}'
-                    if (relatedness_check_file_path and check == RELATEDNESS_CHECK_NAME)
-                    else ''
-                )
-                safe_post_to_slack(
-                    SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL,
-                    f'The following {len(failures)} families failed {check.replace("_", " ")} in {project}:\n{summary}{relatedness_check_message}'
-                )
+                messages.append(f"The following {len(failures)} families failed {check.replace('_', ' ')}:\n{summary}")
+                if check == RELATEDNESS_CHECK_NAME and relatedness_check_file_path:
+                    downloadable_link = f'https://storage.cloud.google.com/{relatedness_check_file_path[5:]}'
+                    messages.append(f'Relatedness check results: {downloadable_link}')
+
+            split_pdos = split_project_pdos.get(project)
+            if split_pdos:
+                messages.append(f'Skipped samples in this project have been moved to {", ".join(split_pdos)}')
+            safe_post_to_slack(
+                SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL, '\n\n'.join(messages),
+            )
 
         # Reload saved variant JSON
         updated_variants_by_id = update_projects_saved_variant_json(
Original file line number	Diff line number	Diff line change
Expand Up		@@ -44,4 +44,3 @@ def _get_remote_file_size(url):
		except Exception:
		# file size not yet implemented for FTP and other protocols, and HEAD not supported for all http requests
		return 0