Skip to content

Commit

Permalink
Merge pull request #4561 from broadinstitute/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
hanars authored Dec 20, 2024
2 parents 8383240 + 32327f0 commit 2bc28c2
Show file tree
Hide file tree
Showing 48 changed files with 472 additions and 119 deletions.
9 changes: 8 additions & 1 deletion .github/workflows/vlm-unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@ on:
paths:
- 'vlm/**'
- '.github/workflows/*vlm*.yaml'
- 'hail_search/fixtures/*'
pull_request:
types: [opened, synchronize, reopened]
paths:
- 'vlm/**'
- '.github/workflows/*vlm*.yaml'
- 'hail_search/fixtures/*'

jobs:
vlm:
Expand All @@ -29,5 +31,10 @@ jobs:
pip install -r hail_search/requirements-test.txt
- name: Run coverage tests
run: |
export VLM_DATA_DIR=./hail_search/fixtures
export SEQR_BASE_URL=https://test-seqr.org/
export NODE_ID=TestVLM
export MACHINE_MEM=24
export JAVA_OPTS_XSS=16M
coverage run --source="./vlm" --omit="./vlm/__main__.py" -m pytest vlm/
coverage report --fail-under=90
coverage report --fail-under=95
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.133-4c60fddb171a
Created at 2024/12/04 13:07:33
Written with version 0.2.128-eead8100a1c1
Created at 2024/12/05 16:23:46
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
47 changes: 26 additions & 21 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,7 +697,7 @@ def passes_quality_field(gt):
# Build the "passes VCF filters" condition for a genotype entry `gt`:
# true when `gt.filters` is missing OR has fewer than one element (is empty).
# The two checks are combined with `|`, so a missing `filters` value counts as passing.
# NOTE(review): presumably `gt` is a Hail struct expression and the result is a
# Hail boolean expression rather than a Python bool - confirm against the callers.
def _passes_vcf_filters(gt):
return hl.is_missing(gt.filters) | (gt.filters.length() < 1)

def _parse_variant_keys(self, variant_keys=None, **kwargs):
def _parse_variant_keys(self, variant_keys):
return [hl.struct(**{self.KEY_FIELD[0]: key}) for key in (variant_keys or [])]

def _prefilter_entries_table(self, ht, **kwargs):
Expand Down Expand Up @@ -728,12 +728,15 @@ def _filter_rs_ids(self, ht, rs_ids):
rs_id_set = hl.set(rs_ids)
return ht.filter(rs_id_set.contains(ht.rsid))

def _parse_intervals(self, intervals, gene_ids=None, **kwargs):
parsed_variant_keys = self._parse_variant_keys(**kwargs)
def _parse_intervals(self, intervals, gene_ids=None, variant_keys=None, variant_ids=None, **kwargs):
parsed_variant_keys = self._parse_variant_keys(variant_keys)
if parsed_variant_keys:
self._load_table_kwargs['variant_ht'] = hl.Table.parallelize(parsed_variant_keys).key_by(*self.KEY_FIELD)
return intervals

if variant_ids:
intervals = [(chrom, pos, pos+1) for chrom, pos, _, _ in variant_ids]

is_x_linked = self._inheritance_mode == X_LINKED_RECESSIVE
if not (intervals or is_x_linked):
return intervals
Expand Down Expand Up @@ -1216,34 +1219,36 @@ def gene_counts(self):
ht.gene_ids, hl.struct(total=hl.agg.count(), families=hl.agg.counter(ht.families))
))

def lookup_variants(self, variant_ids, include_project_data=False, **kwargs):
# No-op variant-ID filter: returns `ht` unchanged and ignores `variant_ids`.
# In this diff it is called by `lookup_variants` and `_import_variant_projects_ht`.
# The mito.py hunk shows a same-named method that filters `ht` on locus and alleles;
# NOTE(review): presumably that is a subclass override of this hook - confirm.
def _filter_variant_ids(self, ht, variant_ids):
return ht

def lookup_variants(self, variant_ids):
self._parse_intervals(intervals=None, variant_ids=variant_ids, variant_keys=variant_ids)
ht = self._read_table('annotations.ht', drop_globals=['versions'])
ht = self._filter_variant_ids(ht, variant_ids)
ht = ht.filter(hl.is_defined(ht[XPOS]))

annotation_fields = self.annotation_fields(include_genotype_overrides=False)
include_sample_annotations = False
if include_project_data:
ht, include_sample_annotations = self._add_project_lookup_data(ht, annotation_fields, **kwargs)
if not include_sample_annotations:
annotation_fields = {
k: v for k, v in annotation_fields.items()
if k not in {FAMILY_GUID_FIELD, GENOTYPES_FIELD}
}

annotation_fields = {
k: v for k, v in self.annotation_fields(include_genotype_overrides=False).items()
if k not in {FAMILY_GUID_FIELD, GENOTYPES_FIELD}
}
formatted = self._format_results(ht.key_by(), annotation_fields=annotation_fields, include_genotype_overrides=False)

return formatted.aggregate(hl.agg.take(formatted.row, len(variant_ids)))

def _add_project_lookup_data(self, ht, annotation_fields, include_sample_annotations=False, project_samples=None, **kwargs):
if project_samples:
projects_ht, _ = self._import_and_filter_multiple_project_hts(project_samples, n_partitions=1)
ht = ht.annotate(**projects_ht[ht.key])
def _import_variant_projects_ht(self, variant_id, project_samples=None, **kwargs):
projects_ht, _ = self._import_and_filter_multiple_project_hts(project_samples, n_partitions=1)
return self._filter_variant_ids(projects_ht, [variant_id]).key_by()

return ht, include_sample_annotations
# Return the project-level row for a single variant, or an empty dict if there is none.
# `variant_id` and any extra `**kwargs` are forwarded to `_import_variant_projects_ht`,
# which supplies the table to read from.
def _get_variant_project_data(self, variant_id, **kwargs):
projects_ht = self._import_variant_projects_ht(variant_id, **kwargs)
# Take at most one row from the table; the aggregation result is indexed like a list below.
project_data = projects_ht.aggregate(hl.agg.take(projects_ht.row, 1))
# An empty result means no project data was found, so fall back to {}.
return project_data[0] if project_data else {}

def lookup_variant(self, variant_id, **kwargs):
variants = self.lookup_variants([variant_id], include_project_data=True, **kwargs)
variants = self.lookup_variants([variant_id])
if not variants:
raise HTTPNotFound()
return dict(variants[0])
variant = dict(variants[0])
variant.update(self._get_variant_project_data(variant_id, **kwargs))
return variant
38 changes: 16 additions & 22 deletions hail_search/queries/mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,30 +394,25 @@ def _filter_variant_ids(self, ht, variant_ids):
variant_id_q = ht.alleles == [variant_ids[0][2], variant_ids[0][3]]
else:
variant_id_q = hl.any([
(ht.locus == hl.locus(chrom, pos, reference_genome=self.GENOME_VERSION)) &
(ht.locus == hl.locus(f'chr{chrom}' if self._should_add_chr_prefix() else chrom, pos, reference_genome=self.GENOME_VERSION)) &
(ht.alleles == [ref, alt])
for chrom, pos, ref, alt in variant_ids
])
return ht.filter(variant_id_q)

def _parse_variant_keys(self, variant_ids=None, **kwargs):
if not variant_ids:
return variant_ids
def _parse_variant_keys(self, variant_keys):
return None

return [
hl.struct(
locus=hl.locus(f'chr{chrom}' if self._should_add_chr_prefix() else chrom, pos, reference_genome=self.GENOME_VERSION),
alleles=[ref, alt],
) for chrom, pos, ref, alt in variant_ids
]

def _prefilter_entries_table(self, ht, parsed_intervals=None, exclude_intervals=False, **kwargs):
def _prefilter_entries_table(self, ht, parsed_intervals=None, exclude_intervals=False, variant_ids=None, **kwargs):
num_intervals = len(parsed_intervals or [])
if exclude_intervals and parsed_intervals:
ht = hl.filter_intervals(ht, parsed_intervals, keep=False)
elif num_intervals >= MAX_LOAD_INTERVALS:
ht = hl.filter_intervals(ht, parsed_intervals)

if variant_ids:
ht = self._filter_variant_ids(ht, variant_ids)

if '_n_partitions' not in self._load_table_kwargs and num_intervals > self._n_partitions:
ht = ht.naive_coalesce(self._n_partitions)

Expand Down Expand Up @@ -513,10 +508,11 @@ def _omim_sort(cls, r, omim_gene_set):
# Prepend one element to the parent class's `_gene_rank_sort` result for row `r`:
# the value of `gene_ranks.get(...)` for the gene_id of the row's selected transcript.
# NOTE(review): presumably the returned list is used as a sort key, so the selected
# transcript's gene rank takes precedence over the inherited ordering - confirm.
def _gene_rank_sort(cls, r, gene_ranks):
return [gene_ranks.get(r.selected_transcript.gene_id)] + super()._gene_rank_sort(r, gene_ranks)

def _add_project_lookup_data(self, ht, annotation_fields, *args, **kwargs):
def _import_variant_projects_ht(self, variant_id, *args, **kwargs):
# Get all the project-families for the looked up variant formatted as a dict of dicts:
# {<project_guid>: {<sample_type>: {<family_guid>: True}, <sample_type_2>: {<family_guid_2>: True}}, <project_guid_2>: ...}
lookup_ht = self._read_table('lookup.ht', skip_missing_field='project_stats')
lookup_ht = self._filter_variant_ids(lookup_ht, [variant_id])
if lookup_ht is None:
raise HTTPNotFound()
variant_projects = lookup_ht.aggregate(hl.agg.take(
Expand All @@ -536,22 +532,20 @@ def _add_project_lookup_data(self, ht, annotation_fields, *args, **kwargs):
lambda project_data: hl.dict(project_data.starmap(
lambda project_key, families: (project_key[1], families)
)))), 1)
)[0]
)

# Variant can be present in the lookup table with only ref calls, so is still not present in any projects
if not variant_projects:
if not (variant_projects and variant_projects[0]):
raise HTTPNotFound()
variant_projects = variant_projects[0]

self._has_both_sample_types = True
annotation_fields.update({
'familyGenotypes': lambda r: hl.dict(r.family_entries.map(
lambda entries: (entries.first().familyGuid, self._get_sample_genotype(entries.filter(hl.is_defined)))
)),
})

logger.info(f'Looking up {self.DATA_TYPE} variant in {len(variant_projects)} projects')

return super()._add_project_lookup_data(ht, annotation_fields, project_samples=variant_projects, **kwargs)
projects_ht = super()._import_variant_projects_ht(variant_id, project_samples=variant_projects)
return projects_ht.select(familyGenotypes=hl.dict(projects_ht.family_entries.map(
lambda entries: (entries.first().familyGuid, self._get_sample_genotype(entries.filter(hl.is_defined)))
)))

@staticmethod
def _stat_has_non_ref(s):
Expand Down
12 changes: 7 additions & 5 deletions hail_search/queries/sv.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import hail as hl


from hail_search.constants import CONSEQUENCE_SORT, NEW_SV_FIELD, STRUCTURAL_ANNOTATION_FIELD
from hail_search.constants import CONSEQUENCE_SORT, NEW_SV_FIELD, STRUCTURAL_ANNOTATION_FIELD, FAMILY_GUID_FIELD, \
GENOTYPES_FIELD
from hail_search.queries.base import BaseHailTableQuery, PredictionPath


Expand Down Expand Up @@ -132,8 +133,9 @@ def _additional_annotation_fields(self):
)),
}

def _add_project_lookup_data(self, *args, sample_data=None, **kwargs):
def _import_variant_projects_ht(self, variant_id, sample_data=None, **kwargs):
project_samples, _ = self._parse_sample_data(sample_data)
return super()._add_project_lookup_data(
*args, include_sample_annotations=True, project_samples=project_samples, **kwargs,
)
projects_ht = super()._import_variant_projects_ht(variant_id, project_samples=project_samples)

annotation_fields = self.annotation_fields(include_genotype_overrides=False)
return projects_ht.select(**{k: annotation_fields[k](projects_ht) for k in [FAMILY_GUID_FIELD, GENOTYPES_FIELD]})
4 changes: 2 additions & 2 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ async def test_location_search(self):

await self._assert_expected_search(
[SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], omit_data_type='SV_WES',
intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][:1]
intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][1:]
)

await self._assert_expected_search(
Expand Down Expand Up @@ -900,7 +900,7 @@ async def test_annotations_filter(self):

await self._assert_expected_search(
[SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT],
gene_ids=LOCATION_SEARCH['gene_ids'][:1], annotations=annotations, omit_data_type='SV_WES',
gene_ids=LOCATION_SEARCH['gene_ids'][1:], annotations=annotations, omit_data_type='SV_WES',
)

annotations['other'] = annotations['other'][:1]
Expand Down
2 changes: 1 addition & 1 deletion hail_search/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1034,7 +1034,7 @@
}

LOCATION_SEARCH = {
'gene_ids': ['ENSG00000177000', 'ENSG00000097046'],
'gene_ids': ['ENSG00000097046', 'ENSG00000177000'],
'intervals': [['2', 1234, 5678], ['7', 1, 11100], ['1', 11785723, 11806455], ['1', 91500851, 91525764]],
}
EXCLUDE_LOCATION_SEARCH = {'intervals': LOCATION_SEARCH['intervals'], 'exclude_intervals': True}
Expand Down
1 change: 0 additions & 1 deletion reference_data/management/commands/utils/download_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,3 @@ def _get_remote_file_size(url):
except Exception:
# file size not yet implemented for FTP and other protocols, and HEAD not supported for all http requests
return 0

3 changes: 1 addition & 2 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ click==8.1.3
# via pip-tools
coverage==5.1
# via -r requirements-dev.in
django==4.2.16
django==4.2.17
# via
# -c requirements.txt
# django-appconf
Expand Down Expand Up @@ -83,4 +83,3 @@ wheel==0.38.4
# The following packages are considered to be unsafe in a requirements file:
# pip
# setuptools
zipp>=3.19.1 # not directly required, pinned by Snyk to avoid a vulnerability
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ defusedxml==0.7.1
# via
# python3-openid
# social-auth-core
django==4.2.16
django==4.2.17
# via
# -r requirements.in
# django-anymail
Expand Down Expand Up @@ -182,3 +182,4 @@ urllib3==1.26.19
# requests
whitenoise==6.3.0
# via -r requirements.in
zipp>=3.19.1 # not directly required, pinned by Snyk to avoid a vulnerability
32 changes: 16 additions & 16 deletions seqr/management/commands/check_for_new_samples_from_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,28 +166,28 @@ def _load_new_samples(cls, metadata_path, genome_version, dataset_type, run_vers
failed_families_by_guid = {f['guid']: f for f in Family.objects.filter(
guid__in={family for families in failed_family_samples.values() for family in families}
).values('guid', 'family_id', 'project__name')}
failures_by_project_check = defaultdict(lambda: defaultdict(list))
for check, check_failures in failed_family_samples.items():
failures_by_project = defaultdict(list)
for family_guid, failure_data in check_failures.items():
family = failed_families_by_guid[family_guid]
failures_by_project[family['project__name']].append(
failures_by_project_check[family['project__name']][check].append(
f'- {family["family_id"]}: {"; ".join(failure_data["reasons"])}'
)
for project, failures in failures_by_project.items():
for project, failures_by_check in failures_by_project_check.items():
messages = [f'Encountered the following errors loading {project}:']
for check, failures in failures_by_check.items():
summary = '\n'.join(sorted(failures))
split_pdos = split_project_pdos.get(project)
if split_pdos:
summary += f'\n\nSkipped samples in this project have been moved to {", ".join(split_pdos)}'

relatedness_check_message = (
f'\nRelatedness check results: {relatedness_check_file_path}'
if (relatedness_check_file_path and check == RELATEDNESS_CHECK_NAME)
else ''
)
safe_post_to_slack(
SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL,
f'The following {len(failures)} families failed {check.replace("_", " ")} in {project}:\n{summary}{relatedness_check_message}'
)
messages.append(f"The following {len(failures)} families failed {check.replace('_', ' ')}:\n{summary}")
if check == RELATEDNESS_CHECK_NAME and relatedness_check_file_path:
downloadable_link = f'https://storage.cloud.google.com/{relatedness_check_file_path[5:]}'
messages.append(f'Relatedness check results: {downloadable_link}')

split_pdos = split_project_pdos.get(project)
if split_pdos:
messages.append(f'Skipped samples in this project have been moved to {", ".join(split_pdos)}')
safe_post_to_slack(
SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL, '\n\n'.join(messages),
)

# Reload saved variant JSON
updated_variants_by_id = update_projects_saved_variant_json(
Expand Down
Loading

0 comments on commit 2bc28c2

Please sign in to comment.