Skip to content

Commit

Permalink
Merge pull request #4494 from broadinstitute/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
hanars authored Nov 21, 2024
2 parents d4df2c2 + 95d0bfd commit ef13b05
Show file tree
Hide file tree
Showing 58 changed files with 610 additions and 409 deletions.
3 changes: 3 additions & 0 deletions .codacy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
---
exclude_paths:
- "deploy/postgres/initdb.sql"
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## dev

## 11/21/24
* Migrate "Submit to Clinvar" to generic report flag for Variant Notes (REQUIRES DB MIGRATION)

## 10/28/24
* Update RNA Tissue Type choices (REQUIRES DB MIGRATION)

Expand Down
1 change: 1 addition & 0 deletions deploy/LOCAL_INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ wget https://raw.githubusercontent.com/broadinstitute/seqr/master/docker-compose
docker compose up -d seqr # start up the seqr docker image in the background after also starting other components it depends on (postgres, redis, elasticsearch). This may take 10+ minutes.
docker compose logs -f seqr # (optional) continuously print seqr logs to see when it is done starting up or if there are any errors. Type Ctrl-C to exit from the logs.

docker compose exec seqr python manage.py update_all_reference_data --use-cached-omim # Initialize reference data
docker compose exec seqr python manage.py createsuperuser # create a seqr Admin user

open http://localhost # open the seqr landing page in your browser. Log in to seqr using the email and password from the previous step
Expand Down
17 changes: 4 additions & 13 deletions deploy/docker/seqr/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,11 @@ do
fi
done

# init and populate seqrdb unless it already exists
if ! psql --host "$POSTGRES_SERVICE_HOSTNAME" -U "$POSTGRES_USERNAME" -l | grep seqrdb; then
psql --host "$POSTGRES_SERVICE_HOSTNAME" -U "$POSTGRES_USERNAME" -c 'CREATE DATABASE reference_data_db';
psql --host "$POSTGRES_SERVICE_HOSTNAME" -U "$POSTGRES_USERNAME" -c 'CREATE DATABASE seqrdb';
python -u manage.py migrate
python -u manage.py migrate --database=reference_data
python -u manage.py loaddata variant_tag_types
python -u manage.py update_all_reference_data --use-cached-omim
else
# run any pending migrations if the database already exists
python -u manage.py migrate
python -u manage.py migrate --database=reference_data
fi
# run any pending migrations and load missing data
python -u manage.py migrate
python -u manage.py migrate --database=reference_data
python -u manage.py loaddata variant_searches
python -u manage.py loaddata variant_tag_types

python -u manage.py check

Expand Down
7 changes: 1 addition & 6 deletions deploy/kubectl_helpers/shell.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,4 @@ COMPONENT=$2

POD_NAME=$("${DIR}"/utils/get_pod_name.sh "$@")

case ${COMPONENT} in
seqr) CONTAINER=seqr-pod ;;
*) CONTAINER=${COMPONENT} ;;
esac

kubectl exec -it "${POD_NAME}" -c "${CONTAINER}" -- /bin/bash
kubectl exec -it "${POD_NAME}" -c "${COMPONENT}" -- /bin/bash
2 changes: 2 additions & 0 deletions deploy/postgres/initdb.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
CREATE DATABASE seqrdb;
CREATE DATABASE reference_data_db;
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ services:
- PGPORT=5433
- POSTGRES_PASSWORD=docker-compose-postgres-password
volumes:
- ./deploy/postgres/initdb.sql:/docker-entrypoint-initdb.d/initdb.sql
- ./data/postgres:/var/lib/postgresql/data
healthcheck:
test: pg_isready -h postgres -U postgres
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.130-bea04d9c79b5
Created at 2024/10/02 14:46:35
Created at 2024/11/04 13:45:23
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
58 changes: 35 additions & 23 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,9 +472,7 @@ def _apply_entry_filters(ht):
def _filter_single_entries_table(self, ht, project_families, inheritance_filter=None, quality_filter=None, is_merged_ht=False, **kwargs):
ht, sorted_family_sample_data = self._add_entry_sample_families(ht, project_families, is_merged_ht)
ht = self._filter_quality(ht, quality_filter, **kwargs)
ht, ch_ht = self._filter_inheritance(
ht, None, inheritance_filter, sorted_family_sample_data,
)
ht, ch_ht = self._filter_inheritance(ht, None, inheritance_filter, sorted_family_sample_data)
ht = self._apply_entry_filters(ht)
ch_ht = self._apply_entry_filters(ch_ht)

Expand Down Expand Up @@ -574,7 +572,7 @@ def _get_sample_type(cls, family_index, ht_globals):

def _filter_inheritance(
self, ht, comp_het_ht, inheritance_filter, sorted_family_sample_data,
annotation='family_entries', entries_ht_field='family_entries'
annotation='family_entries', entries_ht_field='family_entries', **kwargs
):
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)

Expand All @@ -584,14 +582,14 @@ def _filter_inheritance(
any_valid_entry = lambda x: prev_any_valid_entry(x) & (x.affected_id == AFFECTED_ID)

ht = ht.annotate(**{
annotation: ht[entries_ht_field].map(
entries_ht_field: ht[entries_ht_field].map(
lambda entries: hl.or_missing(entries.any(any_valid_entry), entries)
)})

if self._has_comp_het_search:
comp_het_ht = self._annotate_families_inheritance(
comp_het_ht if comp_het_ht is not None else ht, COMPOUND_HET, inheritance_filter,
sorted_family_sample_data, annotation, entries_ht_field
sorted_family_sample_data, annotation, entries_ht_field, **kwargs
)

if is_any_affected or not (inheritance_filter or self._inheritance_mode):
Expand All @@ -600,15 +598,43 @@ def _filter_inheritance(

ht = None if self._inheritance_mode == COMPOUND_HET else self._annotate_families_inheritance(
ht, self._inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field
annotation, entries_ht_field, **kwargs
)

return ht, comp_het_ht

def _annotate_families_inheritance(
self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field,
annotation, entries_ht_field, family_passes_inheritance_filter = None
):
if not family_passes_inheritance_filter:
family_passes_inheritance_filter = self._get_family_passes_inheritance_filter

entry_indices_by_gt = self._get_entry_indices_by_gt_map(
inheritance_filter, inheritance_mode, sorted_family_sample_data
)

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_idx, family_samples: family_passes_inheritance_filter(
entry_indices, family_idx, genotype, family_samples, ht, annotation
)
)
})

return ht

def _get_family_passes_inheritance_filter(self, entry_indices, family_idx, genotype, family_samples, *args):
# Default per-family inheritance check: _annotate_families_inheritance falls back to
# this method when no `family_passes_inheritance_filter` callable is supplied.
#
# entry_indices:  hail dict keyed by family index; each value is a collection of sample
#                 indices that must match `genotype`.
# family_idx:     position of the family within the entries array being enumerated.
# genotype:       key into self.GENOTYPE_QUERY_MAP selecting the GT predicate to apply.
# family_samples: the entries for this family; indexed by sample index, each with a `.GT`.
# *args:          absorbs the trailing `ht` and `annotation` arguments the caller always
#                 passes; they are unused here.
#
# Returns `family_samples` unchanged when the family passes, otherwise a missing value.
return hl.or_missing(
# A family passes if it has no samples to check for this genotype, or if every
# listed sample's GT satisfies the genotype predicate.
~entry_indices.contains(family_idx) | entry_indices[family_idx].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples)

def _get_entry_indices_by_gt_map(self, inheritance_filter, inheritance_mode, sorted_family_sample_data):
individual_genotype_filter = (inheritance_filter or {}).get('genotype')

# Create a mapping of genotypes to check against a list of samples for a family
Expand All @@ -630,21 +656,7 @@ def _annotate_families_inheritance(
]
self.max_unaffected_samples = max(family_unaffected_counts) if family_unaffected_counts else 0

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_i, family_samples: hl.or_missing(
~entry_indices.contains(family_i) | entry_indices[family_i].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples,
),
)
})

return ht
return entry_indices_by_gt

def _get_family_passes_quality_filter(self, quality_filter, ht, **kwargs):
quality_filter = quality_filter or {}
Expand Down
80 changes: 59 additions & 21 deletions hail_search/queries/mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,6 @@ class MitoHailTableQuery(BaseHailTableQuery):
CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS + ['rsid']
MITO_ANNOTATION_FIELDS = {
'commonLowHeteroplasmy': lambda r: r.common_low_heteroplasmy,
'highConstraintRegion': (
lambda r: r.high_constraint_region if hasattr(r, 'high_constraint_region') else r.high_constraint_region_mito
),
'mitomapPathogenic': lambda r: r.mitomap.pathogenic,
}
BASE_ANNOTATION_FIELDS = {
Expand Down Expand Up @@ -205,37 +202,62 @@ def _filter_entries_ht_both_sample_types(
)

ch_ht = None
family_guid_idx_map = defaultdict(dict)
family_idx_map = defaultdict(dict)
for sample_type, sorted_family_sample_data in sample_types:
ht = self._annotate_initial_passes_inheritance(ht, sample_type)
ch_ht = self._annotate_initial_passes_inheritance(ch_ht, sample_type)
ht, ch_ht = self._filter_inheritance(
ht, ch_ht, inheritance_filter, sorted_family_sample_data,
annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field
annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field,
family_passes_inheritance_filter=self._get_family_passes_inheritance_filter_both_sample_types
)
for family_idx, samples in enumerate(sorted_family_sample_data):
family_guid = samples[0]['familyGuid']
family_guid_idx_map[family_guid][sample_type.value] = family_idx
family_idx_map[family_guid][sample_type.value] = family_idx

family_idx_map = hl.dict(family_guid_idx_map)
family_idx_map = hl.dict(family_idx_map)
ht = self._apply_multi_sample_type_entry_filters(ht, family_idx_map)
ch_ht = self._apply_multi_sample_type_entry_filters(ch_ht, family_idx_map)
return ht, ch_ht

@staticmethod
def _annotate_initial_passes_inheritance(ht, sample_type):
# Seed the per-sample "passes inheritance" annotation for one sample type before any
# genotype filtering runs: every sample in every family starts out flagged True.
#
# ht:          table to annotate, or None (e.g. when there is no comp-het table yet).
# sample_type: supplies the `passes_inheritance_field` name to write and the
#              `family_entries_field` name to read.
#
# Returns the annotated table, or None when `ht` is None.
if ht is None:
return ht

return ht.annotate(**{
sample_type.passes_inheritance_field: ht[sample_type.family_entries_field].map(
# Build an all-True array with the same length as this family's entries.
lambda family_entries: hl.array(
hl.range(0, hl.len(family_entries)).map(lambda _: True)
)
)})

def _get_family_passes_inheritance_filter_both_sample_types(
self, entry_indices, family_idx, genotype, family_samples, ht, annotation
):
# Per-sample variant of the family inheritance check, passed to _filter_inheritance as
# `family_passes_inheritance_filter` when both sample types are being searched.
# Instead of keeping or dropping the whole family, it returns an updated array of
# per-sample pass flags for the family at `family_idx`.
#
# entry_indices:  hail dict of family index -> sample indices that must match `genotype`.
# genotype:       key into self.GENOTYPE_QUERY_MAP selecting the GT predicate to apply.
# family_samples: the entries for this family; indexed by sample index, each with a `.GT`.
# ht, annotation: `ht[annotation][family_idx]` holds the current per-sample flags.
#
# NOTE(review): when `family_idx` is not a key of `entry_indices`, `.get` presumably
# yields a missing value and the first `when` condition is missing - confirm how
# hl.case() resolves that for families with nothing to check.
return hl.enumerate(ht[annotation][family_idx]).starmap(
lambda sample_idx, passes: (hl.case()
# Sample is not constrained for this genotype: keep its existing flag.
.when(~entry_indices.get(family_idx).contains(sample_idx), passes)
# Sample is constrained and its GT fails the predicate: mark it as failing.
.when(~self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_idx].GT), False)
# Sample is constrained and passes: keep its existing flag, so an earlier
# failure for another genotype is not overwritten.
.default(passes))
)

def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
if ht is None:
return ht

# Keep family from both sample types if either passes quality AND inheritance
for sample_type in SampleType:
ht = ht.annotate(**{
sample_type.family_entries_field: hl.enumerate(ht[sample_type.family_entries_field]).starmap(
lambda i, family_samples: hl.or_missing(
hl.bind(
lambda other_sample_type_idx: (
self._family_has_valid_sample_type_entries(ht, sample_type, i) |
self._family_has_valid_sample_type_entries(ht, sample_type.other_sample_type, other_sample_type_idx)
),
family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
), family_samples)
lambda family_idx, family_samples: hl.or_missing(
hl.bind(lambda other_sample_type_family_idx: ((
self._family_has_valid_quality(ht, sample_type, family_idx) |
self._family_has_valid_quality(ht, sample_type.other_sample_type, other_sample_type_family_idx)
) &
self._family_has_valid_inheritance(ht, sample_type, family_idx, other_sample_type_family_idx) &
self._family_has_valid_inheritance(ht, sample_type.other_sample_type, other_sample_type_family_idx, family_idx)
), family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
), family_samples)
)})

# Merge family entries and filters from both sample types
Expand All @@ -253,13 +275,29 @@ def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
return ht.filter(ht.family_entries.any(hl.is_defined))

@staticmethod
def _family_has_valid_sample_type_entries(ht, sample_type, sample_type_family_idx):
# Note: This logic does not sufficiently handle case 2 here https://docs.google.com/presentation/d/1hqDV8ulhviUcR5C4PtNUqkCLXKDsc6pccgFVlFmWUAU/edit?usp=sharing
# and will need to be changed to support it - https://github.com/broadinstitute/seqr/issues/4403
def _family_has_valid_quality(ht, sample_type, sample_type_family_idx):
return (
hl.is_defined(sample_type_family_idx) &
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx]) &
hl.is_defined(ht[sample_type.passes_inheritance_field][sample_type_family_idx])
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx])
)

@staticmethod
def _family_has_valid_inheritance(ht, sample_type, family_idx, other_sample_type_family_idx):
# Decide whether a family's inheritance result for `sample_type` is acceptable once the
# other sample type is taken into account: every sample that FAILS inheritance in
# `sample_type` must be a sample that PASSES inheritance in the other sample type.
#
# ht:                           table carrying, for each sample type, the family entries
#                               and per-sample passes-inheritance arrays.
# sample_type:                  the sample type whose failures are being checked.
# family_idx:                   index of the family within `sample_type`'s entries.
# other_sample_type_family_idx: index of the same family within the other sample type's
#                               entries.
#
# Returns a hail boolean expression.
return hl.bind(
lambda sample_type_fail_samples, other_sample_type_pass_samples: (
sample_type_fail_samples.all(other_sample_type_pass_samples.contains)
# First bound value: sample IDs in this sample type whose pass flag is False.
), hl.enumerate(ht[sample_type.family_entries_field][family_idx]).starmap(
lambda sample_idx, sample: hl.or_missing(
~ht[sample_type.passes_inheritance_field][family_idx][sample_idx],
sample['sampleId'],
)
).filter(hl.is_defined),
# Second bound value: sample IDs in the other sample type whose pass flag is True.
hl.enumerate(ht[sample_type.other_sample_type.family_entries_field][other_sample_type_family_idx]).starmap(
lambda sample_idx, sample: hl.or_missing(
ht[sample_type.other_sample_type.passes_inheritance_field][other_sample_type_family_idx][sample_idx],
sample['sampleId'],
)
).filter(hl.is_defined),
)

def _get_sample_genotype(self, samples, r=None, include_genotype_overrides=False, select_fields=None, **kwargs):
Expand Down
8 changes: 7 additions & 1 deletion hail_search/queries/multi_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,13 @@ def _filter_data_type_comp_hets(self, variant_query, variant_families, sv_query)
for s in variant_samples_by_family[f]
] for f in overlapped_families])
sv_ht = sv_ht.annotate(family_entries=hl.enumerate(sv_sample_indices).starmap(
lambda family_i, indices: indices.map(lambda sample_i: sv_ht.family_entries[family_i][sample_i])
lambda family_i, indices: hl.bind(
lambda family_entry: hl.or_missing(
hl.is_defined(family_entry),
indices.map(lambda sample_i: family_entry[sample_i]),
),
sv_ht.family_entries[family_i],
)
))

variant_ch_ht = variant_ht.group_by('gene_ids').aggregate(v1=hl.agg.collect(variant_ht.row))
Expand Down
6 changes: 4 additions & 2 deletions hail_search/requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#
aiohappyeyeballs==2.3.5
# via aiohttp
aiohttp==3.10.2
aiohttp==3.10.11
# via pytest-aiohttp
aiosignal==1.3.1
# via aiohttp
Expand Down Expand Up @@ -34,6 +34,8 @@ packaging==23.1
# via pytest
pluggy==1.2.0
# via pytest
propcache==0.2.0
# via yarl
pytest==7.4.0
# via
# pytest-aiohttp
Expand All @@ -44,5 +46,5 @@ pytest-asyncio==0.21.0
# via pytest-aiohttp
tomli==2.0.1
# via pytest
yarl==1.9.2
yarl==1.17.2
# via aiohttp
Loading

0 comments on commit ef13b05

Please sign in to comment.