Skip to content

Commit

Permalink
Merge pull request #4281 from broadinstitute/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
jklugherz authored Aug 14, 2024
2 parents ed7a173 + a871043 commit 8b6f16d
Show file tree
Hide file tree
Showing 250 changed files with 198 additions and 166 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## dev

## 8/9/24
* Update directory structure for search backend

## 8/2/24
* Adds index_file_path to IGV Sample model (REQUIRES DB MIGRATION)

Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.128-eead8100a1c1
Created at 2024/08/07 16:01:38
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
32 changes: 23 additions & 9 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,10 +284,10 @@ def _query_table_annotations(ht, query_table_path):

def _parse_sample_data(self, sample_data):
families = set()
project_samples = defaultdict(lambda: defaultdict(list))
project_samples = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for s in sample_data:
families.add(s['family_guid'])
project_samples[s['project_guid']][s['family_guid']].append(s)
project_samples[s['project_guid']][s['family_guid']][s['sample_type']].append(s)

num_families = len(families)
logger.info(f'Loading {self.DATA_TYPE} data for {num_families} families in {len(project_samples)} projects')
Expand All @@ -296,7 +296,13 @@ def _parse_sample_data(self, sample_data):
def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, n_partitions=MAX_PARTITIONS, **kwargs):
if len(project_samples) == 1:
project_guid = list(project_samples.keys())[0]
project_ht = self._read_table(f'projects/{project_guid}.ht', use_ssd_dir=True)
# for variant lookup, project_samples looks like
# {<project_guid>: {<family_guid>: {<sample_type>: True}, <family_guid_2>: {<sample_type_2>: True}}, <project_guid_2>: ...}
# for variant search, project_samples looks like
# {<project_guid>: {<family_guid>: {<sample_type>: [<sample_data>, <sample_data>, ...], <sample_type_2>: ...}, <family_guid_2>: ...}, <project_guid_2>: ...}
first_family_samples = list(project_samples[project_guid].values())[0]
sample_type = list(first_family_samples.keys())[0]
project_ht = self._read_table(f'projects/{sample_type}/{project_guid}.ht', use_ssd_dir=True)
return self._filter_entries_table(project_ht, project_samples[project_guid], **kwargs)

# Need to chunk tables or else evaluating table globals throws LineTooLong exception
Expand All @@ -308,11 +314,10 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, n_
project_hts = []
sample_data = {}
for project_guid, project_sample_data in project_samples.items():
project_ht = self._read_table(
f'projects/{project_guid}.ht',
use_ssd_dir=True,
skip_missing_field='family_entries' if skip_all_missing else None,
)
first_family_samples = list(project_sample_data.values())[0]
sample_type = list(first_family_samples.keys())[0]
project_ht = self._read_table(f'projects/{sample_type}/{project_guid}.ht', use_ssd_dir=True)

if project_ht is None:
continue
project_hts.append(project_ht.select_globals('sample_type', 'family_guids', 'family_samples'))
Expand All @@ -338,7 +343,8 @@ def import_filtered_table(self, project_samples, num_families, intervals=None, *
if num_families == 1:
family_sample_data = list(project_samples.values())[0]
family_guid = list(family_sample_data.keys())[0]
family_ht = self._read_table(f'families/{family_guid}.ht', use_ssd_dir=True)
sample_type = list(family_sample_data[family_guid].keys())[0]
family_ht = self._read_table(f'families/{sample_type}/{family_guid}.ht', use_ssd_dir=True)
family_ht = family_ht.transmute(family_entries=[family_ht.entries])
family_ht = family_ht.annotate_globals(
family_guids=[family_guid], family_samples={family_guid: family_ht.sample_ids},
Expand Down Expand Up @@ -393,6 +399,14 @@ def _merge_project_hts(project_hts, n_partitions, include_all_globals=False):
def _filter_entries_table(self, ht, sample_data, inheritance_filter=None, quality_filter=None, **kwargs):
ht = self._prefilter_entries_table(ht, **kwargs)

# Temporarily flatten sample_data (dropping the per-sample-type level) until full blended ES/GS support is added
for family_guid, samples_by_sample_type in sample_data.items():
if isinstance(list(samples_by_sample_type.values())[0], list):
samples = [s for samples in samples_by_sample_type.values() for s in samples]
sample_data[family_guid] = samples
else:
sample_data[family_guid] = True

ht, sorted_family_sample_data = self._add_entry_sample_families(ht, sample_data)

passes_quality_filter = self._get_family_passes_quality_filter(quality_filter, ht, **kwargs)
Expand Down
13 changes: 12 additions & 1 deletion hail_search/queries/mito.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os

from aiohttp.web import HTTPNotFound
import hail as hl
import logging
Expand Down Expand Up @@ -308,7 +310,7 @@ def _gene_rank_sort(cls, r, gene_ranks):

def _add_project_lookup_data(self, ht, annotation_fields, *args, **kwargs):
# Get all the project-families for the looked up variant formatted as a dict of dicts:
# {<project_guid>: {<family_guid>: True, <family_guid_2>: True}, <project_guid_2>: ...}
# {<project_guid>: {<family_guid>: {<sample_type>: True}, <family_guid_2>: {<sample_type_2>: True}}, <project_guid_2>: ...}
lookup_ht = self._read_table('lookup.ht', use_ssd_dir=True, skip_missing_field='project_stats')
if lookup_ht is None:
raise HTTPNotFound()
Expand All @@ -325,6 +327,15 @@ def _add_project_lookup_data(self, ht, annotation_fields, *args, **kwargs):
hl.dict(family_indices.map(lambda j: (lookup_ht.project_families[project_guid][j], True))),
))), 1),
)[0]

for project_guid, families in variant_projects.items():
if os.path.exists(self._get_table_path(f'projects/WES/{project_guid}.ht')):
sample_type = 'WES'
else:
sample_type = 'WGS'
for family_guid in families:
families[family_guid] = {sample_type: True}

# Variant can be present in the lookup table with only ref calls, so is still not present in any projects
if not variant_projects:
raise HTTPNotFound()
Expand Down
Loading

0 comments on commit 8b6f16d

Please sign in to comment.