Significant updates to LD computation functionality.
shz9 committed Oct 1, 2024
1 parent 4336efb commit 89b2aee
Showing 13 changed files with 788 additions and 240 deletions.
13 changes: 11 additions & 2 deletions CHANGELOG.md
@@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.1.4] - 2024-07-29
## [0.1.4] - 2024-10-01

### Changed

@@ -19,16 +19,25 @@ not work well for very large datasets with millions of variants and it causes ov
- Update `run_shell_script` to check for and capture errors.
- Refactored code to slightly reduce import/load times.
- Cleaned up `load_data` method of `LDMatrix` and subsumed functionality in `load_rows`.
- Fixed bugs in `match_snp_tables`.
- Fixed bugs and rewrote how the `block` LD estimator is computed using both the `plink` and `xarray` backends.
- Updated `from_plink_table` method in `LDMatrix` to handle cases where boundaries are different from what
`plink` computes.

### Added

- Added extra validation checks in `LDMatrix` to ensure that the index pointer is formatted correctly.
- `LDLinearOperator` class to allow for efficient linear algebra operations on the LD matrix without
representing the full symmetric matrix in memory.
- Added utility methods to `LDMatrix` class to allow for computing eigenvalues, performing SVD, etc.
- Added `Min eigenvalue` to the attributes of LD matrices.
- Added `Spectral properties` to the attributes of LD matrices.
- Added support for slicing/retrieving entries of the LD matrix using SNP rsIDs (see the usage sketch after this list).
- Added support for reading LD matrices from AWS S3 storage.
- Added utility method to detect if a file contains header information.
- Added utility method to generate overlapping windows over a sequence.
- Added `compute_extremal_eigenvalues` to allow the user to compute extremal (minimum and maximum) eigenvalues
of LD matrices.
- Added the utility function `combine_ld_matrices` to allow for combining LD matrices from different sources.
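
A rough usage sketch of some of the utilities added in this release. The constructor, the rsID-based slicing syntax, and the method/class names below are assumptions inferred from the changelog entries above, not a confirmed API:

```python
# Hypothetical usage of the 0.1.4 LD utilities; names and signatures are
# inferred from the changelog and may differ from the released interface.
import magenpy as mgp

ld = mgp.LDMatrix.from_path("output/ld/chr_22/")    # assumed constructor/path

# Retrieve entries of the LD matrix by SNP rsID (exact slicing syntax assumed):
row = ld['rs12345']

# Extremal (minimum and maximum) eigenvalues of the LD matrix:
extremal_eigs = ld.compute_extremal_eigenvalues()

# Matrix-free linear algebra through the new LDLinearOperator wrapper:
lin_op = mgp.LDLinearOperator(ld)                   # assumed constructor
```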

## [0.1.3] - 2024-05-21

129 changes: 88 additions & 41 deletions magenpy/GenotypeMatrix.py
@@ -29,7 +29,6 @@ class GenotypeMatrix(object):
:ivar bed_file: The path to the plink BED file containing the genotype matrix.
:ivar _genome_build: The genome build or assembly under which the SNP coordinates are defined.
:ivar temp_dir: The directory where temporary files will be stored (if needed).
:ivar cleanup_dir_list: A list of directories to clean up after execution.
:ivar threads: The number of threads to use for parallel computations.
"""
@@ -60,17 +59,22 @@ def __init__(self,
if sample_table is not None:
self.set_sample_table(sample_table)

if snp_table is not None:
if snp_table is not None and 'original_index' not in self.snp_table.columns:
self.snp_table['original_index'] = np.arange(len(self.snp_table))

from .utils.system_utils import makedir

makedir(temp_dir)

temp_dir_prefix = 'gmat_'

if self.chromosome is not None:
temp_dir_prefix += f'chr{self.chromosome}_'

self.temp_dir = tempfile.TemporaryDirectory(dir=temp_dir, prefix=temp_dir_prefix)

self.bed_file = bed_file
self._genome_build = genome_build
self.temp_dir = temp_dir
self.cleanup_dir_list = [] # Directories to clean up after execution.

self.threads = threads
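
For reference, here is a minimal, standard-library-only sketch of the per-instance temporary-directory pattern introduced above; the directory names and chromosome value are illustrative:

```python
import os
import tempfile

# Mirror of the pattern in __init__: a managed temp dir per GenotypeMatrix,
# prefixed with the chromosome when the matrix holds a single chromosome.
base_dir = 'temp'
os.makedirs(base_dir, exist_ok=True)        # what makedir() is assumed to do

prefix = 'gmat_'
chromosome = 22                             # illustrative value
if chromosome is not None:
    prefix += f'chr{chromosome}_'

tmp = tempfile.TemporaryDirectory(dir=base_dir, prefix=prefix)
print(tmp.name)                             # e.g. temp/gmat_chr22_xxxxxxxx
tmp.cleanup()                               # GenotypeMatrix.cleanup() does the same
```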

Expand Down Expand Up @@ -153,12 +157,11 @@ def genome_build(self):
@property
def chromosome(self):
"""
:return: The chromosome associated with the variants in the genotype matrix.
..note::
This is a convenience method that assumes that the genotype matrix contains variants
from a single chromosome. If there are multiple chromosomes, the method will return `None`.
:return: The chromosome associated with the variants in the genotype matrix.
"""
chrom = self.chromosomes
if chrom is not None and len(chrom) == 1:
@@ -377,12 +380,7 @@ def compute_ld(self,
else:
raise KeyError(f"LD estimator {estimator} is not recognized!")

# Create a temporary directory where we store intermediate results:
tmp_ld_dir = tempfile.TemporaryDirectory(dir=self.temp_dir, prefix='ld_')
self.cleanup_dir_list.append(tmp_ld_dir)

return ld_est.compute(output_dir,
temp_dir=tmp_ld_dir.name,
dtype=dtype,
compressor_name=compressor_name,
compression_level=compression_level,
Expand Down Expand Up @@ -538,23 +536,61 @@ def split_by_chromosome(self):
return {chromosome: self}
else:
chrom_tables = self.snp_table.groupby('CHR')

return {
c: self.__class__(sample_table=self.sample_table,
snp_table=chrom_tables.get_group(c),
temp_dir=self.temp_dir)
bed_file=self.bed_file,
temp_dir=self.temp_dir.name,
genome_build=self.genome_build,
threads=self.threads)
for c in chrom_tables.groups
}

def split_by_variants(self, variant_group_dict):
"""
Split the genotype matrix by variants into separate `GenotypeMatrix` objects
based on the groups defined in `variant_group_dict`. The dictionary should have
the group name as the key and the list of SNP rsIDs in that group as the value.
:param variant_group_dict: A dictionary where the key is the group name and the value
is a list of SNP rsIDs to group together.
:return: A dictionary of `GenotypeMatrix` objects, one for each group.
"""

if isinstance(variant_group_dict, dict):

variant_group_dict = pd.concat([
pd.DataFrame({'group': group, 'SNP': snps})
for group, snps in variant_group_dict.items()
])
elif isinstance(variant_group_dict, pd.DataFrame):
assert 'SNP' in variant_group_dict.columns and 'group' in variant_group_dict.columns
else:
raise ValueError("The variant group dictionary is invalid!")

grouped_table = self.snp_table.merge(variant_group_dict, on='SNP').groupby('group')

return {
group: self.__class__(sample_table=self.sample_table,
snp_table=grouped_table.get_group(group).drop(columns='group'),
bed_file=self.bed_file,
temp_dir=self.temp_dir.name,
genome_build=self.genome_build,
threads=self.threads)
for group in grouped_table.groups
}
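
A hypothetical usage example for the new `split_by_variants` method; the group names and rsIDs below are illustrative, and `gmat` is assumed to be an existing `GenotypeMatrix` instance:

```python
variant_groups = {
    'group_a': ['rs12345', 'rs67890'],
    'group_b': ['rs54321'],
}

split = gmat.split_by_variants(variant_groups)

for group, sub_gmat in split.items():
    # Each value is a GenotypeMatrix restricted to the SNPs in that group.
    print(group, len(sub_gmat.snps))
```

A `pandas.DataFrame` with `SNP` and `group` columns is also accepted, per the type check in the method body.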

def cleanup(self):
"""
Clean up all temporary files and directories
"""

for tmp in self.cleanup_dir_list:
try:
tmp.cleanup()
except FileNotFoundError:
continue
try:
self.temp_dir.cleanup()
except FileNotFoundError:
pass


class xarrayGenotypeMatrix(GenotypeMatrix):
Expand Down Expand Up @@ -685,7 +721,12 @@ def filter_snps(self, extract_snps=None, extract_file=None):
"""

super().filter_snps(extract_snps=extract_snps, extract_file=extract_file)
self.xr_mat = self.xr_mat.sel(variant=np.isin(self.xr_mat.variant.coords['snp'], self.snps))

from .utils.compute_utils import intersect_arrays

idx = intersect_arrays(self.xr_mat.variant.coords['snp'].values, self.snps, return_index=True)

self.xr_mat = self.xr_mat.isel(variant=idx)
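
For intuition, an illustrative NumPy equivalent of the index-based selection above; `intersect_arrays(..., return_index=True)` is assumed to return positional indices into the first array, much like `np.intersect1d` with `return_indices=True`:

```python
import numpy as np

matrix_snps = np.array(['rs1', 'rs2', 'rs3', 'rs4'])   # variants in xr_mat
keep_snps = np.array(['rs2', 'rs4'])                    # variants to retain

# Positions (into matrix_snps) of the variants present in both arrays;
# these are the indices that would be passed to xr_mat.isel(variant=...).
_, idx, _ = np.intersect1d(matrix_snps, keep_snps, return_indices=True)
print(idx)   # [1 3]
```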

def filter_samples(self, keep_samples=None, keep_file=None):
"""
@@ -730,7 +771,7 @@ def score(self, beta, standardize_genotype=False, skip_na=True):
:param standardize_genotype: If True, standardize the genotype when computing the polygenic score.
:param skip_na: If True, skip missing values when computing the polygenic score.
:return: The polygenic score (PGS) for each sample in the genotype matrix.
:return: The polygenic score(s) (PGS) for each sample in the genotype matrix.
"""

@@ -789,6 +830,26 @@ def split_by_chromosome(self):

return split

def split_by_variants(self, variant_group_dict):
"""
Split the genotype matrix by variants into separate `xarrayGenotypeMatrix` objects
based on the groups defined in `variant_group_dict`. The dictionary should have
the group name as the key and the list of SNP rsIDs in that group as the value.
:param variant_group_dict: A dictionary where the key is the group name and the value
is a list of SNP rsIDs to group together.
:return: A dictionary of `xarrayGenotypeMatrix` objects, one for each group.
"""

split = super().split_by_variants(variant_group_dict)

for g, gt in split.items():
gt.xr_mat = self.xr_mat.copy()
gt.filter_snps(extract_snps=gt.snps)

return split


class plinkBEDGenotypeMatrix(GenotypeMatrix):
"""
Expand Down Expand Up @@ -834,6 +895,7 @@ def __init__(self,

if self.snp_table is None and self.bed_file:
self.snp_table = parse_bim_file(self.bed_file)
self.snp_table['original_index'] = np.arange(len(self.snp_table))

@classmethod
def from_file(cls, file_path, temp_dir='temp', **kwargs):
Expand Down Expand Up @@ -867,10 +929,11 @@ def score(self, beta, standardize_genotype=False):
from .stats.score.utils import score_plink2

# Create a temporary directory where we store intermediate results:
tmp_score_dir = tempfile.TemporaryDirectory(dir=self.temp_dir, prefix='score_')
self.cleanup_dir_list.append(tmp_score_dir)
tmp_score_dir = tempfile.TemporaryDirectory(dir=self.temp_dir.name, prefix='score_')

return score_plink2(self, beta, standardize_genotype=standardize_genotype, temp_dir=tmp_score_dir.name)
return score_plink2(self, beta,
standardize_genotype=standardize_genotype,
temp_dir=tmp_score_dir.name)

def perform_gwas(self, **gwa_kwargs):
"""
@@ -884,8 +947,7 @@ def perform_gwas(self, **gwa_kwargs):
from .stats.gwa.utils import perform_gwa_plink2

# Create a temporary directory where we store intermediate results:
tmp_gwas_dir = tempfile.TemporaryDirectory(dir=self.temp_dir, prefix='gwas_')
self.cleanup_dir_list.append(tmp_gwas_dir)
tmp_gwas_dir = tempfile.TemporaryDirectory(dir=self.temp_dir.name, prefix='gwas_')

return perform_gwa_plink2(self, temp_dir=tmp_gwas_dir.name, **gwa_kwargs)

@@ -899,8 +961,7 @@ def compute_allele_frequency(self):
from .stats.variant.utils import compute_allele_frequency_plink2

# Create a temporary directory where we store intermediate results:
tmp_freq_dir = tempfile.TemporaryDirectory(dir=self.temp_dir, prefix='freq_')
self.cleanup_dir_list.append(tmp_freq_dir)
tmp_freq_dir = tempfile.TemporaryDirectory(dir=self.temp_dir.name, prefix='freq_')

self.snp_table['MAF'] = compute_allele_frequency_plink2(self, temp_dir=tmp_freq_dir.name)

@@ -916,20 +977,6 @@ def compute_sample_size_per_snp(self):
from .stats.variant.utils import compute_sample_size_per_snp_plink2

# Create a temporary directory where we store intermediate results:
tmp_miss_dir = tempfile.TemporaryDirectory(dir=self.temp_dir, prefix='miss_')
self.cleanup_dir_list.append(tmp_miss_dir)
tmp_miss_dir = tempfile.TemporaryDirectory(dir=self.temp_dir.name, prefix='miss_')

self.snp_table['N'] = compute_sample_size_per_snp_plink2(self, temp_dir=tmp_miss_dir.name)

def split_by_chromosome(self):
"""
Split the genotype matrix by chromosome.
:return: A dictionary of `plinkBEDGenotypeMatrix` objects, one for each chromosome.
"""

split = super().split_by_chromosome()

for c, gt in split.items():
gt.bed_file = self.bed_file

return split
[The remaining 11 changed files are not shown.]
