Updating defaults for LD matrix construction
shz9 committed May 28, 2024
1 parent 9cdc274 commit e6d4481
Showing 6 changed files with 51 additions and 43 deletions.
14 changes: 11 additions & 3 deletions bin/magenpy_ld
@@ -82,16 +82,16 @@ parser.add_argument('--metadata', dest='metadata', type=str,

 # Argument for the float precision:
 parser.add_argument('--storage-dtype', dest='storage_dtype', type=str,
-                    default='int16', help='The data type for the entries of the LD matrix.',
+                    default='int8', help='The data type for the entries of the LD matrix.',
                     choices={'float32', 'float64', 'int16', 'int8'})
 
 # Add arguments for the compressor:
 parser.add_argument('--compressor', dest='compressor', type=str,
-                    default='lz4', help='The compressor name or compression algorithm to use for the LD matrix.',
+                    default='zstd', help='The compressor name or compression algorithm to use for the LD matrix.',
                     choices={'lz4', 'zstd', 'gzip', 'zlib'})
 
 parser.add_argument('--compression-level', dest='compression_level', type=int,
-                    default=5, help='The compression level to use for the entries of the LD matrix (1-9).')
+                    default=7, help='The compression level to use for the entries of the LD matrix (1-9).')
 
 # Options for the various LD estimators:
@@ -229,6 +229,7 @@ ld_mat = g.compute_ld(args.estimator,
                       **ld_kwargs)
 
 # Store metadata (if provided):
+
 if args.metadata is not None:
     parsed_metadata = {
         k: v for entry in args.metadata.split(',') for k, v in [entry.strip().split('=')]
@@ -239,6 +240,13 @@ if args.metadata is not None:
     for k, v in parsed_metadata.items():
         ld_mat.set_store_attr(k, v)
 
+    if 'Date' not in parsed_metadata:
+        # Store the date when the computation was done:
+        ld_mat.set_store_attr('Date', time.strftime("%Y-%m-%d"))
+
+else:
+    # Store the date when the computation was done:
+    ld_mat.set_store_attr('Date', time.strftime("%Y-%m-%d"))
 
 # Clean up all intermediate files and directories:
 g.cleanup()
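Aside: --metadata takes comma-separated Key=Value pairs, and the comprehension above splits them into a dictionary. A small hypothetical walk-through of the parsing and the new Date fallback (example values, not from the commit):

    import time

    metadata_arg = "Genome-build=GRCh38,Biobank=UKB"
    parsed_metadata = {
        k: v for entry in metadata_arg.split(',') for k, v in [entry.strip().split('=')]
    }
    # parsed_metadata == {'Genome-build': 'GRCh38', 'Biobank': 'UKB'}

    # 'Date' was not supplied, so the new code also stamps the run date:
    assert 'Date' not in parsed_metadata
    time.strftime("%Y-%m-%d")  # e.g. '2024-05-28'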
10 changes: 5 additions & 5 deletions magenpy/GWADataLoader.py
@@ -415,13 +415,13 @@ def read_genotypes(self,
         gmat_class = plinkBEDGenotypeMatrix
 
         if self.verbose and len(bed_files) < 2:
-            print("> Reading BED file...")
+            print("> Reading genotype metadata...")
 
         self.genotype = {}
 
         for bfile in tqdm(bed_files,
                           total=len(bed_files),
-                          desc="Reading BED files",
+                          desc="Reading genotype metadata",
                           disable=not self.verbose or len(bed_files) < 2):
             # Read BED file and update the genotypes dictionary:
             self.genotype.update(gmat_class.from_file(bfile,
@@ -615,9 +615,9 @@ def release_ld(self):
     def compute_ld(self,
                    estimator,
                    output_dir,
-                   dtype='int16',
-                   compressor_name='lz4',
-                   compression_level=5,
+                   dtype='int8',
+                   compressor_name='zstd',
+                   compression_level=7,
                    **ld_kwargs):
         """
         Compute the Linkage-Disequilibrium (LD) matrix or SNP-by-SNP Pearson
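Aside: callers that relied on the old implicit defaults now get int8/zstd matrices unless they override the keyword arguments. A usage sketch (assuming the loader is initialized from PLINK BED files via its bed_files argument and that 'windowed' is a valid estimator name, per magenpy's documented interface; the paths are hypothetical):

    import magenpy as mgp

    gdl = mgp.GWADataLoader(bed_files="data/chr_22")
    # New defaults: dtype='int8', compressor_name='zstd', compression_level=7
    gdl.compute_ld('windowed', output_dir='output/ld')
    # Reproducing the old behavior now requires overriding explicitly:
    gdl.compute_ld('windowed', output_dir='output/ld',
                   dtype='int16', compressor_name='lz4', compression_level=5)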
6 changes: 3 additions & 3 deletions magenpy/GenotypeMatrix.py
@@ -341,9 +341,9 @@ def get_snp_attribute(self, attr):
     def compute_ld(self,
                    estimator,
                    output_dir,
-                   dtype='int16',
-                   compressor_name='lz4',
-                   compression_level=5,
+                   dtype='int8',
+                   compressor_name='zstd',
+                   compression_level=7,
                    **ld_kwargs):
         """
24 changes: 12 additions & 12 deletions magenpy/LDMatrix.py
@@ -116,9 +116,9 @@ def from_csr(cls,
                  csr_mat,
                  store_path,
                  overwrite=False,
-                 dtype='int16',
-                 compressor_name='lz4',
-                 compression_level=5):
+                 dtype='int8',
+                 compressor_name='zstd',
+                 compression_level=7):
         """
         Initialize an LDMatrix object from a sparse CSR matrix.
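Aside: a sketch of the updated from_csr entry point with a toy scipy matrix (assumes LDMatrix is exposed at the package root; values and paths are hypothetical):

    import numpy as np
    from scipy.sparse import csr_matrix
    from magenpy import LDMatrix

    # Toy 3-SNP correlation matrix:
    r = np.array([[1.0, 0.4, 0.1],
                  [0.4, 1.0, 0.3],
                  [0.1, 0.3, 1.0]])
    # Defaults now store entries as int8 compressed with zstd at level 7:
    ld_mat = LDMatrix.from_csr(csr_matrix(r), store_path='output/ld/chr_22')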
@@ -171,9 +171,9 @@ def from_plink_table(cls,
                          store_path,
                          pandas_chunksize=None,
                          overwrite=False,
-                         dtype='int16',
-                         compressor_name='lz4',
-                         compression_level=5):
+                         dtype='int8',
+                         compressor_name='zstd',
+                         compression_level=7):
         """
         Construct a Zarr LD matrix using LD tables generated by plink1.9.
@@ -260,9 +260,9 @@ def from_dense_zarr_matrix(cls,
                                store_path,
                                overwrite=False,
                                delete_original=False,
-                               dtype='int16',
-                               compressor_name='lz4',
-                               compression_level=5):
+                               dtype='int8',
+                               compressor_name='zstd',
+                               compression_level=7):
         """
         Initialize a new LD matrix object using a Zarr array object. This method is
         useful for converting a dense LD matrix computed using Dask (or other distributed computing
@@ -359,9 +359,9 @@ def from_ragged_zarr_matrix(cls,
                                 store_path,
                                 overwrite=False,
                                 delete_original=False,
-                                dtype='int16',
-                                compressor_name='lz4',
-                                compression_level=5):
+                                dtype='int8',
+                                compressor_name='zstd',
+                                compression_level=7):
         """
         Initialize a new LD matrix object using a Zarr array object
         conforming to the old LD Matrix format from magenpy v<=0.0.12.
24 changes: 12 additions & 12 deletions magenpy/stats/ld/estimator.py
@@ -61,9 +61,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5):
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7):
         """
         A utility method to compute the LD matrix and store it in Zarr array format.
         It computes the LD matrix, stores it in Zarr array format, sets its attributes,
@@ -238,9 +238,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5):
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7):
         """
         Compute the windowed LD matrix and store in Zarr array format.
@@ -346,9 +346,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5,
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7,
                 chunk_size=1000):
         """
@@ -465,9 +465,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5):
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7):
         """
         Compute the block-based LD matrix and store in Zarr array format.
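Aside: the compressor_name/compression_level pair presumably maps onto a numcodecs codec when the Zarr store is created. A stand-alone sketch of the new storage configuration written with zarr directly (illustrative, not magenpy's internal code; the store path is hypothetical):

    import numpy as np
    import zarr
    from numcodecs import Zstd

    arr = zarr.open('ld_example.zarr', mode='w',
                    shape=(1000,), chunks=(500,),
                    dtype='int8', compressor=Zstd(level=7))
    arr[:] = np.random.randint(-127, 128, size=1000, dtype=np.int8)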
16 changes: 8 additions & 8 deletions magenpy/stats/ld/utils.py
@@ -233,7 +233,7 @@ def harmonic_series_sum(n):
     return ld_mat_obj
 
 
-def estimate_rows_per_chunk(rows, cols, dtype='int16', mem_size=128):
+def estimate_rows_per_chunk(rows, cols, dtype='int8', mem_size=128):
     """
     Estimate the number of rows per chunk for matrices conditional on the desired size of the chunk in MB.
     The estimator takes as input the number of rows, columns, data type, and projected size of the chunk in memory.
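Aside: per the docstring, the estimate divides the chunk's memory budget by the bytes needed per row. One plausible reading of that arithmetic (a sketch, not necessarily the exact formula in utils.py):

    import numpy as np

    def rows_per_chunk_sketch(rows, cols, dtype='int8', mem_size=128):
        # Bytes occupied by a single row of the matrix:
        bytes_per_row = cols * np.dtype(dtype).itemsize
        # Rows fitting in a mem_size (MB) budget, capped at the matrix height:
        return min(rows, int(mem_size * 1024 ** 2 // bytes_per_row))

    # e.g. 50,000 int8 columns with a 128 MB budget -> 2,684 rows per chunk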
@@ -255,9 +255,9 @@ def compute_ld_plink1p9(genotype_matrix,
                         output_dir,
                         temp_dir='temp',
                         overwrite=True,
-                        dtype='int16',
-                        compressor_name='lz4',
-                        compression_level=5):
+                        dtype='int8',
+                        compressor_name='zstd',
+                        compression_level=7):
 
     """
     Compute LD matrices using plink 1.9.
@@ -354,7 +354,7 @@ def compute_ld_plink1p9(genotype_matrix,
     plink1.execute(cmd)
 
     # Convert from PLINK LD files to Zarr:
-    fin_ld_store = osp.join(output_dir, 'ld', 'chr_' + str(genotype_matrix.chromosome))
+    fin_ld_store = osp.join(output_dir, 'chr_' + str(genotype_matrix.chromosome))
 
     # Compute the pandas chunk_size
     # The goal of this is to process chunks of the LD table without overwhelming memory resources:
@@ -382,9 +382,9 @@ def compute_ld_xarray(genotype_matrix,
                       temp_dir='temp',
                       overwrite=True,
                       delete_original=True,
-                      dtype='int16',
-                      compressor_name='lz4',
-                      compression_level=5):
+                      dtype='int8',
+                      compressor_name='zstd',
+                      compression_level=7):
 
     """
     Compute the Linkage Disequilibrium matrix or snp-by-snp
