Updating defaults for LD matrix construction
shz9 committed May 28, 2024
1 parent 9cdc274 commit e6d4481
Showing 6 changed files with 51 additions and 43 deletions.
14 changes: 11 additions & 3 deletions bin/magenpy_ld
@@ -82,16 +82,16 @@ parser.add_argument('--metadata', dest='metadata', type=str,

 # Argument for the float precision:
 parser.add_argument('--storage-dtype', dest='storage_dtype', type=str,
-                    default='int16', help='The data type for the entries of the LD matrix.',
+                    default='int8', help='The data type for the entries of the LD matrix.',
                     choices={'float32', 'float64', 'int16', 'int8'})
 
 # Add arguments for the compressor:
 parser.add_argument('--compressor', dest='compressor', type=str,
-                    default='lz4', help='The compressor name or compression algorithm to use for the LD matrix.',
+                    default='zstd', help='The compressor name or compression algorithm to use for the LD matrix.',
                     choices={'lz4', 'zstd', 'gzip', 'zlib'})
 
 parser.add_argument('--compression-level', dest='compression_level', type=int,
-                    default=5, help='The compression level to use for the entries of the LD matrix (1-9).')
+                    default=7, help='The compression level to use for the entries of the LD matrix (1-9).')
 
 # Options for the various LD estimators:
@@ -229,6 +229,7 @@ ld_mat = g.compute_ld(args.estimator,
                       **ld_kwargs)
 
 # Store metadata (if provided):
+
 if args.metadata is not None:
     parsed_metadata = {
         k: v for entry in args.metadata.split(',') for k, v in [entry.strip().split('=')]
@@ -239,6 +240,13 @@ if args.metadata is not None:
     for k, v in parsed_metadata.items():
         ld_mat.set_store_attr(k, v)
 
+    if 'Date' not in parsed_metadata:
+        # Store the date when the computation was done:
+        ld_mat.set_store_attr('Date', time.strftime("%Y-%m-%d"))
+
+else:
+    # Store the date when the computation was done:
+    ld_mat.set_store_attr('Date', time.strftime("%Y-%m-%d"))
 
 # Clean up all intermediate files and directories:
 g.cleanup()
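Aside: --metadata takes comma-separated Key=Value pairs, and the comprehension above splits them into a dictionary. A small hypothetical walk-through of the parsing and the new Date fallback (example values, not from the commit):

    import time

    metadata_arg = "Genome-build=GRCh38,Biobank=UKB"
    parsed_metadata = {
        k: v for entry in metadata_arg.split(',') for k, v in [entry.strip().split('=')]
    }
    # parsed_metadata == {'Genome-build': 'GRCh38', 'Biobank': 'UKB'}

    # 'Date' was not supplied, so the new code also stamps the run date:
    assert 'Date' not in parsed_metadata
    time.strftime("%Y-%m-%d")  # e.g. '2024-05-28'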
10 changes: 5 additions & 5 deletions magenpy/GWADataLoader.py
@@ -415,13 +415,13 @@ def read_genotypes(self,
         gmat_class = plinkBEDGenotypeMatrix
 
         if self.verbose and len(bed_files) < 2:
-            print("> Reading BED file...")
+            print("> Reading genotype metadata...")
 
         self.genotype = {}
 
         for bfile in tqdm(bed_files,
                           total=len(bed_files),
-                          desc="Reading BED files",
+                          desc="Reading genotype metadata",
                           disable=not self.verbose or len(bed_files) < 2):
             # Read BED file and update the genotypes dictionary:
             self.genotype.update(gmat_class.from_file(bfile,
@@ -615,9 +615,9 @@ def release_ld(self):
     def compute_ld(self,
                    estimator,
                    output_dir,
-                   dtype='int16',
-                   compressor_name='lz4',
-                   compression_level=5,
+                   dtype='int8',
+                   compressor_name='zstd',
+                   compression_level=7,
                    **ld_kwargs):
         """
         Compute the Linkage-Disequilibrium (LD) matrix or SNP-by-SNP Pearson
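Aside: callers that relied on the old implicit defaults now get int8/zstd matrices unless they override the keyword arguments. A usage sketch (assuming the loader is initialized from PLINK BED files via its bed_files argument and that 'windowed' is a valid estimator name, per magenpy's documented interface; the paths are hypothetical):

    import magenpy as mgp

    gdl = mgp.GWADataLoader(bed_files="data/chr_22")
    # New defaults: dtype='int8', compressor_name='zstd', compression_level=7
    gdl.compute_ld('windowed', output_dir='output/ld')
    # Reproducing the old behavior now requires overriding explicitly:
    gdl.compute_ld('windowed', output_dir='output/ld',
                   dtype='int16', compressor_name='lz4', compression_level=5)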
6 changes: 3 additions & 3 deletions magenpy/GenotypeMatrix.py
@@ -341,9 +341,9 @@ def get_snp_attribute(self, attr):
     def compute_ld(self,
                    estimator,
                    output_dir,
-                   dtype='int16',
-                   compressor_name='lz4',
-                   compression_level=5,
+                   dtype='int8',
+                   compressor_name='zstd',
+                   compression_level=7,
                    **ld_kwargs):
         """
24 changes: 12 additions & 12 deletions magenpy/LDMatrix.py
@@ -116,9 +116,9 @@ def from_csr(cls,
                  csr_mat,
                  store_path,
                  overwrite=False,
-                 dtype='int16',
-                 compressor_name='lz4',
-                 compression_level=5):
+                 dtype='int8',
+                 compressor_name='zstd',
+                 compression_level=7):
         """
         Initialize an LDMatrix object from a sparse CSR matrix.
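Aside: a sketch of the updated from_csr entry point with a toy scipy matrix (assumes LDMatrix is exposed at the package root; values and paths are hypothetical):

    import numpy as np
    from scipy.sparse import csr_matrix
    from magenpy import LDMatrix

    # Toy 3-SNP correlation matrix:
    r = np.array([[1.0, 0.4, 0.1],
                  [0.4, 1.0, 0.3],
                  [0.1, 0.3, 1.0]])
    # Defaults now store entries as int8 compressed with zstd at level 7:
    ld_mat = LDMatrix.from_csr(csr_matrix(r), store_path='output/ld/chr_22')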
@@ -171,9 +171,9 @@ def from_plink_table(cls,
                          store_path,
                          pandas_chunksize=None,
                          overwrite=False,
-                         dtype='int16',
-                         compressor_name='lz4',
-                         compression_level=5):
+                         dtype='int8',
+                         compressor_name='zstd',
+                         compression_level=7):
         """
         Construct a Zarr LD matrix using LD tables generated by plink1.9.
@@ -260,9 +260,9 @@ def from_dense_zarr_matrix(cls,
                                store_path,
                                overwrite=False,
                                delete_original=False,
-                               dtype='int16',
-                               compressor_name='lz4',
-                               compression_level=5):
+                               dtype='int8',
+                               compressor_name='zstd',
+                               compression_level=7):
         """
         Initialize a new LD matrix object using a Zarr array object. This method is
         useful for converting a dense LD matrix computed using Dask (or other distributed computing
@@ -359,9 +359,9 @@ def from_ragged_zarr_matrix(cls,
                                 store_path,
                                 overwrite=False,
                                 delete_original=False,
-                                dtype='int16',
-                                compressor_name='lz4',
-                                compression_level=5):
+                                dtype='int8',
+                                compressor_name='zstd',
+                                compression_level=7):
         """
         Initialize a new LD matrix object using a Zarr array object
         conforming to the old LD Matrix format from magenpy v<=0.0.12.
24 changes: 12 additions & 12 deletions magenpy/stats/ld/estimator.py
@@ -61,9 +61,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5):
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7):
         """
         A utility method to compute the LD matrix and store it in Zarr array format.
         It computes the LD matrix, stores it in Zarr array format, sets its attributes,
@@ -238,9 +238,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5):
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7):
         """
         Compute the windowed LD matrix and store in Zarr array format.
@@ -346,9 +346,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5,
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7,
                 chunk_size=1000):
         """
@@ -465,9 +465,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5):
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7):
         """
         Compute the block-based LD matrix and store in Zarr array format.
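Aside: the compressor_name/compression_level pair presumably maps onto a numcodecs codec when the Zarr store is created. A stand-alone sketch of the new storage configuration written with zarr directly (illustrative, not magenpy's internal code; the store path is hypothetical):

    import numpy as np
    import zarr
    from numcodecs import Zstd

    arr = zarr.open('ld_example.zarr', mode='w',
                    shape=(1000,), chunks=(500,),
                    dtype='int8', compressor=Zstd(level=7))
    arr[:] = np.random.randint(-127, 128, size=1000, dtype=np.int8)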
16 changes: 8 additions & 8 deletions magenpy/stats/ld/utils.py
@@ -233,7 +233,7 @@ def harmonic_series_sum(n):
     return ld_mat_obj
 
 
-def estimate_rows_per_chunk(rows, cols, dtype='int16', mem_size=128):
+def estimate_rows_per_chunk(rows, cols, dtype='int8', mem_size=128):
     """
     Estimate the number of rows per chunk for matrices conditional on the desired size of the chunk in MB.
     The estimator takes as input the number of rows, columns, data type, and projected size of the chunk in memory.
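Aside: per the docstring, the estimate divides the chunk's memory budget by the bytes needed per row. One plausible reading of that arithmetic (a sketch, not necessarily the exact formula in utils.py):

    import numpy as np

    def rows_per_chunk_sketch(rows, cols, dtype='int8', mem_size=128):
        # Bytes occupied by a single row of the matrix:
        bytes_per_row = cols * np.dtype(dtype).itemsize
        # Rows fitting in a mem_size (MB) budget, capped at the matrix height:
        return min(rows, int(mem_size * 1024 ** 2 // bytes_per_row))

    # e.g. 50,000 int8 columns with a 128 MB budget -> 2,684 rows per chunk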
@@ -255,9 +255,9 @@ def compute_ld_plink1p9(genotype_matrix,
                         output_dir,
                         temp_dir='temp',
                         overwrite=True,
-                        dtype='int16',
-                        compressor_name='lz4',
-                        compression_level=5):
+                        dtype='int8',
+                        compressor_name='zstd',
+                        compression_level=7):
 
     """
     Compute LD matrices using plink 1.9.
@@ -354,7 +354,7 @@ def compute_ld_plink1p9(genotype_matrix,
     plink1.execute(cmd)
 
     # Convert from PLINK LD files to Zarr:
-    fin_ld_store = osp.join(output_dir, 'ld', 'chr_' + str(genotype_matrix.chromosome))
+    fin_ld_store = osp.join(output_dir, 'chr_' + str(genotype_matrix.chromosome))
 
     # Compute the pandas chunk_size
     # The goal of this is to process chunks of the LD table without overwhelming memory resources:
@@ -382,9 +382,9 @@ def compute_ld_xarray(genotype_matrix,
                       temp_dir='temp',
                       overwrite=True,
                       delete_original=True,
-                      dtype='int16',
-                      compressor_name='lz4',
-                      compression_level=5):
+                      dtype='int8',
+                      compressor_name='zstd',
+                      compression_level=7):
 
     """
     Compute the Linkage Disequilibrium matrix or snp-by-snp
