Skip to content

Commit

Permalink
MERGE #19
Browse files Browse the repository at this point in the history
  • Loading branch information
eboileau committed May 22, 2023
2 parents a5366fd + e08adb3 commit d18ec22
Show file tree
Hide file tree
Showing 22 changed files with 17 additions and 68 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ repos:
- id: check-toml

- repo: https://github.com/psf/black
rev: 22.12.0
rev: 23.3.0
hooks:
- id: black
- id: black-jupyter
Expand All @@ -21,15 +21,15 @@ repos:
- id: nbstripout

- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.0.0-alpha.0
rev: v3.0.0-alpha.6
hooks:
- id: prettier
additional_dependencies:
- prettier@2.3.2
- "prettier-plugin-toml"

- repo: https://github.com/python-jsonschema/check-jsonschema
rev: 0.19.2
rev: 0.22.0
hooks:
- id: check-github-workflows
- id: check-readthedocs
Expand Down
3 changes: 0 additions & 3 deletions src/pbiotools/misc/dask_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def connect(args):
cluster = None

if args.cluster_location == "LOCAL":

msg = "[dask_utils]: starting local dask cluster"
logger.info(msg)

Expand Down Expand Up @@ -73,7 +72,6 @@ def connect(args):
def add_dask_options(
parser, num_cpus=1, num_threads_per_cpu=1, cluster_location="LOCAL"
):

"""Add options for connecting to and/or controlling a local dask cluster
Parameters
Expand Down Expand Up @@ -131,7 +129,6 @@ def add_dask_values_to_args(
cluster_location="LOCAL",
client_restart=False,
):

"""Add the options for a dask cluster to the given argparse namespace
This function is mostly intended as a helper for use in ipython notebooks.
Expand Down
2 changes: 0 additions & 2 deletions src/pbiotools/misc/logging_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ def get_logging_options_string(args):
def update_logging(
args, logger=None, format_str="%(levelname)-8s %(name)-8s %(asctime)s : %(message)s"
):

"""This function interprets the logging options in args. Presumably, these
were added to an argument parser using add_logging_options.
Expand Down Expand Up @@ -197,7 +196,6 @@ def update_logging(
def get_ipython_logger(
logging_level="DEBUG", format_str="%(levelname)-8s : %(message)s"
):

level = logging.getLevelName(logging_level)
formatter = logging.Formatter(format_str)

Expand Down
2 changes: 0 additions & 2 deletions src/pbiotools/misc/math_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,6 @@ def check_range(
raise_on_invalid=True,
logger=logger,
):

"""This function checks whether the given value falls within the
specified range. If not, either an exception is raised or a
warning is logged.
Expand Down Expand Up @@ -606,7 +605,6 @@ def matrix_multiply(m1, m2, m3):


def fit_bayesian_gaussian_mixture(X, n_components=100, seed=8675309, **kwargs):

"""Fit a sklearn.mixture.BayesianGaussianMixture with the parameters.
This function is mostly used to give slightly more reasonable defaults for
Expand Down
5 changes: 0 additions & 5 deletions src/pbiotools/misc/mpl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,6 @@ def plot_roc_curve(
xlabel="False positive rate",
ylabel="True positive rate",
):

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
Expand Down Expand Up @@ -407,7 +406,6 @@ def plot_confusion_matrix(
predicted_tick_rotation=None,
out=None,
):

"""Plot the given confusion matrix"""
if ax is None:
fig, ax = plt.subplots()
Expand Down Expand Up @@ -456,7 +454,6 @@ def plot_confusion_matrix(
s = confusion_matrix.shape
it = itertools.product(range(s[0]), range(s[1]))
for i, j in it:

val = confusion_matrix[i, j]
cell_color = cmap(mappable.norm(val))

Expand Down Expand Up @@ -498,7 +495,6 @@ def plot_venn_diagram(
counts_fontsize=12,
sci_notation_limit=999,
):

"""This function is a wrapper around matplotlib_venn. It mostly just makes
setting the fonts and label formatting a bit easier.
Expand Down Expand Up @@ -592,7 +588,6 @@ def create_simple_bar_chart(
legend_fontsize=12,
title_fontsize=12,
):

import numpy as np
import matplotlib.colors
import matplotlib.pyplot as plt
Expand Down
2 changes: 1 addition & 1 deletion src/pbiotools/misc/pandas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def write_df(
filetype="AUTO",
sheet="Sheet_1",
do_not_compress=False,
**kwargs
**kwargs,
):
"""This function writes a data frame to a file of the specified type.
Unless otherwise specified, csv files are gzipped when written. By
Expand Down
6 changes: 3 additions & 3 deletions src/pbiotools/misc/parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def apply_parallel_iter(
progress_bar=False,
total=None,
num_groups=None,
backend="loky"
backend="loky",
):
"""This function parallelizes applying a function to all items in an iterator using the
joblib library. In particular, func is called for each of the items in the list. (Unless
Expand Down Expand Up @@ -171,7 +171,7 @@ def apply_parallel_split(
*args,
progress_bar=False,
num_groups=None,
backend="loky"
backend="loky",
):
"""This function parallelizes applying a function to the rows of a data frame using the
joblib library. The data frame is first split into num_procs equal-sized groups, and
Expand Down Expand Up @@ -343,5 +343,5 @@ def apply_iter_simple(
progress_bar=progress_bar,
total=total,
num_groups=num_groups,
backend=backend
backend=backend,
)
4 changes: 0 additions & 4 deletions src/pbiotools/misc/shell_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ def download_file(url, local_filename=None, chunk_size=1024, overwrite=False):
def check_programs_exist(
programs, raise_on_error=True, package_name=None, logger=logger
):

"""This function checks that all of the programs in the list can be
called from python. After checking all of the programs, an exception
is raised if any of them are not callable. Optionally, only a warning
Expand Down Expand Up @@ -172,7 +171,6 @@ def check_programs_exist(


def check_call_step(cmd, current_step=-1, init_step=-1, call=True, raise_on_error=True):

logging.info(cmd)
ret_code = 0

Expand Down Expand Up @@ -205,7 +203,6 @@ def check_call(cmd, call=True, raise_on_error=True):


def check_output_step(cmd, current_step=0, init_step=0, raise_on_error=True):

logging.info(cmd)
if current_step >= init_step:
logging.info("calling")
Expand Down Expand Up @@ -240,7 +237,6 @@ def call_if_not_exists(
to_delete=[],
keep_delete_files=False,
):

"""This function checks if out_file exists. If it does not, or if overwrite
is true, then the command is executed, according to the call flag.
Otherwise, a warning is issued stating that the file already exists
Expand Down
2 changes: 0 additions & 2 deletions src/pbiotools/misc/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ def check_sbatch(
stderr_file=None,
args=None,
):

"""This function wraps calls to sbatch. It adds the relevant command line
options based on the parameters (either specified or extracted from
args, if args is not None).
Expand Down Expand Up @@ -234,7 +233,6 @@ def add_sbatch_options(
mail_type=["FAIL", "TIME_LIMIT"],
mail_user=None,
):

"""This function adds the options for calling sbatch to the given parser.
The provided arguments are used as defaults for the options.
Expand Down
8 changes: 3 additions & 5 deletions src/pbiotools/misc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,14 +222,13 @@ def get_vars_to_save(to_save, to_remove=["parser", "args"]):
import types

# remove the system variables, modules and functions
for (var_name, value) in to_save.items():
for var_name, value in to_save.items():
if var_name.startswith("__"):
to_remove.append(var_name)

elif isinstance(value, types.FunctionType) or isinstance(
value, types.ModuleType
):

to_remove.append(var_name)

for var_name in to_remove:
Expand Down Expand Up @@ -755,7 +754,7 @@ def write_df(
filetype="AUTO",
sheet="Sheet_1",
do_not_compress=False,
**kwargs
**kwargs,
):
"""This function writes a data frame to a file of the specified type.
Unless otherwise specified, csv files are gzipped when written. By
Expand Down Expand Up @@ -1290,9 +1289,8 @@ def call_func_if_not_exists(
file_checkers=None,
to_delete=[],
keep_delete_files=False,
**kwargs
**kwargs,
):

"""Call a python function with extra checks on input/output files, etc.
This is adapted from shell_utils.call_if_not_exists, see this function
for more details.
Expand Down
20 changes: 3 additions & 17 deletions src/pbiotools/utils/bed_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def read_bed(
comment=None,
header=None,
use_default_field_names=False,
**kwargs
**kwargs,
):
"""This function reads a bed file into a pandas data frame. By default, it
assumes the first line of the bed file actually gives the field names,
Expand Down Expand Up @@ -133,7 +133,7 @@ def write_bed(data_frame, filename, compress=True, **kwargs):
header=header,
do_not_compress=do_not_compress,
quoting=csv.QUOTE_NONE,
**kwargs
**kwargs,
)


Expand Down Expand Up @@ -1306,7 +1306,6 @@ def merge_intervals(interval_starts, interval_ends, interval_info=None):
# and advance
next_interval += 1
if next_interval < num_intervals:

next_interval_start = interval_starts[next_interval]
next_interval_end = interval_ends[next_interval]

Expand Down Expand Up @@ -1430,7 +1429,6 @@ def merge_all_intervals(bed, split=False):
def get_position_intersections(
positions, interval_starts, interval_ends, interval_info=None, position_info=None
):

"""This function finds the intersections of a set of (1bp) points and a
set of intervals, specified by (inclusive) start and (exclusive) end
positions. Furthermore, it allows arbitrary information to be attached
Expand Down Expand Up @@ -1530,10 +1528,8 @@ def get_position_intersections(
next_exon_start = interval_starts[0]

while next_p_site_position != np.inf:

# do we grab the p_site or the exon
if next_p_site_position < next_exon_start:

# then we take the p_site

# first, remove everything from the cache which ends before this
Expand Down Expand Up @@ -1742,10 +1738,8 @@ def get_exact_interval_matches(a_starts, a_ends, a_info, b_starts, b_ends, b_inf

matches = []
while next_a_interval < num_a_intervals:

# get whichever interval comes next
if next_a_start < next_b_start:

# check if this exactly matches anything in the cache
for c in cache:
starts = b_starts[c] == next_a_start
Expand All @@ -1768,7 +1762,6 @@ def get_exact_interval_matches(a_starts, a_ends, a_info, b_starts, b_ends, b_inf
next_a_end = a_ends[next_a_interval]

else:

# just add it to the cache
cache.append(next_b_interval)

Expand All @@ -1787,7 +1780,6 @@ def get_exact_interval_matches(a_starts, a_ends, a_info, b_starts, b_ends, b_inf
def get_exact_block_matches(
matches, block_counts_a, block_counts_b=None, block_id_index=None
):

"""This function finds pairs of transcripts (or whatever outer-level
object is considered) which have exact interval matches for all of
their blocks (i.e., exons). Roughly, it does this by counting the
Expand Down Expand Up @@ -2135,10 +2127,8 @@ def get_interval_overlaps(a_starts, a_ends, a_info, b_starts, b_ends, b_info):

matches = []
while (next_a_interval < num_a_intervals) or (len(a_cache) != 0):

# get whichever interval comes next
if next_a_start < next_b_start:

# a is first

# remove everything in the b_cache which ends before this starts
Expand Down Expand Up @@ -2211,7 +2201,6 @@ def get_interval_overlaps(a_starts, a_ends, a_info, b_starts, b_ends, b_info):


def get_transcript_overlaps(interval_overlaps):

"""This function finds pairs of transcripts (or whatever outer-level
object is considered) which have interval matches across multiple
blocks and counts the total overlap.
Expand Down Expand Up @@ -2409,7 +2398,6 @@ def get_bed_overlaps(

for seqname in seqnames:
for strand in strands:

m_bed_a_seqname = bed_a["seqname"] == seqname
m_bed_b_seqname = bed_b["seqname"] == seqname

Expand Down Expand Up @@ -2548,7 +2536,6 @@ def get_entries_with_upstream_overlaps(
exons_a=None,
exons_b=None,
):

"""This function finds all intervals of A which have upstream intervals of B.
It always takes the strand of the intervals into account. By default, the
function looks for intervals in B which are strictly upstream of the
Expand Down Expand Up @@ -2721,7 +2708,6 @@ def get_bed_sequence(bed_entry, seq_sequence, split_exons=True):
transcript_sequence = seq_sequence[genomic_start:genomic_end]

else:

exon_starts = np.fromstring(
bed_entry["exon_genomic_relative_starts"], sep=",", dtype=int
)
Expand Down Expand Up @@ -2819,7 +2805,7 @@ def get_all_bed_sequences(

all_transcript_sequences = []

for (seqname, sequence) in fasta:
for seqname, sequence in fasta:
msg = "Processing seqname: {}".format(seqname)
logger.debug(msg)

Expand Down
3 changes: 0 additions & 3 deletions src/pbiotools/utils/bio.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ def read_bitseq_tr_file(
comment="#",
sep=" ",
):

"""This function reads the BitSeq transcript_info file into a data frame.
The file is assumed to contain four columns: source_name,
transcript_name, length and effective_length.
Expand Down Expand Up @@ -114,7 +113,6 @@ def read_bitseq_tr_file(
def read_maxquant_peptides_file(
filename, names=None, header="infer", comment="#", sep="\t"
):

"""This function reads the peptides.txt file produced by MaxQuant into a
data frame. By default, the file is assumed to be tab-delimited, and
the first row is used as the column names.
Expand Down Expand Up @@ -143,7 +141,6 @@ def read_maxquant_peptides_file(
def read_protein_digestion_simulator_file(
filename, names=None, header="infer", comment="#", sep="\t"
):

"""This function reads the output of the Protein Digestion Simulator program
(https://omics.pnl.gov/software/protein-digestion-simulator). By default,
the file is assumed to be tab-delimited and the first row is used as
Expand Down
Loading

0 comments on commit d18ec22

Please sign in to comment.