adding docstrings

ruggleslab · May 20, 2020 · 6d8f2ee · 6d8f2ee
1 parent 64b0683
commit 6d8f2ee
Show file tree

Hide file tree

Showing 3 changed files with 135 additions and 18 deletions.
diff --git a/phosphodisco/classes.py b/phosphodisco/classes.py
@@ -568,7 +568,7 @@ def analyze_aa_sequences(
         Returns: self with module_aa_freqs and module_aa_enrichment attributes.
 
         """
-        self.module_freqs = {
+        self.module_aa_freqs = {
             module: pd.DataFrame([Counter(tup) for tup in list(zip(*aas))])
             for module, aas in self.module_sequences.items()
         }

diff --git a/phosphodisco/parsers.py b/phosphodisco/parsers.py
@@ -5,12 +5,12 @@
 
 
 def get_sep(file_path: str) -> str:
-    """
+    """Figure out the sep based on file name. Only helps with tsv and csv.
 
     Args:
-        file_path:
+        file_path: Path of file.
 
-    Returns:
+    Returns: Separator string inferred from the file extension.
 
     """
     if file_path[-4:] == '.tsv':
@@ -21,32 +21,77 @@ def get_sep(file_path: str) -> str:
 
 
 def read_protein(file_path: str) -> DataFrame:
+    """Reads in protein abundance values. Proteins as rows, samples as columns.
+    First column must be protein identifier.
+
+    Args:
+        file_path: Path to protein csv or tsv.
+
+    Returns: DataFrame with proteins as rows, samples as columns.
+
+    """
     sep = get_sep(file_path)
     return pd.read_csv(file_path, sep=sep, index_col=0).replace(
         ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na'], np.nan
     ).astype(float)
 
 
 def read_annotation(file_path: str) -> DataFrame:
+    """Reads in sample annotation file. Sample as rows, annotations as columns.
+
+    Args:
+        file_path: Path to annotation csv or tsv. First column must be sample identifier.
+
+    Returns: DataFrame with samples as rows, annotations as columns.
+
+    """
     sep = get_sep(file_path)
     return pd.read_csv(file_path, sep=sep, index_col=0).replace(
         ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na'], np.nan
     )
 
 
 def read_phospho(file_path: str) -> Optional[DataFrame]:
+    """Reads in phosphosite abundance values. Phosphosites as rows, samples as columns. First two
+    columns must be protein, variable site identifiers, respectively. Can use this for raw or
+    normalized phospho data tables.
+
+    Args:
+        file_path: Path to phospho csv or tsv.
+
+    Returns: DataFrame with phosphosites as rows, samples as columns.
+
+    """
     sep = get_sep(file_path)
     return pd.read_csv(file_path, sep=sep, index_col=[0, 1]).replace(
         ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na'], np.nan
     ).astype(float)
 
 
 def read_list(file_path: str):
+    """Reads in a newline-separated file of things into a list.
+
+    Args:
+        file_path: Path to file.
+
+    Returns: List of lines with surrounding whitespace stripped.
+
+    """
     with open(file_path, 'r') as fh:
         return [s.strip() for s in fh.readlines()]
 
 
 def column_normalize(df: DataFrame, method: str) -> DataFrame:
+    """Normalizes samples for coverage.
+
+    Args:
+        df: DataFrame to column normalize.
+        method: Which method to use: 'median_of_ratios', 'median', 'upper_quartile' currently
+        accepted.
+
+    Returns: Normalized DataFrame.
+
+    """
     if method == "median_of_ratios":
         return df.divide(df.divide(df.mean(axis=1), axis=0).median())
 
@@ -56,9 +101,9 @@ def column_normalize(df: DataFrame, method: str) -> DataFrame:
     if method == "upper_quartile":
         return df.divide(np.nanquantile(df, 0.75))
 
-    if method == "twocomp_median":
-        pass
-        #TODO make two comp
+    # if method == "quantile":
+    #     pass
+    # TODO: add two-component ("twocomp_median") normalization
 
     raise ValueError(
         'Passed method not valid. Must be one of: median_of_ratios, median, upper_quartile, '
@@ -67,6 +112,14 @@ def column_normalize(df: DataFrame, method: str) -> DataFrame:
 
 
 def read_fasta(fasta_file) -> dict:
+    """Parse fasta into a dictionary.
+
+    Args:
+        fasta_file: path to fasta file.
+
+    Returns: dictionary of genes: seq.
+
+    """
     with open(fasta_file, 'r') as fh:
         aa_seqs = {
             seq.split()[0]: seq.split(']')[-1].replace('\s', '').replace('\n', '')

diff --git a/phosphodisco/visualize.py b/phosphodisco/visualize.py
@@ -19,20 +19,20 @@
 
 
 def compute_order(
-        df,
-        optimal=True,
-        dist_method="euclidean",
-        cluster_method="average"
+        df: DataFrame,
+        optimal: bool = True,
+        dist_method: str="euclidean",
+        cluster_method: str="average"
 ):
-    """
+    """Computes order of rows for clustered heatmaps.
 
     Args:
-        df:
-        optimal:
-        dist_method:
-        cluster_method:
+        df: Data with rows to cluster.
+        optimal: Whether to return optimal ordering. Slower when enabled.
+        dist_method: Which distance calculation to use.
+        cluster_method: Which hierarchical clustering method to use.
 
-    Returns:
+    Returns: Clustered order of rows.
 
     """
     dist_mat = pdist(df, metric=dist_method)
@@ -54,6 +54,24 @@ def visualize_modules(
         heatmap_kws: dict = {},
         file_prefix: str = 'heatmap'
 ):
+    """Makes heatmap figures of sites vs samples for each module.
+
+    Args:
+        data: ProteomicsData object containing AT LEAST normed_phospho data with no missing
+        values (maybe imputed), and modules assigned.
+        annotations: A DataFrame with samples as rows and categorical annotations to visualize as
+        columns. This isn't taken directly from ProteomicsData because that table may have many
+        more columns than anyone wants to visualize.
+        col_cluster: Whether to cluster samples in the heatmaps.
+        row_cluster: Whether to cluster rows in the heatmaps.
+        cluster_kws: Additional keyword args to pass to visualize.compute_order
+        annot_kws: Additional keyword args to pass to catheat.heatmap
+        heatmap_kws: Additional keyword args to pass to sns.heatmap
+        file_prefix: File prefix for each figure. Suffix will be .clusterX.pdf
+
+    Returns: None
+
+    """
 
     cluster_sets = data.modules
     cluster_sets = {
@@ -88,7 +106,7 @@ def visualize_modules(
         fig_len = 0.25*(len(df) + len(header))
         fig_width = 0.15*len(col_order)
 
-        fig = plt.figure(figsize=(fig_width, fig_len))
+        _ = plt.figure(figsize=(fig_width, fig_len))
         gs = plt.GridSpec(
             nrows=3, ncols=2,
             height_ratios=[len(header)]+2*[len(df)/2],
@@ -135,6 +153,18 @@ def visualize_regulator_coefficients(
         savefig_prefix: Optional[str] = None,
         **heatmap_kwargs
 ):
+    """Visualizes the associations between putative regulators and modules.
+
+    Args:
+        data: ProteomicsData object with regulator_coefficients assigned.
+        percentile_cutoff: Heatmaps are filtered to show high associations only. What
+        percentile threshold should be used.
+        savefig_prefix: If prefix is provided, a figure will be saved with this prefix.
+        **heatmap_kwargs: Additional keyword args for sns.heatmap
+
+    Returns: matplotlib ax with heatmap of coefficients
+
+    """
     if data.regulator_coefficients is None:
         raise KeyError(
             'Must calculate regulator coefficients using '
@@ -156,6 +186,18 @@ def visualize_annotation_associations(
         savefig_prefix: Optional[str] = None,
         **heatmap_kwargs
 ):
+    """Visualizes the associations between sample annotations and modules.
+
+    Args:
+        data: ProteomicsData object with annotation_association_FDR assigned.
+        percentile_cutoff: Heatmaps are filtered to show high associations only. What
+        percentile threshold should be used.
+        savefig_prefix: If prefix is provided, a figure will be saved with this prefix.
+        **heatmap_kwargs: Additional keyword args for sns.heatmap
+
+    Returns: matplotlib ax with heatmap of associations
+
+    """
     if data.annotation_association_FDR is None:
         raise KeyError(
             'Must calculate regulator coefficients using '
@@ -171,6 +213,17 @@ def visualize_annotation_associations(
 
 
 def visualize_aa(seq_dfs, save_prefix: Optional[str] = None, **logo_kws):
+    """Draws logos of each amino acid sequence motif for each module. Can pass in either
+    module_aa_freqs or module_aa_enrichment from ProteomicsData objects.
+
+    Args:
+        seq_dfs: Either module_aa_freqs or module_aa_enrichment from ProteomicsData objects.
+        save_prefix: If prefix is provided, a figure will be saved with this prefix.
+        **logo_kws: Additional keyword args for logomaker.Logo
+
+    Returns: None
+
+    """
     for module, ps in seq_dfs.items():
         logo_kws['color_scheme'] = logo_kws.get('color_scheme', 'NajafabadiEtAl2017')
         logo = logomaker.Logo(ps, **logo_kws)
@@ -185,6 +238,17 @@ def visualize_set_enrichment(
         save_prefix: Optional[str] = None,
         **barplot_kws
 ):
+    """Draws barplots for either enrichr or ptm-ssGSEA set enrichments per module.
+
+    Args:
+        module_enrichment_dict: go_enrichment or ptm_enrichment from ProteomicsData objects.
+        pval_cutoff: Adjusted p-value cutoff; only enrichments below it are kept.
+        save_prefix: If prefix is provided, a figure will be saved with this prefix.
+        **barplot_kws: Additional keyword args for sns.barplot
+
+    Returns: None
+
+    """
     barplot_kws['color'] = barplot_kws.get('color', '#BDBDBD')
     for module, df in module_enrichment_dict.items():
         temp = df[df['Adjusted P-value'] < pval_cutoff]