Skip to content

Commit

Permalink
adding docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
liliblu committed May 20, 2020
1 parent 64b0683 commit 6d8f2ee
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 18 deletions.
2 changes: 1 addition & 1 deletion phosphodisco/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,7 @@ def analyze_aa_sequences(
Returns: self with module_aa_freqs and module_aa_enrichment attributes.
"""
self.module_freqs = {
self.module_aa_freqs = {
module: pd.DataFrame([Counter(tup) for tup in list(zip(*aas))])
for module, aas in self.module_sequences.items()
}
Expand Down
65 changes: 59 additions & 6 deletions phosphodisco/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@


def get_sep(file_path: str) -> str:
"""
"""Figure out the sep based on file name. Only helps with tsv and csv.
Args:
file_path:
file_path: Path of file.
Returns:
Returns: sep
"""
if file_path[-4:] == '.tsv':
Expand All @@ -21,32 +21,77 @@ def get_sep(file_path: str) -> str:


def read_protein(file_path: str) -> DataFrame:
    """Load a protein abundance table from a csv or tsv file.

    The first column of the file becomes the row index (the protein
    identifier); every other column is treated as a sample.

    Args:
        file_path: Path to protein csv or tsv. The delimiter is picked by
            get_sep from the file name.

    Returns: DataFrame of floats with proteins as rows, samples as columns.
        Cells equal to one of the textual missing-value markers listed below
        are converted to np.nan before the float cast.
    """
    # Spellings of "missing" that should end up as real NaN values.
    missing_markers = ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na']
    delimiter = get_sep(file_path)
    table = pd.read_csv(file_path, sep=delimiter, index_col=0)
    cleaned = table.replace(missing_markers, np.nan)
    return cleaned.astype(float)


def read_annotation(file_path: str) -> DataFrame:
    """Load a sample annotation table from a csv or tsv file.

    The first column of the file becomes the row index (the sample
    identifier); every other column is an annotation. Unlike the abundance
    readers, values are not cast to float, so annotation columns keep
    whatever dtype pandas infers.

    Args:
        file_path: Path to annotation csv or tsv. First column must be the
            sample identifier. The delimiter is picked by get_sep from the
            file name.

    Returns: DataFrame with samples as rows, annotations as columns. Cells
        equal to one of the textual missing-value markers listed below are
        replaced with np.nan.
    """
    # Spellings of "missing" that should end up as real NaN values.
    missing_markers = ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na']
    delimiter = get_sep(file_path)
    table = pd.read_csv(file_path, sep=delimiter, index_col=0)
    return table.replace(missing_markers, np.nan)


def read_phospho(file_path: str) -> Optional[DataFrame]:
    """Load a phosphosite abundance table from a csv or tsv file.

    The first two columns of the file must be the protein and variable site
    identifiers, respectively; together they become a two-level row index.
    Every other column is treated as a sample. Can be used for raw or
    normalized phospho data tables.

    Args:
        file_path: Path to phospho csv or tsv. The delimiter is picked by
            get_sep from the file name.

    Returns: DataFrame of floats with phosphosites as rows, samples as
        columns. Cells equal to one of the textual missing-value markers
        listed below are converted to np.nan before the float cast.
    """
    # Spellings of "missing" that should end up as real NaN values.
    missing_markers = ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na']
    delimiter = get_sep(file_path)
    table = pd.read_csv(file_path, sep=delimiter, index_col=[0, 1])
    cleaned = table.replace(missing_markers, np.nan)
    return cleaned.astype(float)


def read_list(file_path: str):
    """Read a newline-separated text file into a list of strings.

    Args:
        file_path: Path to file.

    Returns: List with one entry per line of the file, each stripped of
        leading and trailing whitespace. Blank lines are kept as empty
        strings.
    """
    with open(file_path, 'r') as handle:
        # Iterating the handle yields the same lines readlines() would.
        return [line.strip() for line in handle]


def column_normalize(df: DataFrame, method: str) -> DataFrame:
"""Normalizes samples for coverage.
Args:
df: DataFrame to column normalize.
method: Which method to use: 'median_of_ratios', 'median', 'upper_quartile' currently
accepted.
Returns: Normalized DataFrame.
"""
if method == "median_of_ratios":
return df.divide(df.divide(df.mean(axis=1), axis=0).median())

Expand All @@ -56,9 +101,9 @@ def column_normalize(df: DataFrame, method: str) -> DataFrame:
if method == "upper_quartile":
return df.divide(np.nanquantile(df, 0.75))

if method == "twocomp_median":
pass
#TODO make two comp
# if method == "quantile":
# pass
#TODO add two comp

raise ValueError(
'Passed method not valid. Must be one of: median_of_ratios, median, upper_quartile, '
Expand All @@ -67,6 +112,14 @@ def column_normalize(df: DataFrame, method: str) -> DataFrame:


def read_fasta(fasta_file) -> dict:
"""Parse fasta into a dictionary.
Args:
fasta_file: path to fasta file.
Returns: dictionary of genes: seq.
"""
with open(fasta_file, 'r') as fh:
aa_seqs = {
seq.split()[0]: seq.split(']')[-1].replace('\s', '').replace('\n', '')
Expand Down
86 changes: 75 additions & 11 deletions phosphodisco/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,20 @@


def compute_order(
df,
optimal=True,
dist_method="euclidean",
cluster_method="average"
df: DataFrame,
optimal: bool = True,
dist_method: str="euclidean",
cluster_method: str="average"
):
"""
"""Computes order of samples for clustered heatmaps.
Args:
df:
optimal:
dist_method:
cluster_method:
df: Data with rows to cluster.
optimal: Whether to return optimal ordering. Slows stuff down.
dist_method: Which distance calculation to use.
cluster_method: Which hierarchical clustering method to use.
Returns:
Returns: Clustered order of rows.
"""
dist_mat = pdist(df, metric=dist_method)
Expand All @@ -54,6 +54,24 @@ def visualize_modules(
heatmap_kws: dict = {},
file_prefix: str = 'heatmap'
):
"""Makes heatmap figures of sites vs samples for each module.
Args:
data: ProteomicsData object containing AT LEAST normed_phospho data with no missing
values (maybe imputed), and modules assigned.
annotations: A DataFrame with samples as rows and categorical annotations to visualize as
columns. This isn't taken directly from ProteomicsData because that table may have many
more columns than anyone wants to visualize.
col_cluster: Whether to cluster samples in the heatmaps.
row_cluster: Whether to cluster rows in the heatmaps.
cluster_kws: Additional keyword args to pass to visualize.compute_order
annot_kws: Additional keyword args to pass to catheat.heatmap
heatmap_kws: Additional keyword args to pass to sns.heatmap
file_prefix: File prefix for each figure. Suffix will be .clusterX.pdf
Returns: None
"""

cluster_sets = data.modules
cluster_sets = {
Expand Down Expand Up @@ -88,7 +106,7 @@ def visualize_modules(
fig_len = 0.25*(len(df) + len(header))
fig_width = 0.15*len(col_order)

fig = plt.figure(figsize=(fig_width, fig_len))
_ = plt.figure(figsize=(fig_width, fig_len))
gs = plt.GridSpec(
nrows=3, ncols=2,
height_ratios=[len(header)]+2*[len(df)/2],
Expand Down Expand Up @@ -135,6 +153,18 @@ def visualize_regulator_coefficients(
savefig_prefix: Optional[str] = None,
**heatmap_kwargs
):
"""Visualizes the associations between putative regulators and modules.
Args:
data: ProteomicsData object with regulator_coefficients assigned.
percentile_cutoff: Heatmaps are filtered to show high associations only. What threshold
should be used.
savefig_prefix: If prefix is provided, a figure will be saved with this prefix.
**heatmap_kwargs: Additional keyword args for sns.heatmap
Returns: matplotlib ax with heatmap of coefficients
"""
if data.regulator_coefficients is None:
raise KeyError(
'Must calculate regulator coefficients using '
Expand All @@ -156,6 +186,18 @@ def visualize_annotation_associations(
savefig_prefix: Optional[str] = None,
**heatmap_kwargs
):
"""Visualizes the associations between sample annotations and modules.
Args:
data: ProteomicsData object with annotation_association_FDR assigned.
percentile_cutoff: Heatmaps are filtered to show high associations only. What
percentile threshold should be used.
savefig_prefix: If prefix is provided, a figure will be saved with this prefix.
**heatmap_kwargs: Additional keyword args for sns.heatmap
Returns: matplotlib ax with heatmap of associations
"""
if data.annotation_association_FDR is None:
raise KeyError(
'Must calculate regulator coefficients using '
Expand All @@ -171,6 +213,17 @@ def visualize_annotation_associations(


def visualize_aa(seq_dfs, save_prefix: Optional[str] = None, **logo_kws):
"""Draws logos of each amino acid sequence motif for each module. Can pass in either
module_aa_freqs or module_aa_enrichment from ProteomicsData objects.
Args:
seq_dfs: Either module_aa_freqs or module_aa_enrichment from ProteomicsData objects.
save_prefix: If prefix is provided, a figure will be saved with this prefix.
**logo_kws: Additional keyword args for logomaker.Logo
Returns: None
"""
for module, ps in seq_dfs.items():
logo_kws['color_scheme'] = logo_kws.get('color_scheme', 'NajafabadiEtAl2017')
logo = logomaker.Logo(ps, **logo_kws)
Expand All @@ -185,6 +238,17 @@ def visualize_set_enrichment(
save_prefix: Optional[str] = None,
**barplot_kws
):
"""Draws barplots for either enrichr or ptm-ssGSEA set enrichments per module.
Args:
module_enrichment_dict: go_enrichment or ptm_enrichment from ProteomicsData objects.
pval_cutoff: p-val cut off to filter for significant enrichments.
save_prefix: If prefix is provided, a figure will be saved with this prefix.
**barplot_kws: Additional keyword args for sns.barplot
Returns: None
"""
barplot_kws['color'] = barplot_kws.get('color', '#BDBDBD')
for module, df in module_enrichment_dict.items():
temp = df[df['Adjusted P-value'] < pval_cutoff]
Expand Down

0 comments on commit 6d8f2ee

Please sign in to comment.