
Merge pull request #6 from liliblu/sanitizing_inputs
Sanitizing inputs
tobsecret authored Apr 26, 2021
2 parents ecf0ab5 + c2f0b2e commit 90460e4
Showing 1 changed file with 83 additions and 1 deletion.
84 changes: 83 additions & 1 deletion phosphodisco/parsers.py
@@ -1,7 +1,7 @@
import pandas as pd
import numpy as np
from pandas import DataFrame
from typing import Optional, Iterable
from typing import Optional, Iterable, Tuple, List


def get_sep(file_path: str) -> str:
@@ -35,6 +35,88 @@ def read_protein(file_path: str) -> DataFrame:
        ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na'], np.nan
    ).astype(float)

def read_gct(path: str,
             index_cols: Optional[List[str]] = None,
             regex: Optional[str] = None,
             sample_cols: Optional[List[str]] = None,
             annotation_rows: Optional[List[str]] = None
             ) -> Tuple[DataFrame, DataFrame]:
"""
Reads in a gct file and formats the dataframe so it's ready for phospho disco
path: path to file.gct
index_cols: columns which to use as an index. For phospho/acetyl, etc this should
be two columns e.g. ['geneSymbol', 'variableSites'] whereas for protein it's one column, e.g. ['geneSymbol']
regex: [optional] regular expression to quickly subselect sample columns e.g.
to get only sample columns that end in a digit: r'.*\d$'
sample_cols: [optional] to select sample columns using exact names; not used if you provided a regex
returns: pd.DataFrame with sample columns and index_cols
"""
    if index_cols is None:
        index_cols = ['geneSymbol', 'variableSites']
    with open(path, 'r') as handle:
        next(handle)
        # the 2nd row of a gct file gives the table dimensions:
        # data rows, sample columns, row-metadata columns, column-metadata rows
        nrows, ncols, nrowmeta, ncolsmeta = [int(i) for i in next(handle).split()]
    df = pd.read_csv(
        path, sep='\t', skiprows=2, low_memory=False
    ).replace(
        ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na'], np.nan
    )
    # The column-metadata table is transposed in the gct file: for the sample table,
    # skip the first ncolsmeta rows and the first nrowmeta - len(index_cols) + 1
    # columns (the id column plus the row-metadata columns not consumed by set_index).
    sample_df = df.set_index(index_cols).iloc[ncolsmeta:, nrowmeta - len(index_cols) + 1:].copy()
    annots_df = df.set_index(df.columns[0]).iloc[:ncolsmeta, nrowmeta:].copy()
    if regex is not None:
        sample_df = sample_df.loc[:, sample_df.columns.str.match(regex)]
        annots_df = annots_df.loc[:, annots_df.columns.str.match(regex)]
    elif sample_cols is not None:
        try:
            sample_df = sample_df.loc[:, sample_cols]
            annots_df = annots_df.loc[:, sample_cols]
        except KeyError:
            non_matched_cols = set(sample_cols).difference(sample_df.columns)
            raise IndexError(
                f"The following columns were not found in the sample columns of the "
                f"provided gct file\npath:\n{path}\nmismatched columns:\n{non_matched_cols}"
            )
    if annotation_rows is not None:
        try:
            annots_df = annots_df.loc[annotation_rows, :]
        except KeyError:
            non_matched_rows = set(annotation_rows).difference(annots_df.index)
            raise IndexError(
                f"The following rows were not found in the annotation rows of the "
                f"provided gct file\npath:\n{path}\nmismatched rows:\n{non_matched_rows}"
            )

return sample_df, annots_df
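
# A minimal usage sketch for read_gct (the file name and sample-column pattern
# below are hypothetical; adjust to your data):
#
#     phospho, sample_annots = read_gct(
#         'phospho.gct',
#         index_cols=['geneSymbol', 'variableSites'],
#         regex=r'.*\d$',  # keep only sample columns ending in a digit
#     )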

def filter_dups(group: pd.DataFrame) -> pd.DataFrame:
    """
    Meant to be used in apply after a groupby call.
    For a set of rows (group), returns the first row whose NaN count equals the
    group minimum, i.e. the row with the fewest NaNs or tied for the fewest.
    group: pd.DataFrame
    """
    nan_counts = group.isnull().sum(axis=1)
    return group.loc[nan_counts == nan_counts.min()].head(1)
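
# Usage sketch: keep one representative row per group, e.g. the row with the
# fewest NaNs per gene symbol ('geneSymbol' as above; df is a hypothetical frame):
#
#     deduped = df.groupby('geneSymbol', group_keys=False).apply(filter_dups)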

# def deduplicate_rows(
# df: pd.DataFrame,
# samples,
# sequence_col = 'sequence',
# maximize_cols = ['bestScore', 'Best_scoreVML', 'bestDeltaForwardReverseScore']
# ) -> pd.DataFrame:

# groupby = df.index.names
# df = df.reset_index()

# df[maximize_cols] = df[maximize_cols].astype(float)
# df['numNAs'] = df[samples].isnull().sum(axis=1)
# if sequence_col:
# df['len'] = df[sequence_col].str.len()
# return df.groupby(groupby).apply(lambda row: row.nsmallest(1, columns=['numNAs','len'], keep='all').nlargest(1, columns=maximize_cols, keep='first')).reset_index(level=-1, drop=True)
# return df.groupby(groupby).apply(lambda row: row.nsmallest(1, columns=['numNAs'], keep='all').nlargest(1, columns=maximize_cols, keep='first')).reset_index(level=-1, drop=True)


def read_annotation(file_path: str) -> DataFrame:
    """Reads in a sample annotation file. Samples as rows, annotations as columns.
