From a4795dab2bd8d60f9b97978fb1949cfb5d68e62a Mon Sep 17 00:00:00 2001 From: Tobias Schraink Date: Wed, 3 Feb 2021 21:27:51 -0500 Subject: [PATCH 1/4] added a read_gct function to parsers.py The function can read in both annotation and the data from a .gct file. It can also filter the sample columns and the metadata columns. --- phosphodisco/parsers.py | 55 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/phosphodisco/parsers.py b/phosphodisco/parsers.py index d94f0bc..cd067d3 100755 --- a/phosphodisco/parsers.py +++ b/phosphodisco/parsers.py @@ -35,6 +35,61 @@ def read_protein(file_path: str) -> DataFrame: ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na'], np.nan ).astype(float) +def read_gct(path: str, + index_cols: list=['geneSymbol', 'variableSites'], + regex: str=None, + sample_cols: list=None, + annotation_rows: list=None + ): + """ + Reads in a gct file and formats the dataframe so it's ready for phospho disco + path: path to file.gct + index_cols: columns which to use as an index. For phospho/acetyl, etc this should + be two columns e.g. ['geneSymbol', 'variableSites'] whereas for protein it's one column, e.g. ['geneSymbol'] + regex: [optional] regular expression to quickly subselect sample columns e.g. 
+ to get only sample columns that end in a digit: r'.*\d$' + sample_cols: [optional] to select sample columns using exact names; not used if you provided a regex + + returns: pd.DataFrame with sample columns and index_cols + """ + with open(path, 'r') as handle: + next(handle) + #the 2nd row of a gct file gives us the dimensions + nrows, ncols, nrowmeta, ncolsmeta = [int(i) for i in next(handle).split()] + df = pd.read_csv( + path, sep='\t', skiprows=2, low_memory=False + ).replace( + ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na'], np.nan + ) + # the metadatatable is transposed in the gct file, hence we are indexing everything but + # the first ncolsmeta rows, and everything but the first nrowsmeta columns + sample_df = df.set_index(index_cols).iloc[ncolsmeta:, nrowmeta-1:].copy() + annots_df = df.set_index(df.columns[0]).iloc[:ncolsmeta, nrowmeta-1:].copy() + if regex is not None: + sample_df = sample_df.loc[:,sample_df.columns.str.match(regex)] + annots_df = annots_df.loc[:,annots_df.columns.str.match(regex)] + elif sample_cols is not None: + try: + sample_df = sample_df.loc[:, sample_cols] + annots_df = annots_df.loc[:, sample_cols] + except KeyError: + non_matched_cols = set(sample_cols).difference(sample_df.columns) + raise IndexError( + f"The following columns were not found in the sample columns of the provided gct file \npath:\n{path}\ + \nmismatched columns:\n{non_matched_cols}" + ) + if annotation_rows is not None: + try: + annots_df = annots_df.loc[annotation_rows, :] + except KeyError: + non_matched_cols = set(annotation_rows).difference(annots_df.index) + raise IndexError( + f"The following columns were not found in the annotation rows columns of the provided gct file \npath:\ + \n{path}\nmismatched columns:\n{non_matched_cols}" + ) + + return sample_df, annots_df + def read_annotation(file_path: str) -> DataFrame: """Reads in sample annotation file. Sample as rows, annotations as columns. 
From 6a9d5847aa074b3cbd876c2fc3eea7871399a311 Mon Sep 17 00:00:00 2001 From: Tobias Schraink Date: Wed, 3 Feb 2021 21:52:58 -0500 Subject: [PATCH 2/4] fixed type annotations on read_gct I just fixed the type annotations of the read_gct function, I think --- phosphodisco/parsers.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/phosphodisco/parsers.py b/phosphodisco/parsers.py index cd067d3..0eaba34 100755 --- a/phosphodisco/parsers.py +++ b/phosphodisco/parsers.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np from pandas import DataFrame -from typing import Optional, Iterable +from typing import Optional, Iterable, Tuple def get_sep(file_path: str) -> str: @@ -36,11 +36,11 @@ def read_protein(file_path: str) -> DataFrame: ).astype(float) def read_gct(path: str, - index_cols: list=['geneSymbol', 'variableSites'], - regex: str=None, - sample_cols: list=None, - annotation_rows: list=None - ): + index_cols: Optional[List[str]]=['geneSymbol', 'variableSites'], + regex: Optional[Union[str, None]]=None, + sample_cols: Optional[Union[List, None]]=None, + annotation_rows: Optional[Union[List, None]]=None + ) -> Tuple[DataFrame, DataFrame]: """ Reads in a gct file and formats the dataframe so it's ready for phospho disco path: path to file.gct @@ -90,7 +90,6 @@ def read_gct(path: str, return sample_df, annots_df - def read_annotation(file_path: str) -> DataFrame: """Reads in sample annotation file. Sample as rows, annotations as columns. From 2b8f2990976a7adaa94268235a4f2c3a58584265 Mon Sep 17 00:00:00 2001 From: Tobias Schraink Date: Fri, 5 Feb 2021 21:33:32 -0500 Subject: [PATCH 3/4] fixed indexing on read_gct The previous commits of the read_gct function were incorrectly indexing the gct file, which is now fixed. 
--- phosphodisco/parsers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/phosphodisco/parsers.py b/phosphodisco/parsers.py index 0eaba34..ac89074 100755 --- a/phosphodisco/parsers.py +++ b/phosphodisco/parsers.py @@ -63,8 +63,8 @@ def read_gct(path: str, ) # the metadatatable is transposed in the gct file, hence we are indexing everything but # the first ncolsmeta rows, and everything but the first nrowsmeta columns - sample_df = df.set_index(index_cols).iloc[ncolsmeta:, nrowmeta-1:].copy() - annots_df = df.set_index(df.columns[0]).iloc[:ncolsmeta, nrowmeta-1:].copy() + sample_df = df.set_index(index_cols).iloc[ncolsmeta:, nrowmeta-len(index_cols)+1:].copy() + annots_df = df.set_index(df.columns[0]).iloc[:ncolsmeta, nrowmeta:].copy() if regex is not None: sample_df = sample_df.loc[:,sample_df.columns.str.match(regex)] annots_df = annots_df.loc[:,annots_df.columns.str.match(regex)] @@ -90,6 +90,8 @@ def read_gct(path: str, return sample_df, annots_df + + def read_annotation(file_path: str) -> DataFrame: """Reads in sample annotation file. Sample as rows, annotations as columns. From c2f0b2e001e1b23e8e829a441786e535309959e3 Mon Sep 17 00:00:00 2001 From: Tobias Schraink Date: Fri, 5 Feb 2021 21:52:56 -0500 Subject: [PATCH 4/4] fixed Type hints on read_gct, added filter_dups I had forgotten to import List and Union. Added filter_dups, which keeps the row with the fewest missing values from each group of duplicate rows.
--- phosphodisco/parsers.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/phosphodisco/parsers.py b/phosphodisco/parsers.py index ac89074..f6e2bc8 100755 --- a/phosphodisco/parsers.py +++ b/phosphodisco/parsers.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np from pandas import DataFrame -from typing import Optional, Iterable, Tuple +from typing import Optional, Iterable, Tuple, List, Union def get_sep(file_path: str) -> str: @@ -90,6 +90,32 @@ def read_gct(path: str, return sample_df, annots_df +def filter_dups(group:pd.DataFrame): + """ + Meant to be used in apply after a groupby call. + For a set of rows (group) find the one with the lowest number of NaNs or tied for the lowest number. + group: pd.DataFrame + """ + nan_counts = group.apply(lambda r: r.isnull().sum(), axis=1) + min_nan_counts = nan_counts.min() + return group.loc[nan_counts == min_nan_counts].head(1) + +# def deduplicate_rows( +# df: pd.DataFrame, +# samples, +# sequence_col = 'sequence', +# maximize_cols = ['bestScore', 'Best_scoreVML', 'bestDeltaForwardReverseScore'] +# ) -> pd.DataFrame: + +# groupby = df.index.names +# df = df.reset_index() + +# df[maximize_cols] = df[maximize_cols].astype(float) +# df['numNAs'] = df[samples].isnull().sum(axis=1) +# if sequence_col: +# df['len'] = df[sequence_col].str.len() +# return df.groupby(groupby).apply(lambda row: row.nsmallest(1, columns=['numNAs','len'], keep='all').nlargest(1, columns=maximize_cols, keep='first')).reset_index(level=-1, drop=True) +# return df.groupby(groupby).apply(lambda row: row.nsmallest(1, columns=['numNAs'], keep='all').nlargest(1, columns=maximize_cols, keep='first')).reset_index(level=-1, drop=True) def read_annotation(file_path: str) -> DataFrame: