From a4795dab2bd8d60f9b97978fb1949cfb5d68e62a Mon Sep 17 00:00:00 2001
From: Tobias Schraink <tobi.schraink@gmail.com>
Date: Wed, 3 Feb 2021 21:27:51 -0500
Subject: [PATCH 1/4] added a read_gct function to parsers.py. The function can
 read in both the annotations and the data from a .gct file. It can also filter
 the sample columns and the annotation (metadata) rows.

---
 phosphodisco/parsers.py | 55 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/phosphodisco/parsers.py b/phosphodisco/parsers.py
index d94f0bc..cd067d3 100755
--- a/phosphodisco/parsers.py
+++ b/phosphodisco/parsers.py
@@ -35,6 +35,61 @@ def read_protein(file_path: str) -> DataFrame:
         ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na'], np.nan
     ).astype(float)
 
+def read_gct(path: str, 
+             index_cols: list=['geneSymbol', 'variableSites'], 
+             regex: str=None, 
+             sample_cols: list=None, 
+             annotation_rows: list=None
+            ):
+    """
+    Reads in a gct file and formats the dataframe so it's ready for phospho disco
+    path: path to file.gct
+    index_cols:  columns to use as the index. For phospho/acetyl, etc. this should
+                 be two columns, e.g. ['geneSymbol', 'variableSites'], whereas for protein it's one column, e.g. ['geneSymbol']
+    regex:       [optional] regular expression to quickly subselect sample columns, e.g.
+                 to get only sample columns that end in a digit: r'.*\d$'
+    sample_cols: [optional] to select sample columns using exact names; ignored if a regex is provided
+    annotation_rows: [optional] names of the annotation (metadata) rows to keep; all rows are kept if None
+    returns: tuple (sample_df, annots_df): the sample data indexed by index_cols, and the sample annotation rows
+    """
+    with open(path, 'r') as handle:
+        next(handle)
+        #the 2nd row of a gct file gives us the dimensions
+        nrows, ncols, nrowmeta, ncolsmeta = [int(i) for i in next(handle).split()] 
+    df = pd.read_csv(
+        path, sep='\t', skiprows=2, low_memory=False
+    ).replace(
+         ['na', 'NA', 'NAN', 'nan', 'NaN', 'Na'], np.nan
+    )
+    # the metadatatable is transposed in the gct file, hence we are indexing everything but 
+    # the first ncolsmeta rows, and everything but the first nrowsmeta columns
+    sample_df = df.set_index(index_cols).iloc[ncolsmeta:, nrowmeta-1:].copy()
+    annots_df = df.set_index(df.columns[0]).iloc[:ncolsmeta, nrowmeta-1:].copy()
+    if regex is not None:
+        sample_df = sample_df.loc[:,sample_df.columns.str.match(regex)]
+        annots_df = annots_df.loc[:,annots_df.columns.str.match(regex)]
+    elif sample_cols is not None:
+        try:
+            sample_df = sample_df.loc[:, sample_cols]
+            annots_df = annots_df.loc[:, sample_cols]
+        except KeyError:
+            non_matched_cols = set(sample_cols).difference(sample_df.columns)
+            raise IndexError(
+                f"The following columns were not found in the sample columns of the provided gct file \npath:\n{path}\
+                \nmismatched columns:\n{non_matched_cols}"
+            )
+    if annotation_rows is not None:
+        try:
+            annots_df = annots_df.loc[annotation_rows, :]
+        except KeyError:
+            non_matched_cols = set(annotation_rows).difference(annots_df.index)
+            raise IndexError(
+                f"The following columns were not found in the annotation rows columns of the provided gct file \npath:\
+                \n{path}\nmismatched columns:\n{non_matched_cols}"
+            )
+
+    return sample_df, annots_df
+
 
 def read_annotation(file_path: str) -> DataFrame:
     """Reads in sample annotation file. Sample as rows, annotations as columns.

From 6a9d5847aa074b3cbd876c2fc3eea7871399a311 Mon Sep 17 00:00:00 2001
From: Tobias Schraink <tobi.schraink@gmail.com>
Date: Wed, 3 Feb 2021 21:52:58 -0500
Subject: [PATCH 2/4] fixed type annotations on read_gct. Replaced the bare
 list/str annotations with typing generics and added the return annotation.

---
 phosphodisco/parsers.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/phosphodisco/parsers.py b/phosphodisco/parsers.py
index cd067d3..0eaba34 100755
--- a/phosphodisco/parsers.py
+++ b/phosphodisco/parsers.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 from pandas import DataFrame
-from typing import Optional, Iterable
+from typing import Optional, Iterable, Tuple
 
 
 def get_sep(file_path: str) -> str:
@@ -36,11 +36,11 @@ def read_protein(file_path: str) -> DataFrame:
     ).astype(float)
 
 def read_gct(path: str, 
-             index_cols: list=['geneSymbol', 'variableSites'], 
-             regex: str=None, 
-             sample_cols: list=None, 
-             annotation_rows: list=None
-            ):
+             index_cols: Optional[List[str]]=['geneSymbol', 'variableSites'], 
+             regex: Optional[Union[str, None]]=None, 
+             sample_cols: Optional[Union[List, None]]=None, 
+             annotation_rows: Optional[Union[List, None]]=None
+            ) -> Tuple[DataFrame, DataFrame]:
     """
     Reads in a gct file and formats the dataframe so it's ready for phospho disco
     path: path to file.gct
@@ -90,7 +90,6 @@ def read_gct(path: str,
 
     return sample_df, annots_df
 
-
 def read_annotation(file_path: str) -> DataFrame:
     """Reads in sample annotation file. Sample as rows, annotations as columns.
 

From 2b8f2990976a7adaa94268235a4f2c3a58584265 Mon Sep 17 00:00:00 2001
From: Tobias Schraink <tobi.schraink@gmail.com>
Date: Fri, 5 Feb 2021 21:33:32 -0500
Subject: [PATCH 3/4] fixed indexing on read_gct. The previous commits of the
 read_gct function were incorrectly indexing the gct file, which is now fixed.

---
 phosphodisco/parsers.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/phosphodisco/parsers.py b/phosphodisco/parsers.py
index 0eaba34..ac89074 100755
--- a/phosphodisco/parsers.py
+++ b/phosphodisco/parsers.py
@@ -63,8 +63,8 @@ def read_gct(path: str,
     )
     # the metadatatable is transposed in the gct file, hence we are indexing everything but 
     # the first ncolsmeta rows, and everything but the first nrowsmeta columns
-    sample_df = df.set_index(index_cols).iloc[ncolsmeta:, nrowmeta-1:].copy()
-    annots_df = df.set_index(df.columns[0]).iloc[:ncolsmeta, nrowmeta-1:].copy()
+    sample_df = df.set_index(index_cols).iloc[ncolsmeta:, nrowmeta-len(index_cols)+1:].copy()
+    annots_df = df.set_index(df.columns[0]).iloc[:ncolsmeta, nrowmeta:].copy()
     if regex is not None:
         sample_df = sample_df.loc[:,sample_df.columns.str.match(regex)]
         annots_df = annots_df.loc[:,annots_df.columns.str.match(regex)]
@@ -90,6 +90,8 @@ def read_gct(path: str,
 
     return sample_df, annots_df
 
+
+
 def read_annotation(file_path: str) -> DataFrame:
     """Reads in sample annotation file. Sample as rows, annotations as columns.
 

From c2f0b2e001e1b23e8e829a441786e535309959e3 Mon Sep 17 00:00:00 2001
From: Tobias Schraink <tobi.schraink@gmail.com>
Date: Fri, 5 Feb 2021 21:52:56 -0500
Subject: [PATCH 4/4] fixed type hints on read_gct, added filter_dups. I had
 forgotten to import List and Union. Added filter_dups, which is used to reduce
 duplicate rows to the one with the fewest missing values.

---
 phosphodisco/parsers.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/phosphodisco/parsers.py b/phosphodisco/parsers.py
index ac89074..f6e2bc8 100755
--- a/phosphodisco/parsers.py
+++ b/phosphodisco/parsers.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 from pandas import DataFrame
-from typing import Optional, Iterable, Tuple
+from typing import Optional, Iterable, Tuple, List, Union
 
 
 def get_sep(file_path: str) -> str:
@@ -90,6 +90,32 @@ def read_gct(path: str,
 
     return sample_df, annots_df
 
+def filter_dups(group:pd.DataFrame):
+    """
+    Meant to be used in apply after a groupby call.
+    For a set of rows (group), return the row with the fewest NaNs; if several rows tie, the first one is kept.
+    group: pd.DataFrame
+    """
+    nan_counts = group.apply(lambda r: r.isnull().sum(), axis=1)
+    min_nan_counts = nan_counts.min()
+    return group.loc[nan_counts == min_nan_counts].head(1)
+
+# def deduplicate_rows(
+#     df: pd.DataFrame,
+#     samples,
+#     sequence_col = 'sequence',
+#     maximize_cols = ['bestScore', 'Best_scoreVML', 'bestDeltaForwardReverseScore']
+#     ) -> pd.DataFrame:
+
+#     groupby = df.index.names
+#     df = df.reset_index()
+
+#     df[maximize_cols] = df[maximize_cols].astype(float)
+#     df['numNAs'] = df[samples].isnull().sum(axis=1)
+#     if sequence_col:
+#         df['len'] = df[sequence_col].str.len()
+#         return df.groupby(groupby).apply(lambda row: row.nsmallest(1, columns=['numNAs','len'], keep='all').nlargest(1, columns=maximize_cols, keep='first')).reset_index(level=-1, drop=True)
+#     return df.groupby(groupby).apply(lambda row: row.nsmallest(1, columns=['numNAs'], keep='all').nlargest(1, columns=maximize_cols, keep='first')).reset_index(level=-1, drop=True)
 
 
 def read_annotation(file_path: str) -> DataFrame: