From c2f0b2e001e1b23e8e829a441786e535309959e3 Mon Sep 17 00:00:00 2001
From: Tobias Schraink <tobi.schraink@gmail.com>
Date: Fri, 5 Feb 2021 21:52:56 -0500
Subject: [PATCH] fixed type hints on read_gct, added filter_dups. I had
 forgotten to import List and Union. Added filter_dups, which filters
 duplicate rows, keeping the one with the fewest missing values.

---
 phosphodisco/parsers.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/phosphodisco/parsers.py b/phosphodisco/parsers.py
index ac89074..f6e2bc8 100755
--- a/phosphodisco/parsers.py
+++ b/phosphodisco/parsers.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 from pandas import DataFrame
-from typing import Optional, Iterable, Tuple
+from typing import Optional, Iterable, Tuple, List, Union
 
 
 def get_sep(file_path: str) -> str:
@@ -90,6 +90,32 @@ def read_gct(path: str,
 
     return sample_df, annots_df
 
+def filter_dups(group: pd.DataFrame) -> pd.DataFrame:
+    """Return the first row of ``group`` that has the fewest NaN values.
+
+    Meant to be used in ``apply`` after a ``groupby`` call. Rows tied for the
+    lowest NaN count are resolved by row order: the first one is kept, and the
+    result is a DataFrame (not a Series) holding at most that one row.
+    """
+    nan_counts = group.isnull().sum(axis=1)  # vectorized per-row NaN count
+    return group.loc[nan_counts == nan_counts.min()].head(1)
+
+# def deduplicate_rows(
+#     df: pd.DataFrame,
+#     samples,
+#     sequence_col = 'sequence',
+#     maximize_cols = ['bestScore', 'Best_scoreVML', 'bestDeltaForwardReverseScore']
+#     ) -> pd.DataFrame:
+
+#     groupby = df.index.names
+#     df = df.reset_index()
+
+#     df[maximize_cols] = df[maximize_cols].astype(float)
+#     df['numNAs'] = df[samples].isnull().sum(axis=1)
+#     if sequence_col:
+#         df['len'] = df[sequence_col].str.len()
+#         return df.groupby(groupby).apply(lambda row: row.nsmallest(1, columns=['numNAs','len'], keep='all').nlargest(1, columns=maximize_cols, keep='first')).reset_index(level=-1, drop=True)
+#     return df.groupby(groupby).apply(lambda row: row.nsmallest(1, columns=['numNAs'], keep='all').nlargest(1, columns=maximize_cols, keep='first')).reset_index(level=-1, drop=True)
 
 
 def read_annotation(file_path: str) -> DataFrame: