Skip to content

Commit

Permalink
Merge pull request #3 from hgb-bin-proteomics/develop
Browse files Browse the repository at this point in the history
Optimize FDR calculation
  • Loading branch information
michabirklbauer authored Jan 24, 2024
2 parents bb18f34 + 5f5871d commit fd6e075
Showing 1 changed file with 20 additions and 24 deletions.
44 changes: 20 additions & 24 deletions msannika_fdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
# micha.birklbauer@gmail.com

# version tracking
__version = "1.0.0"
__date = "2024-01-09"
__version = "1.1.0"
__date = "2024-01-24"

# REQUIREMENTS
# pip install numpy
# pip install pandas
# pip install openpyxl

Expand Down Expand Up @@ -41,6 +42,7 @@
######################

import argparse
import numpy as np
import pandas as pd

from typing import List
Expand Down Expand Up @@ -138,20 +140,17 @@ class MSAnnika_CSM_Validator:
def get_class(row: pd.Series) -> str:
    """Classify a CSM row as "Decoy" or "Target".

    The row is a decoy as soon as its "Alpha T/D" entry or, failing that,
    its "Beta T/D" entry contains the letter "D"; otherwise it is a target.
    """
    for column in ("Alpha T/D", "Beta T/D"):
        if "D" in row[column]:
            return "Decoy"
    return "Target"

@staticmethod
def get_fdr(data: pd.DataFrame, score: float) -> float:

df = data[data["Combined Score"] > score].copy()
df["Class"] = df.apply(lambda row: MSAnnika_CSM_Validator.get_class(row), axis = 1)

return df[df["Class"] == "Decoy"].shape[0] / df[df["Class"] == "Target"].shape[0]

@staticmethod
def get_cutoff(data: pd.DataFrame, fdr: float) -> float:

data["Class"] = data.apply(lambda row: MSAnnika_CSM_Validator.get_class(row), axis = 1)
data["Class_label"] = data.apply(lambda row: 0 if row["Class"] == "Target" else 1, axis = 1)
labels = data["Class_label"].to_numpy()
labels_sorted = labels[data["Combined Score"].to_numpy().argsort()]

scores = sorted(data["Combined Score"].tolist())
for score in scores:
if MSAnnika_CSM_Validator.get_fdr(data, score) < fdr:
for i, score in enumerate(scores):
if labels_sorted[i:].sum() / (labels_sorted[i:].shape[0] - labels_sorted[i:].sum()) < fdr:
return score

return scores[0]
Expand All @@ -160,7 +159,7 @@ def get_cutoff(data: pd.DataFrame, fdr: float) -> float:
def validate(data: pd.DataFrame, fdr: float) -> pd.DataFrame:

cutoff = MSAnnika_CSM_Validator.get_cutoff(data, fdr)
df = data[data["Combined Score"] > cutoff].copy()
df = data[data["Combined Score"] >= cutoff].copy()

if "Confidence" not in df.columns:
return df
Expand All @@ -175,20 +174,17 @@ class MSAnnika_Crosslink_Validator:
def get_class(row: pd.Series) -> str:
    """Classify a crosslink row as "Decoy" or "Target".

    The decision is the truthiness of the row's "Decoy" entry.
    """
    if row["Decoy"]:
        return "Decoy"
    return "Target"

@staticmethod
def get_fdr(data: pd.DataFrame, score: float) -> float:

df = data[data["Best CSM Score"] > score].copy()
df["Class"] = df.apply(lambda row: MSAnnika_Crosslink_Validator.get_class(row), axis = 1)

return df[df["Class"] == "Decoy"].shape[0] / df[df["Class"] == "Target"].shape[0]

@staticmethod
def get_cutoff(data: pd.DataFrame, fdr: float) -> float:

data["Class"] = data.apply(lambda row: MSAnnika_Crosslink_Validator.get_class(row), axis = 1)
data["Class_label"] = data.apply(lambda row: 0 if row["Class"] == "Target" else 1, axis = 1)
labels = data["Class_label"].to_numpy()
labels_sorted = labels[data["Best CSM Score"].to_numpy().argsort()]

scores = sorted(data["Best CSM Score"].tolist())
for score in scores:
if MSAnnika_Crosslink_Validator.get_fdr(data, score) < fdr:
for i, score in enumerate(scores):
if labels_sorted[i:].sum() / (labels_sorted[i:].shape[0] - labels_sorted[i:].sum()) < fdr:
return score

return scores[0]
Expand All @@ -197,7 +193,7 @@ def get_cutoff(data: pd.DataFrame, fdr: float) -> float:
def validate(data: pd.DataFrame, fdr: float) -> pd.DataFrame:
    """Return the crosslinks that pass the score cutoff for the given FDR.

    Args:
        data: Crosslink table with a "Best CSM Score" column, passed on to
            ``MSAnnika_Crosslink_Validator.get_cutoff``.
        fdr: FDR threshold forwarded to ``get_cutoff``.

    Returns:
        A copy of the rows whose "Best CSM Score" is at or above the cutoff,
        with the "Confidence" column set to "High" for every row.
    """
    cutoff = MSAnnika_Crosslink_Validator.get_cutoff(data, fdr)

    # Inclusive filter only; the superseded strict ">" filter is dropped.
    df = data[data["Best CSM Score"] >= cutoff].copy()

    # A scalar is broadcast to every row; no per-row apply is needed.
    df["Confidence"] = "High"

    return df
Expand Down

0 comments on commit fd6e075

Please sign in to comment.