Merge pull request #34 from hgb-bin-proteomics/develop

Use biopython for fasta reading in xiNET and xiVIEW exporters
hgb-bin-proteomics · Jan 21, 2025 · 81222d8
2 parents 389afed + b3c8fb9
commit 81222d8
Show file tree

Hide file tree

Showing 5 changed files with 24 additions and 56 deletions.
diff --git a/README.md b/README.md
@@ -97,7 +97,7 @@ xiNetExporter_msannika.py f [f ...]
                             [--version]
 positional arguments:
   f                     MS Annika crosslink result files in Microsoft Excel
-                        format (.xlsx) to process.
+                        format (.xlsx) to process. Decoys should be excluded!
 required arguments:
   -fasta FASTAFILE, --fasta FASTAFILE
                         Fasta file used for crosslink search. Must contain
@@ -139,7 +139,7 @@ xiViewExporter_msannika.py f [f ...]
                              [--version]
 positional arguments:
   f                     MS Annika crosslink result files in Microsoft Excel
-                        format (.xlsx) to process.
+                        format (.xlsx) to process. Decoys should be excluded!
 required arguments:
   -fasta FASTAFILE, --fasta FASTAFILE
                         Fasta file used for crosslink search. Must contain

diff --git a/binaries/windows/xiNetExporter_msannika.exe b/binaries/windows/xiNetExporter_msannika.exe
diff --git a/binaries/windows/xiViewExporter_msannika.exe b/binaries/windows/xiViewExporter_msannika.exe
diff --git a/xiNetExporter_msannika.py b/xiNetExporter_msannika.py
@@ -7,10 +7,11 @@
 
 import argparse
 import pandas as pd
+from Bio import SeqIO
 from typing import List
 
-__version = "1.0.0"
-__date = "20221018"
+__version = "1.1.0"
+__date = "20250121"
 
 """
 DESCRIPTION:
@@ -24,7 +25,7 @@
                             [--version]
 positional arguments:
   f                     MS Annika crosslink result files in Microsoft Excel
-                        format (.xlsx) to process.
+                        format (.xlsx) to process. Decoys should be excluded!
 required arguments:
   -fasta FASTAFILE, --fasta FASTAFILE
                         Fasta file used for crosslink search. Must contain
@@ -52,20 +53,13 @@ def __init__(self, input_files: List[str], fasta_file: str, ignore_list: List[st
         # read fasta file
         sequences = dict()
 
-        with open(fasta_file, "r", encoding = "utf-8") as f:
-            fasta_data = f.read()
-            f.close()
-
-        for entry in fasta_data.split(">"):
-            lines = entry.split("\n")
-            if lines[0].strip() != "":
-                description = lines[0].strip()
-                identifier = lines[0].strip().split("|")[1].strip()
-                sequence = "".join([x.strip() for x in lines[1:]])
-                if identifier not in sequences:
-                    sequences[identifier] = {"description": description, "sequence": sequence}
-                else:
-                    print("WARNING: Identifier " + identifier + " is not unique!")
+        for entry in SeqIO.parse(fasta_file, "fasta"):
+            identifier = str(entry.id).split("|")[1].strip()
+            sequence = str(entry.seq)
+            if identifier not in sequences:
+                sequences[identifier] = {"sequence": sequence}
+            else:
+                print("WARNING: Identifier " + identifier + " is not unique!")
 
         self.database = sequences
 
@@ -136,25 +130,15 @@ def __generate_csv_df(self) -> pd.DataFrame:
 
         return result
 
-    def __generate_fasta_str(self) -> str:
-        fasta_str = ""
-        for key in self.database:
-            fasta_str = fasta_str + ">" + key + " " + self.database[key]["description"] + "\n" + self.database[key]["sequence"] + "\n"
-        return fasta_str
-
     # export function, takes one argument "output_file" which sets the prefix
     # of generated output files
     def export(self, output_file = None, format = "xiNET") -> None:
         csv = self.__generate_csv_df()
-        fasta = self.__generate_fasta_str()
 
         if output_file == None:
             output_file = self.input_files[0].split(".")[0]
 
         csv.to_csv(output_file + ".csv", index = False)
-        with open(output_file + ".fasta", "w", encoding = "utf-8") as f:
-            f.write(fasta)
-            f.close()
 
 # initialize exporter and export xiNET files
 def main() -> None:

diff --git a/xiViewExporter_msannika.py b/xiViewExporter_msannika.py
@@ -7,10 +7,11 @@
 
 import argparse
 import pandas as pd
+from Bio import SeqIO
 from typing import List
 
-__version = "1.0.0"
-__date = "20221020"
+__version = "1.1.0"
+__date = "20250116"
 
 """
 DESCRIPTION:
@@ -24,7 +25,7 @@
                              [--version]
 positional arguments:
   f                     MS Annika crosslink result files in Microsoft Excel
-                        format (.xlsx) to process.
+                        format (.xlsx) to process. Decoys should be excluded!
 required arguments:
   -fasta FASTAFILE, --fasta FASTAFILE
                         Fasta file used for crosslink search. Must contain
@@ -52,20 +53,13 @@ def __init__(self, input_files: List[str], fasta_file: str, ignore_list: List[st
         # read fasta file
         sequences = dict()
 
-        with open(fasta_file, "r", encoding = "utf-8") as f:
-            fasta_data = f.read()
-            f.close()
-
-        for entry in fasta_data.split(">"):
-            lines = entry.split("\n")
-            if lines[0].strip() != "":
-                description = lines[0].strip()
-                identifier = lines[0].strip().split("|")[1].strip()
-                sequence = "".join([x.strip() for x in lines[1:]])
-                if identifier not in sequences:
-                    sequences[identifier] = {"description": description, "sequence": sequence}
-                else:
-                    print("WARNING: Identifier " + identifier + " is not unique!")
+        for entry in SeqIO.parse(fasta_file, "fasta"):
+            identifier = str(entry.id).split("|")[1].strip()
+            sequence = str(entry.seq)
+            if identifier not in sequences:
+                sequences[identifier] = {"sequence": sequence}
+            else:
+                print("WARNING: Identifier " + identifier + " is not unique!")
 
         self.database = sequences
 
@@ -135,25 +129,15 @@ def __generate_csv_df(self) -> pd.DataFrame:
 
         return result
 
-    def __generate_fasta_str(self) -> str:
-        fasta_str = ""
-        for key in self.database:
-            fasta_str = fasta_str + ">" + self.database[key]["description"] + "\n" + self.database[key]["sequence"] + "\n"
-        return fasta_str
-
     # export function, takes one argument "output_file" which sets the prefix
     # of generated output files
     def export(self, output_file = None, format = "xiVIEW") -> None:
         csv = self.__generate_csv_df()
-        fasta = self.__generate_fasta_str()
 
         if output_file == None:
             output_file = self.input_files[0].split(".")[0]
 
         csv.to_csv(output_file + ".csv", index = False)
-        with open(output_file + ".fasta", "w", encoding = "utf-8") as f:
-            f.write(fasta)
-            f.close()
 
 # initialize exporter and export xiVIEW files
 def main() -> None: