Add functionality to input query lengths to dali.

jnoms · Jun 4, 2024 · 17377e9 · 17377e9
1 parent 0ea4a15
commit 17377e9
Show file tree

Hide file tree

Showing 3 changed files with 57 additions and 5 deletions.
diff --git a/sat/sat.py b/sat/sat.py
@@ -1802,6 +1802,27 @@ def main():
         will be left blank.
         """,
     )
+    parser_aln_parse_dali.add_argument(
+        "-L",
+        "--qlen_sheet",
+        type=str,
+        required=False,
+        default="",
+        help="""
+        DALI doesn't report query len. Thus, if there is no self alignment, it is
+        impossible to determine query len. This allows you to input a file of proteins
+        and their associated length.
+        
+        Path to a file containing protein names and their associated amino acid sequence
+        lengths. The first column should be the protein name, second column the length,
+        and the file should be tab-delimited with one protein per line. If a protein's
+        length is not present here, the script can determine qlen if there is a self
+        alignment in the alignment file. Otherwise, qlen will be kept as 0.
+
+        In this file, each file name can be the basename (with no .fasta or .pdb),
+        or you can keep the .pdb or .fasta suffix.
+        """,
+    )
     parser_aln_parse_dali.set_defaults(func=call_aln_parse_dali_main)
 
     # -------------------------------------------------------------------------------- #

diff --git a/sat/scripts/aln_parse_dali.py b/sat/scripts/aln_parse_dali.py
@@ -1,4 +1,4 @@
-from .utils.misc import talk_to_me, make_output_dir
+from .utils.misc import talk_to_me, make_output_dir, remove_pdb_and_fasta_suffix
 
 
 def parse_structure_key(structure_key_file, delim=",,", existing_dict=dict()):
@@ -53,7 +53,6 @@ def aln_parse_dali_main(args):
     talk_to_me("Parsing the alignment file.")
     query_id = ""
     query = ""
-    qlen = 0
     out = [
         "query",
         "target",
@@ -69,6 +68,18 @@ def aln_parse_dali_main(args):
     ]
     out = "\t".join(out) + "\n"
 
+    # Parse the qlen sheet, if specified. Here, users can add a tab delimited
+    # file with protein[tab]length. Of course, each DALI file will have a single query
+    # so a single query length, but this lets you input many at ones.
+    len_dict = dict()
+    if args.qlen_sheet != "":
+        talk_to_me("Parsing the qlen sheet.")
+        with open(args.qlen_sheet) as infile:
+            for line in infile:
+                line = line.rstrip("\n").split("\t")
+                query_base = remove_pdb_and_fasta_suffix(line[0])
+                len_dict[query_base] = line[1]
+
     with open(args.alignment_file) as infile:
 
         for line in infile:
@@ -97,6 +108,7 @@ def aln_parse_dali_main(args):
                         msg = f"Cannot find the query_id, {query_id}, in structure_key!"
                         msg += " Continuing anyway."
                         talk_to_me(msg)
+                query = remove_pdb_and_fasta_suffix(query)
                 continue
 
             # Find the colnames and the other header row
@@ -117,7 +129,7 @@ def aln_parse_dali_main(args):
             # Removing the chain identifier (-A for chain A, etc)
             target_id = chain[:-2]
             try:
-                target = structure_key[target_id]
+                target = remove_pdb_and_fasta_suffix(structure_key[target_id])
             except KeyError:
                 msg = f"Cannot find the target_id, {target_id}, in structure_key!"
                 msg += " Continuing anyway."
@@ -130,9 +142,16 @@ def aln_parse_dali_main(args):
             tlen = float(tlen)
             pident = float(pident)
 
-            # Update qlen if we get a self alignment
-            if query_id == target_id:
+            # Solve for qlen.
+            # ------------------------#
+            # If there is a qlen_sheet, lets look up the qlen. Either way, we
+            # Will check for self alignments afterwards.
+            if query in len_dict:
+                qlen = int(len_dict[query])
+            elif query_id == target_id:
                 qlen = tlen
+            else:
+                qlen = 0
 
             # Generate a "cov" column that is alnlen/(max(qlen, tlen)). If there is no
             # labeled qlen, it will just end up alnlen/tlen.

diff --git a/sat/scripts/utils/misc.py b/sat/scripts/utils/misc.py
@@ -45,6 +45,18 @@ def arg_str2bool(v):
         raise argparse.ArgumentTypeError("Boolean value expected.")
 
 
+def remove_pdb_and_fasta_suffix(name: str) -> str:
+    """
+    Removes .pdb or .fasta from a file name.
+
+    Examples:
+    hahaha.ererere.fasta --> hahaha.ererere
+    hahaha.ererere.pdb --> hahaha.ererere
+    hahaha.ererere --> hahaha.ererere
+    """
+    return name.rstrip(".fasta").rstrip(".pdb")
+
+
 # ------------------------------------------------------------------------------------ #
 # Read/write
 # ------------------------------------------------------------------------------------ #