Skip to content

Commit

Permalink
Add functionality to input query lengths to dali.
Browse files Browse the repository at this point in the history
  • Loading branch information
jnoms committed Jun 4, 2024
1 parent 0ea4a15 commit 17377e9
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 5 deletions.
21 changes: 21 additions & 0 deletions sat/sat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1802,6 +1802,27 @@ def main():
will be left blank.
""",
)
parser_aln_parse_dali.add_argument(
"-L",
"--qlen_sheet",
type=str,
required=False,
default="",
help="""
DALI doesn't report query len. Thus, if there is no self alignment, it is
impossible to determine query len. This allows you to input a file of proteins
and their associated length.
Path to a file containing protein names and their associated amino acid sequence
lengths. The first column should be the protein name, second column the length,
and the file should be tab-delimited with one protein per line. If a protein's
length is not present here, the script can determine qlen if there is a self
alignment in the alignment file. Otherwise, qlen will be kept as 0.
In this file, each file name can be the basename (with no .fasta or .pdb),
or you can keep the .pdb or .fasta suffix.
""",
)
parser_aln_parse_dali.set_defaults(func=call_aln_parse_dali_main)

# -------------------------------------------------------------------------------- #
Expand Down
29 changes: 24 additions & 5 deletions sat/scripts/aln_parse_dali.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .utils.misc import talk_to_me, make_output_dir
from .utils.misc import talk_to_me, make_output_dir, remove_pdb_and_fasta_suffix


def parse_structure_key(structure_key_file, delim=",,", existing_dict=dict()):
Expand Down Expand Up @@ -53,7 +53,6 @@ def aln_parse_dali_main(args):
talk_to_me("Parsing the alignment file.")
query_id = ""
query = ""
qlen = 0
out = [
"query",
"target",
Expand All @@ -69,6 +68,18 @@ def aln_parse_dali_main(args):
]
out = "\t".join(out) + "\n"

# Parse the qlen sheet, if specified. Here, users can add a tab delimited
# file with protein[tab]length. Of course, each DALI file will have a single query
# so a single query length, but this lets you input many at ones.
len_dict = dict()
if args.qlen_sheet != "":
talk_to_me("Parsing the qlen sheet.")
with open(args.qlen_sheet) as infile:
for line in infile:
line = line.rstrip("\n").split("\t")
query_base = remove_pdb_and_fasta_suffix(line[0])
len_dict[query_base] = line[1]

with open(args.alignment_file) as infile:

for line in infile:
Expand Down Expand Up @@ -97,6 +108,7 @@ def aln_parse_dali_main(args):
msg = f"Cannot find the query_id, {query_id}, in structure_key!"
msg += " Continuing anyway."
talk_to_me(msg)
query = remove_pdb_and_fasta_suffix(query)
continue

# Find the colnames and the other header row
Expand All @@ -117,7 +129,7 @@ def aln_parse_dali_main(args):
# Removing the chain identifier (-A for chain A, etc)
target_id = chain[:-2]
try:
target = structure_key[target_id]
target = remove_pdb_and_fasta_suffix(structure_key[target_id])
except KeyError:
msg = f"Cannot find the target_id, {target_id}, in structure_key!"
msg += " Continuing anyway."
Expand All @@ -130,9 +142,16 @@ def aln_parse_dali_main(args):
tlen = float(tlen)
pident = float(pident)

# Update qlen if we get a self alignment
if query_id == target_id:
# Solve for qlen.
# ------------------------#
# If there is a qlen_sheet, lets look up the qlen. Either way, we
# Will check for self alignments afterwards.
if query in len_dict:
qlen = int(len_dict[query])
elif query_id == target_id:
qlen = tlen
else:
qlen = 0

# Generate a "cov" column that is alnlen/(max(qlen, tlen)). If there is no
# labeled qlen, it will just end up alnlen/tlen.
Expand Down
12 changes: 12 additions & 0 deletions sat/scripts/utils/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,18 @@ def arg_str2bool(v):
raise argparse.ArgumentTypeError("Boolean value expected.")


def remove_pdb_and_fasta_suffix(name: str) -> str:
"""
Removes .pdb or .fasta from a file name.
Examples:
hahaha.ererere.fasta --> hahaha.ererere
hahaha.ererere.pdb --> hahaha.ererere
hahaha.ererere --> hahaha.ererere
"""
return name.rstrip(".fasta").rstrip(".pdb")


# ------------------------------------------------------------------------------------ #
# Read/write
# ------------------------------------------------------------------------------------ #
Expand Down

0 comments on commit 17377e9

Please sign in to comment.