Skip to content

Commit

Permalink
Add new subcommand to parse dali all-by-all matrix and dendograms
Browse files Browse the repository at this point in the history
  • Loading branch information
jnoms committed Apr 13, 2024
1 parent d44a257 commit fc4f5e8
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 0 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,13 @@ The output file is a .m8 file (e.g. tab delimited) and has the following columns
<!-- RICH-CODEX hide_command: true -->
![`poetry run .github/tmp/sat_codex.py aln_parse_dali -h`](.github/img/aln_parse_dali.png)

# SAT aln_parse_dali_matrix
This subcommand takes in a DALI matrix file and/or a DALI dendrogram file, and uses the
specified key to convert each ID to its proper name.
<!-- RICH-CODEX hide_command: true -->
![`poetry run .github/tmp/sat_codex.py aln_parse_dali_matrix -h`](.github/img/aln_parse_dali_matrix.png)



# SAT aln_merge
This subcommand is used to merge two foldseek alignment files.
Expand Down
71 changes: 71 additions & 0 deletions sat/sat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1726,6 +1726,71 @@ def main():
)
parser_aln_parse_dali.set_defaults(func=call_aln_parse_dali_main)

# -------------------------------------------------------------------------------- #
# Parser for aln_parse_dali_matrix subcommand
# -------------------------------------------------------------------------------- #
parser_aln_parse_dali_matrix = subparsers.add_parser(
"aln_parse_dali_matrix",
help=(
"""
This subcommand takes in a DALI matrix file and/or a DALI dendogram files,
and uses the specified key to convert each ID to its proper name.
"""
),
)
parser_aln_parse_dali_matrix.add_argument(
"-k",
"--key",
type=str,
required=True,
help="""
Path to a tab-delimited file of format structure_name,,ID, where
the ID is a 4-digit identifier used during the DALI alignment. This lets you
convert the identifiers back.
""",
)
parser_aln_parse_dali_matrix.add_argument(
"-t",
"--tree",
type=str,
required=False,
default="",
help="""
Path to the DALI tree file (ends in newick or newick_unrooted).
""",
)
parser_aln_parse_dali_matrix.add_argument(
"-T",
"--tree_out",
type=str,
required=False,
default="",
help="""
Path to the output tree file.
""",
)
parser_aln_parse_dali_matrix.add_argument(
"-m",
"--matrix",
type=str,
required=False,
default="",
help="""
Path to the DALI similarity file.
""",
)
parser_aln_parse_dali_matrix.add_argument(
"-M",
"--matrix_out",
type=str,
required=False,
default="",
help="""
Path to the output, formatted matrix file.
""",
)
parser_aln_parse_dali_matrix.set_defaults(func=call_aln_parse_dali_matrix_main)

# -------------------------------------------------------------------------------- #
# Parser for aln_merge subcommand
# -------------------------------------------------------------------------------- #
Expand Down Expand Up @@ -2261,6 +2326,12 @@ def call_aln_parse_dali_main(args):
aln_parse_dali_main(args)


def call_aln_parse_dali_matrix_main(args):
    """
    Entry point wired to the aln_parse_dali_matrix subparser.

    The implementation module is imported inside the function, so it is only
    loaded when this subcommand is actually invoked.
    """
    from scripts.aln_parse_dali_matrix import (
        aln_parse_dali_matrix_main as run_subcommand,
    )

    run_subcommand(args)


def call_aln_merge_main(args):
from scripts.aln_merge import aln_merge_main

Expand Down
107 changes: 107 additions & 0 deletions sat/scripts/aln_parse_dali_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import re

from .utils.misc import talk_to_me, make_output_dir


def validate_args(args):
    """
    Check that the requested inputs and outputs are consistent.

    At least one of --tree / --matrix has to be supplied, and every supplied
    input needs its matching output path. An empty string means "not given".

    Raises:
        ValueError: if neither input is given, or an input lacks its output.
    """
    has_tree = args.tree != ""
    has_matrix = args.matrix != ""

    if not (has_tree or has_matrix):
        raise ValueError("Must specify --tree and/or --matrix.")

    if has_tree and args.tree_out == "":
        raise ValueError("You specified --tree, so you must specify --tree_out!")

    if has_matrix and args.matrix_out == "":
        raise ValueError(
            "You specified --matrix, so you must specify --matrix_out!"
        )


def parse_key(key_path):
    """
    Read the key file into a dict mapping DALI ID -> structure name.

    Each non-empty line is expected to look like `structure_name,,ID`. A
    trailing ".pdb" extension is removed from the structure name. Blank lines
    are skipped.

    Args:
        key_path: path to the key file.

    Returns:
        dict whose keys are the IDs (second field) and whose values are the
        structure names (first field, without a ".pdb" extension).
    """
    extension = ".pdb"
    key = {}
    with open(key_path) as infile:
        for line in infile:
            line = line.rstrip("\n")
            if not line:
                # Tolerate blank / trailing empty lines instead of crashing.
                continue
            fields = line.split(",,")
            name = fields[0]
            # str.rstrip(".pdb") strips any trailing '.', 'p', 'd' or 'b'
            # characters (turning "abd.pdb" into "a"), so remove only the
            # literal extension.
            if name.endswith(extension):
                name = name[: -len(extension)]
            key[fields[1]] = name
    return key


def format_nwk(nwk, key):
    """
    Replace the DALI labels in a newick string with their proper names.

    A label is a run of word characters immediately followed by a colon and a
    number (its branch length). The last character of the label is dropped
    before the lookup in `key` (presumably the chain letter appended to the
    4-digit ID -- confirm against the DALI output).

    All substitutions happen in a single pass over the original text, so a
    replacement name that happens to contain another label is never rewritten
    a second time. Labels whose base ID is not present in `key` are left
    unchanged.

    Args:
        nwk: the newick tree as a string.
        key: dict mapping base ID -> structure name.

    Returns:
        The newick string with known labels replaced.
    """
    # Regex to find potential keys (a label followed by a colon and a number)
    pattern = re.compile(r"(\w+):\d+(\.\d+)?")

    def _rename(match):
        label = match.group(1)
        name = key.get(label[:-1])
        if name is None:
            # Not a known ID (e.g. an internal node label): keep it as is.
            return match.group(0)
        # Swap only the label, keeping the ":<branch length>" part intact.
        return name + match.group(0)[len(label):]

    return pattern.sub(_rename, nwk)


def parse_matrix(matrix_path):
    """
    Load the matrix file as a list of rows, each row a list of strings.

    The first line of the file (an integer) is discarded. Every remaining
    line is split on tabs; the line terminator stays on the last field.
    """
    with open(matrix_path) as handle:
        # Drop the leading integer line; the default keeps an empty file from
        # raising StopIteration.
        next(handle, None)
        return [record.split("\t") for record in handle]


def format_matrix(matrix, key):
    """
    Translate the leading ID of every matrix row into its proper name.

    `matrix` is a list of rows (lists); the first element of each row is the
    ID. The ID's final character is dropped before it is looked up in `key`.
    Rows are modified in place and the same `matrix` object is returned.
    """
    for cells in matrix:
        raw_id = cells[0]
        base_id = raw_id[:-1]
        cells[0] = key[base_id]
    return matrix


def aln_parse_dali_matrix_main(args):
    """
    Run the aln_parse_dali_matrix subcommand.

    Validates the arguments, loads the ID -> name key, and then, for each of
    the tree and matrix inputs that was supplied, rewrites its IDs and writes
    the result to the corresponding output path.

    Args:
        args: parsed arguments providing `key`, `tree`, `tree_out`, `matrix`
            and `matrix_out` (an empty string means "not given").

    Raises:
        ValueError: from validate_args when the argument combination is invalid.
    """
    validate_args(args)

    talk_to_me("Parsing key")
    key = parse_key(args.key)

    if args.tree != "":
        talk_to_me("Processing tree")

        with open(args.tree) as infile:
            nwk = infile.read()
        formatted_nwk = format_nwk(nwk, key)

        make_output_dir(args.tree_out)
        with open(args.tree_out, "w") as outfile:
            outfile.write(formatted_nwk)

    if args.matrix != "":
        talk_to_me("Processing matrix")
        matrix = format_matrix(parse_matrix(args.matrix), key)

        make_output_dir(args.matrix_out)
        with open(args.matrix_out, "w") as outfile:
            # parse_matrix leaves the line terminator on the last field of each
            # row, so the re-joined rows are written back to back. writelines
            # avoids building the whole output with repeated `+=`.
            outfile.writelines("\t".join(row) for row in matrix)


# This module only holds the implementation; the command-line interface and
# its argument parsing live in sat.py, so direct execution is refused.
if __name__ == "__main__":
    raise ValueError("Call this script from sat.py, where there is argument parsing.")

0 comments on commit fc4f5e8

Please sign in to comment.