diff --git a/sat/sat.py b/sat/sat.py index dd91dad..4252d14 100755 --- a/sat/sat.py +++ b/sat/sat.py @@ -1802,6 +1802,27 @@ def main(): will be left blank. """, ) + parser_aln_parse_dali.add_argument( + "-L", + "--qlen_sheet", + type=str, + required=False, + default="", + help=""" + DALI doesn't report query len. Thus, if there is no self alignment, it is + impossible to determine query len. This allows you to input a file of proteins + and their associated length. + + Path to a file containing protein names and their associated amino acid sequence + lengths. The first column should be the protein name, second column the length, + and the file should be tab-delimited with one protein per line. If a protein's + length is not present here, the script can determine qlen if there is a self + alignment in the alignment file. Otherwise, qlen will be kept as 0. + + In this file, each file name can be the basename (with no .fasta or .pdb), + or you can keep the .pdb or .fasta suffix. + """, + ) parser_aln_parse_dali.set_defaults(func=call_aln_parse_dali_main) # -------------------------------------------------------------------------------- # diff --git a/sat/scripts/aln_parse_dali.py b/sat/scripts/aln_parse_dali.py index f5b2237..8027fd9 100644 --- a/sat/scripts/aln_parse_dali.py +++ b/sat/scripts/aln_parse_dali.py @@ -1,4 +1,4 @@ -from .utils.misc import talk_to_me, make_output_dir +from .utils.misc import talk_to_me, make_output_dir, remove_pdb_and_fasta_suffix def parse_structure_key(structure_key_file, delim=",,", existing_dict=dict()): @@ -53,7 +53,6 @@ def aln_parse_dali_main(args): talk_to_me("Parsing the alignment file.") query_id = "" query = "" - qlen = 0 out = [ "query", "target", @@ -69,6 +68,18 @@ def aln_parse_dali_main(args): ] out = "\t".join(out) + "\n" + # Parse the qlen sheet, if specified. Here, users can add a tab delimited + # file with protein[tab]length. Of course, each DALI file will have a single query + # so a single query length, but this lets you input many at ones. + len_dict = dict() + if args.qlen_sheet != "": + talk_to_me("Parsing the qlen sheet.") + with open(args.qlen_sheet) as infile: + for line in infile: + line = line.rstrip("\n").split("\t") + query_base = remove_pdb_and_fasta_suffix(line[0]) + len_dict[query_base] = line[1] + with open(args.alignment_file) as infile: for line in infile: @@ -97,6 +108,7 @@ def aln_parse_dali_main(args): msg = f"Cannot find the query_id, {query_id}, in structure_key!" msg += " Continuing anyway." talk_to_me(msg) + query = remove_pdb_and_fasta_suffix(query) continue # Find the colnames and the other header row @@ -117,7 +129,7 @@ def aln_parse_dali_main(args): # Removing the chain identifier (-A for chain A, etc) target_id = chain[:-2] try: - target = structure_key[target_id] + target = remove_pdb_and_fasta_suffix(structure_key[target_id]) except KeyError: msg = f"Cannot find the target_id, {target_id}, in structure_key!" msg += " Continuing anyway." @@ -130,9 +142,16 @@ def aln_parse_dali_main(args): tlen = float(tlen) pident = float(pident) - # Update qlen if we get a self alignment - if query_id == target_id: + # Solve for qlen. + # ------------------------# + # If there is a qlen_sheet, lets look up the qlen. Either way, we + # Will check for self alignments afterwards. + if query in len_dict: + qlen = int(len_dict[query]) + elif query_id == target_id: qlen = tlen + else: + qlen = 0 # Generate a "cov" column that is alnlen/(max(qlen, tlen)). If there is no # labeled qlen, it will just end up alnlen/tlen. diff --git a/sat/scripts/utils/misc.py b/sat/scripts/utils/misc.py index 20bc5e1..3282071 100644 --- a/sat/scripts/utils/misc.py +++ b/sat/scripts/utils/misc.py @@ -45,6 +45,18 @@ def arg_str2bool(v): raise argparse.ArgumentTypeError("Boolean value expected.") +def remove_pdb_and_fasta_suffix(name: str) -> str: + """ + Removes .pdb or .fasta from a file name. + + Examples: + hahaha.ererere.fasta --> hahaha.ererere + hahaha.ererere.pdb --> hahaha.ererere + hahaha.ererere --> hahaha.ererere + """ + return name.rstrip(".fasta").rstrip(".pdb") + + # ------------------------------------------------------------------------------------ # # Read/write # ------------------------------------------------------------------------------------ #