From 4f5484c42d3ea924a9fc8cb516e54f5701d19a03 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Fri, 22 Apr 2022 15:32:46 +0200 Subject: [PATCH 01/26] Add phylociraptor util and first module for downloading lineage information --- bin/get_lineage.py | 53 +++++++++++++++ phylociraptor | 156 +++++++++++++++++++++++++++++++-------------- 2 files changed, 160 insertions(+), 49 deletions(-) create mode 100755 bin/get_lineage.py diff --git a/bin/get_lineage.py b/bin/get_lineage.py new file mode 100755 index 0000000..d1cb59d --- /dev/null +++ b/bin/get_lineage.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +import sys, time +from Bio import Entrez +import pandas as pd +Entrez.email = "philipp.resl@uni-graz.at" +if len(sys.argv) < 3: + print("Not enough arguments!") + print("usage:") + print("get_lineage.py ") + sys.exit(1) + +summary_file = open(sys.argv[1], "r") +outfilename = sys.argv[2] + +handle = Entrez.efetch(db="Taxonomy", id=310279, retmode="xml") +records = Entrez.read(handle) +#import code +#code.interact(local=locals()) +i=0 +ntries = 10 +all_lineage = {} +for line in summary_file: + if line.startswith("name"): + continue + i += 1 + taxid = line.split("\t")[7] + name = line.split("\t")[0] +# if name not in ["Pseudois_nayaur", "Merops_nubicus", "Melospiza_melodia", "Gallirallus_okinawae", "Buphagus_erythrorhynchus"]: +# continue + print("Retrieving lineage for ",name, taxid) + d = {} + ntry = 0 + while ntry <= ntries: + time.sleep(0.5) + handle = Entrez.efetch(db="Taxonomy", id=taxid, retmode="xml") + records = Entrez.read(handle) + if len(records) == 0: + print("Try ", ntry, " to access NCBI database failed. Will try again") + ntry += 1 + continue + else: + break + if ntry == ntries and len(records) == 0: + print("Failed to acccess NCBI for taxid: ",taxid, ". Giving up now") + continue + for info in records[0]["LineageEx"]: + d[info["Rank"]] = info["ScientificName"] + all_lineage[name] = d + +df=pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in all_lineage.items() ])) +df1 = df.T +df1.to_csv(outfilename, sep="\t",index=True, index_label="name") +#print(df1) diff --git a/phylociraptor b/phylociraptor index ef5b0a8..74424ed 100755 --- a/phylociraptor +++ b/phylociraptor @@ -6,61 +6,12 @@ import subprocess import argparse import glob -if sys.version_info[0] < 3: - raise Exception("Must be using Python 3") - exit(1) - - #some basic variables needed: try: with open(".version", "r") as file: version = file.readline().strip("\n") except FileNotFoundError: version = "unkown" -njobs = "10000" -latency_wait = "10" #in seconds -singularity_bindpoints = "-B $(pwd)/.usr_tmp/:/usertmp" -debug = False #turn debugging mode on (True) and off (False) -cluster_config_defaults= {"slurm": "data/cluster-config-SLURM.yaml.template", "sge":"data/cluster-config-SGE.yaml.template", "torque":"data/cluster-config-TORQUE.yaml.template"} -outfile_dict = { - "setup": ["results/checkpoints/modes/phylogenomics_setup.done"], - "orthology": ["results/checkpoints/modes/phylogenomics_setup.done"], - "filter-orthology": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done"], - "align": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done"], - "filter-align": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done"], - "speciestree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], - "njtree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], - "mltree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], - "modeltest": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], - "report": ["results/checkpoints/modes/phylogenomics_setup.done"] - } - -steps_to_check = ["setup", "orthology", "filter-orthology", "align", "filter-align", "njtree", "modeltest", "mltree", "speciestree"] -checkpoint_file_dict = { - "setup": "results/checkpoints/modes/phylogenomics_setup.done", - "orthology": "results/checkpoints/modes/orthology.done", - "filter-orthology": "results/checkpoints/modes/filter_orthology.done", - "align": "results/checkpoints/modes/align.done", - "filter-align": "results/checkpoints/modes/filter_align.done", - "speciestree": "results/checkpoints/modes/speciestree.done", - "njtree": "results/checkpoints/modes/njtree.done", - "mltree": "results/checkpoints/modes/trees.done", - "modeltest": "results/checkpoints/modes/modeltest.done" -} - - -outdir_dict = { - "setup": ["results/orthology/busco/busco_set", "results/assemblies", "results/downloaded_genomes"], - "orthology": ["results/orthology/busco"], - "align": ["results/alignments"], - "filter-orthology": ["results"], - "align": ["results"], - "filter-align": ["results/alignments/trimmed", "results/alignments/filtered"], - "speciestree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step - "njtree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step - "mltree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step - "modeltest": ["results/modeltest"] -} phylociraptor = """ Welcome to @@ -259,6 +210,68 @@ Argumemts: """ +util_help = """ +phylociraptor util - Utilities for a posteri analyses of phylociraptor results. + +Usage: phylociraptor util + +Argumemts: + get-lineage retreive full lineage information for all included samples + compare-trees visually compare multiple trees + plot-tree plot a tree + -h, --help Display help. + +""" + +if sys.version_info[0] < 3: + raise Exception("Must be using Python 3") + exit(1) + +njobs = "10000" +latency_wait = "10" #in seconds +singularity_bindpoints = "-B $(pwd)/.usr_tmp/:/usertmp" +debug = False #turn debugging mode on (True) and off (False) +cluster_config_defaults= {"slurm": "data/cluster-config-SLURM.yaml.template", "sge":"data/cluster-config-SGE.yaml.template", "torque":"data/cluster-config-TORQUE.yaml.template"} +outfile_dict = { + "setup": ["results/checkpoints/modes/phylogenomics_setup.done"], + "orthology": ["results/checkpoints/modes/phylogenomics_setup.done"], + "filter-orthology": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done"], + "align": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done"], + "filter-align": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done"], + "speciestree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], + "njtree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], + "mltree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], + "modeltest": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], + "report": ["results/checkpoints/modes/phylogenomics_setup.done"] + } + +steps_to_check = ["setup", "orthology", "filter-orthology", "align", "filter-align", "njtree", "modeltest", "mltree", "speciestree"] +checkpoint_file_dict = { + "setup": "results/checkpoints/modes/phylogenomics_setup.done", + "orthology": "results/checkpoints/modes/orthology.done", + "filter-orthology": "results/checkpoints/modes/filter_orthology.done", + "align": "results/checkpoints/modes/align.done", + "filter-align": "results/checkpoints/modes/filter_align.done", + "speciestree": "results/checkpoints/modes/speciestree.done", + "njtree": "results/checkpoints/modes/njtree.done", + "mltree": "results/checkpoints/modes/trees.done", + "modeltest": "results/checkpoints/modes/modeltest.done" +} + + +outdir_dict = { + "setup": ["results/orthology/busco/busco_set", "results/assemblies", "results/downloaded_genomes"], + "orthology": ["results/orthology/busco"], + "align": ["results/alignments"], + "filter-orthology": ["results"], + "align": ["results"], + "filter-align": ["results/alignments/trimmed", "results/alignments/filtered"], + "speciestree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step + "njtree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step + "mltree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step + "modeltest": ["results/modeltest"] +} + def now(): return time.strftime("%Y-%m-%d %H:%M") + " -" @@ -523,6 +536,13 @@ class PhyloParser(argparse.ArgumentParser): self.add_argument("--snakemake", action="store",dest="sm_args", default="") self.add_argument("--rerun-incomplete", action="store_true", dest="rerun", default=False) +class UtilParser(argparse.ArgumentParser): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.add_argument("-h", "--help", action="store_true") + self.add_argument("--verbose", action="store_true", default=False) + + if args.command == "setup": print(now(), "Welcome to phylociraptor setup v%s" % version) setup_parser = PhyloParser(usage=help_message(setup_help), add_help=False) @@ -1125,6 +1145,44 @@ elif args.command == "check": print(" ", mode," ...", '\033[92m', "DONE", '\033[0m') print("\nWARNING: phylociraptor check is just a quick and shallow verification of the run. In case you run into problems, please also check logfiles in the log directory for more in-depth diagnostics.") +elif args.command == "util": + print(now(), "Welcome to phylociraptor util v%s" % version) + #util_parser = argparse.ArgumentParser(add_help=False) + #util_parser.add_argument("-h", "--help", action="store_true") + #util_parser.add_argument("command", action="store", nargs="?") + #util_parser.add_argument("--verbose", action="store_true", default=False) + #util_args = util_parser.parse_args(args.arguments) + + if args.arguments[0] == "-h" or args.arguments[0] == "--help" or len(args.arguments) == 0: + print(help_message(util_help)) + sys.exit(0) + which_util = args.arguments.pop(0) + + if which_util == "get-lineage": + gl_parser = UtilParser(add_help=False) + gl_parser.add_argument("-g","--genomefile", action="store", default="results/statistics/downloaded_genomes_statistics.txt") + gl_parser.add_argument("-o", "--output", action="store") +# gl_parser.add_argument("-v", "--verbose", action="store_true", default=False) + gl_args = gl_parser.parse_args(args.arguments) + print(gl_args) + if gl_args.help: + print("help") + #print(help_message(getlineage_help)) + sys.exit(0) + if not gl_args.output: + print("You need to specify an output file with -o/--output") + sys.exit(1) + if not os.path.isfile(gl_args.genomefile): + print("Genome file not found") + sys.exit(1) + cmd = ["singularity", "exec", "docker://reslp/biopython_plus:1.77", "python", "bin/get_lineage.py", gl_args.genomefile, gl_args.output] + if debug: + print(cmd) + for line in execute_command(cmd, gl_args.verbose): + print(line, end="\r") + sys.exit(0) + + else: print("Runmode not recognized: %s" % args.command) print("Please run phylociraptor -h to see avilable options") From eb4e9f20fe3b6f61c2dbd9724bb795cfdfcf2e17 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Tue, 17 May 2022 13:58:41 +0200 Subject: [PATCH 02/26] Add compare_trees script to util --- bin/compare_trees.py | 341 +++++++++++++++++++++++++++++++++++++++++++ phylociraptor | 10 +- 2 files changed, 348 insertions(+), 3 deletions(-) create mode 100755 bin/compare_trees.py diff --git a/bin/compare_trees.py b/bin/compare_trees.py new file mode 100755 index 0000000..419df74 --- /dev/null +++ b/bin/compare_trees.py @@ -0,0 +1,341 @@ +#!/usr/bin/env python3 + +#import dendropy as dp +import sys +#from ete3 import Tree +import time +from itertools import combinations, permutations, product +import random +import multiprocessing +import pandas as pd +import datetime +import argparse +import glob +import os +from rpy2.robjects.packages import importr +from rpy2 import robjects as ro + +def reduce_trees(treelist, quat): + sorted_trees = [] + i = 1 + for tree in treelist: + #try: + # tree = Tree(tree.retain_taxa_with_labels(quat).as_string(schema="newick")) + #tree2 = Tree(tree.as_string(schema="newick")) + #except dp.utility.error.SeedNodeDeletionException: + # print("SeedNodeDeletionException: Error trying to reduce the tree, will skip this quartet:") + # print("Tree:",treelist.index(tree)+1, quat) + # return 0 + #tree.rx[4] = ro.r("NULL") + tiplabels = list(tree.rx2("tip.label")) + + # # debug code: + # if not any(x in tiplabels for x in quat): + # print(quat, tiplabels) + # else: + # print("Quartet tips contained in tree.") + + tf_list = [] + for tip in tiplabels: + if tip in quat: + continue + else: + tf_list.append(tip) + tipss =ro.StrVector(tf_list) + redtree = ape.drop_tip(tree, tipss) + treestr2 = ape.write_tree(ape.drop_tip(tree, tipss)) + treestr = treestr2[0].strip(";") + #treestr = tree.write(format=9).strip(";") + #treestr = tree.prune(quat) + #print(tree2.write(format=9)) + if len(list(redtree.rx2("tip.label"))) != 4: + missing = [tip for tip in quat if tip not in list(redtree.rx2("tip.label"))] + print(quat, "is not properly represented in tree", i, "Missing tips:", missing) + tr = [el.split(")") for el in treestr.split("(") if el] + tt = [tip.lstrip(",").strip(",") for sublist in tr for tip in sublist if tip] + tt = [el.strip(",").strip(":") for el in tt] + sorted_tree = [] + tips = "" + second_element = 0 + for el in tt: + if "," in el: + sorted_tree.append(",".join(sorted(el.split(",")))) + else: + tips += el + if second_element == 1: + sorted_tree.append(",".join(sorted(tips.split(",")))) + if second_element == 0: + tips += "," + second_element = 1 + sorted_trees.append("-".join(sorted(sorted_tree))) + i += 1 +# return [Tree(t.extract_tree_with_taxa_labels(quat).as_string(schema="newick")) for t in treelist] + return sorted_trees + +def reformat_quat(quat): +# sorted(quat) + return quat[0] + "," + quat[1] + "-" + quat[2] + "," + quat[3] + +def compare_trees(treelist, combinations): + comparison = {} + for comb in combinations: + if treelist[comb[0]-1] == treelist[comb[1]-1]: + comparison["T" + str(comb[0]) + "-T" + str(comb[1])] = 1 + #print("MATCH:", comb, treelist[comb[0]], treelist[comb[1]]) + else: + comparison["T" + str(comb[0]) + "-T" + str(comb[1])] = 0 + #print("DIFFERENCE:", comb, treelist[comb[0]-1], treelist[comb[1]-1]) + return comparison + + +def quats_in_trees(tree, quat): + tstring = Tree(tree.extract_tree_with_taxa_labels(quat).as_string(schema="newick")) + tr = treestr.split("(") + tt = [tip.lstrip(",").strip(",") for tip in sublist if tip] + sorted_tree = [] + tips = "" + second_element = 0 + for el in tt: + if "," in el: + sorted_tree.append(",".join(sorted(el.split(",")))) + else: + tips += el + if second_element == 1: + sorted_tree.append(",".join(sorted(tips.split(",")))) + if second_element == 0: + tips += "," + second_element = 1 + + + sorted_trees.append("-".join(sorted(sorted_tree))) + quat_formated = quat[0] + "," +quat[1]+"-"+quat[2]+","+quat[3] + print(quat_formated) + print(sorted_trees) + + +def random_combination(iterable, r): + i = 0 + alltips = tuple(iterable) + ntips = len(alltips) + tipsrange = range(ntips) + while i < r: + i += 1 + yield [alltips[j] for j in random.sample(tipsrange, r)] + +nn = 1 +def compute_single_quat(tl,i, quat, combinations): + #print("Computing quat:", i, end="\r") + tl2 = reduce_trees(tl, quat) + if tl2 == 0: + return 0 +# print(tl2) +# reformatted_quat = reformat_quat(quat) +# result = float(compare_trees(tl2, reformatted_quat)) + result = compare_trees(tl2, combinations) + return (tl2[0], result) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(prog="compare_trees.py", description = """This script compares trees based on quartets""", epilog = """written by Philipp Resl""") + parser.add_argument("-i","--intrees", action="store", default="all") + parser.add_argument("-o", "--outfile", action="store", default="output") + parser.add_argument("-s", "--seed", action="store") + parser.add_argument("-t", "--threads", action="store") + parser.add_argument("-q", "--nquartets", action="store") + parser.add_argument("-l", "--lineagefile", action="store") + parser.add_argument("-b", "--stopby", action="store") + parser.add_argument("--selecttaxa", action="store") + args = parser.parse_args() + start = time.time() + # read in all parameters + if args.seed != "random": + seed = int(args.seed) +# seed = 111123 # here for testing + random.seed(seed) + print("Random seed:", seed) + outfile = args.outfile + if not args.threads: + ncpus = multiprocessing.cpu_count() + else: + ncpus = int(args.threads) + if args.intrees == "all": + iqtree_trees = glob.glob("results/phylogeny-*/iqtree/*/concat.contree") + raxml_trees = glob.glob("results/phylogeny-*/raxml/*/concat.tre") + astral_trees = glob.glob("results/phylogeny-*/astral/*/species_tree.tre") + #all_trees = ",".join(iqtree_trees + raxml_trees + astral_trees) + treenames = iqtree_trees +raxml_trees + astral_trees + + else: + #treenames = ["species_tree.tre", "concat.contree", "species_tree.tre2", "concat.contree2", "concat.contree3", "concat.contree3"] + treenames = args.intrees.split(",") + i = 1 + tl = [] + ape = importr('ape') + for name in treenames: + print("Tree", i, "->", name) + tree = ape.read_tree(name) + tree.rx[2] = [ro.r("NULL")] * len(tree.rx2("node.label")) + tree.rx[4] = [ro.r("NULL")] * len(tree.rx2("node.label")) + tl.append(tree) + #tl.append(dp.Tree.get(path=name, schema="newick")) + i += 1 + # create selection of tips to be analyzed: + all_tips = [tip for tree in tl for tip in list(tree.rx2("tip.label"))] + print("Total number of taxa in all trees:", len(set(all_tips))) + if args.selecttaxa: + if not args.lineagefile: + print("Taxon selection requires a lineage file") + sys.exit(0) + else: + if os.path.exists(args.lineagefile): + lineage_df = pd.read_csv(args.lineagefile, sep="\t") + rank, taxa = args.selecttaxa.split("=") + selected_tips = lineage_df.loc[lineage_df[rank].isin(taxa.split(",")), 'name'].to_list() + taxon_list = [tip for tip in selected_tips if all_tips.count(tip)] + else: + print("Lineage file does not exist") + sys.exit(0) + else: + taxon_list = [taxon for taxon in set(all_tips)] + #selected_tips = [tip for tree in tl for tip in list(tree.rx2("tip.label"))] + #taxon_list = [] + #for taxon in set(all_tips): #only use tips present in all trees to avoid spurious comparisons + # if all_tips.count(taxon) == len(tl): + # taxon_list += taxon + + random.shuffle(taxon_list) + + print("No. of trees:", len(tl)) + combinations = combinations(range(1, len(tl)+1), 2) + combinations = list(combinations) + results = 0.0 + i = 0 + print("No. of Taxa for calculating quartets:", len(taxon_list)) + print("Using", ncpus, "CPUs.") + quartet_list = [] + print("Calculating quartets, this may take some time...") + # decide how many quartets should be calculated + if args.nquartets: # 1. by specifying a max number + input_list=[] + nquartets = int(args.nquartets) + print("No. of quartets to be calculated:", nquartets) + for i in range(nquartets): + quartet = next(random_combination(taxon_list, 4)) + quartet_list += quartet + input_list.append([tl, i, quartet, combinations]) + pool = multiprocessing.Pool(ncpus) + results = pool.starmap_async(compute_single_quat, input_list) + final_results = results.get() + pool.close() + elif args.stopby: # 2. by stopping criterion (can still be extended) + which, num = args.stopby.split("=") + num = int(num) + if which == "conflicts": + print("Stopping criterion is number of conflicts:", num) + nconflicts = 0 + ncalculations = 0 + step = 100 + final_results = [] + while nconflicts < num: + input_list = [] + for i in range(step): + quartet = next(random_combination(taxon_list, 4)) + quartet_list += quartet + input_list.append([tl, i, quartet, combinations]) + pool = multiprocessing.Pool(ncpus) + results = pool.starmap_async(compute_single_quat, input_list) + intermed_results = results.get() + pool.close() + for result in intermed_results: + if any(v == 0 for v in list(result[1].values())): + nconflicts += 1 + final_results += intermed_results + ncalculations += step + print("Calculated quartets:", ncalculations, "- Identified conflicts:",nconflicts,"of",num) + nquartets = ncalculations + elif which == "tipcoverage": + print("Stopping criterion is tip coverage:", num) + samplingdepth = 0 + ncalculations = 0 + step = 100 + final_results = [] + while True: + input_list=[] + for i in range(step): + quartet = next(random_combination(taxon_list, 4)) + quartet_list += quartet + input_list.append([tl, i, quartet, combinations]) + pool = multiprocessing.Pool(ncpus) + results = pool.starmap_async(compute_single_quat, input_list) + intermed_results = results.get() + pool.close() + if not all(quartet_list.count(v) >= num for v in taxon_list): + depthlist = [quartet_list.count(tip) for tip in taxon_list] + samplingdepth = sum(depthlist) / len(depthlist) + final_results += intermed_results + ncalculations += step + print("Calculated quartets:", ncalculations, "- Times each tip was sampled (mean):",round(samplingdepth, 2)) + else: + break + nquartets = ncalculations + + else: + print("Stopping criterion not recognized.") + sys.exit(1) + else: + print("Either use a stopping criterion or a maximum number of quartets") + sys.exit(0) + print("Calculating quartets finished.\n") + print("Number of results:", len(final_results)) + results_dict = {} + i = 1 + for result in final_results: + if result != 0: + results_dict[result[0]] = result[1] + i += 1 + df = pd.DataFrame.from_dict(results_dict) + #print(df) + #conflicts= df.loc[:, df.sum() < len(tl)] + #print(conflicts) + print("Calculating sampling coverage of tips:") + with open(outfile +".tip_sampling_coverage.tsv", "w") as f: + for tip in taxon_list: + print(tip, quartet_list.count(tip), file=f) + + print("Saving results. Prefix:", outfile) + df.transpose().to_csv(outfile + ".quartets.csv", index_label="quartet", quoting=False) + #print("No. of quartets contributing to conflicts:", len(conflicts.columns.values.tolist())) + + print("Calculating pairwise comparison of trees (% similarity based on quartetts)...") + df_comp = df.sum(axis=1).sort_index()/nquartets + df_comp = df_comp.sort_index() + combinations = df_comp.index.to_list() + index_names = ["T"+ str(i+1) for i in range(0,len(tl))] + values = [1 for item in index_names] + df_dict = {} + for combination in index_names: + df_dict[combination] = values + df = pd.DataFrame(df_dict) + df.index = index_names + for tr1 in index_names: + for tr2 in index_names: + if "-".join([tr1, tr2]) in df_comp.index.to_list(): + df.loc[tr1, tr2] = df_comp.transpose()[tr1+"-"+tr2] + df.loc[tr2, tr1] = df_comp.transpose()[tr1+"-"+tr2] + df.to_csv(outfile + ".similarity_matrix.csv") + print("Writing list of trees...") + with open(outfile + ".treelist.tsv", "w") as f: + i = 1 + for tree in treenames: + print("T"+str(i), tree, file=f) + end = time.time() + print("Tree comparison is done.") + print("\nThe following output has been created:") + print(outfile+".quartets.csv contains the raw quartet occurences.") + print(outfile+".similarity_matrix.csv contains the % similarity between pairs of trees.") + print(outfile+".treelist.tsv contains all trees used and the corresponding tree number.") + print(outfile+".tip_sampling_coverage.tsv contains counts how often each tip was sampled.") + print() + print("Time to execute:", datetime.timedelta(seconds=round(end-start))) + + diff --git a/phylociraptor b/phylociraptor index 74424ed..e7f6e5d 100755 --- a/phylociraptor +++ b/phylociraptor @@ -1153,10 +1153,14 @@ elif args.command == "util": #util_parser.add_argument("--verbose", action="store_true", default=False) #util_args = util_parser.parse_args(args.arguments) - if args.arguments[0] == "-h" or args.arguments[0] == "--help" or len(args.arguments) == 0: + if len(args.arguments) == 0: print(help_message(util_help)) sys.exit(0) - which_util = args.arguments.pop(0) + elif args.arguments[0] == "-h" or args.arguments[0] == "--help": + print(help_message(util_help)) + sys.exit(0) + else: + which_util = args.arguments.pop(0) if which_util == "get-lineage": gl_parser = UtilParser(add_help=False) @@ -1164,7 +1168,7 @@ elif args.command == "util": gl_parser.add_argument("-o", "--output", action="store") # gl_parser.add_argument("-v", "--verbose", action="store_true", default=False) gl_args = gl_parser.parse_args(args.arguments) - print(gl_args) +# print(gl_args) if gl_args.help: print("help") #print(help_message(getlineage_help)) From b4b8e144f0dfde9c7efbc6072a25a5f22516c000 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Tue, 17 May 2022 13:59:19 +0200 Subject: [PATCH 03/26] phylociraptor: make compare-trees available in util --- phylociraptor | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/phylociraptor b/phylociraptor index e7f6e5d..7d29592 100755 --- a/phylociraptor +++ b/phylociraptor @@ -1166,7 +1166,7 @@ elif args.command == "util": gl_parser = UtilParser(add_help=False) gl_parser.add_argument("-g","--genomefile", action="store", default="results/statistics/downloaded_genomes_statistics.txt") gl_parser.add_argument("-o", "--output", action="store") -# gl_parser.add_argument("-v", "--verbose", action="store_true", default=False) + gl_parser.add_argument("--quiet", action="store_true", default=False) gl_args = gl_parser.parse_args(args.arguments) # print(gl_args) if gl_args.help: @@ -1182,9 +1182,56 @@ elif args.command == "util": cmd = ["singularity", "exec", "docker://reslp/biopython_plus:1.77", "python", "bin/get_lineage.py", gl_args.genomefile, gl_args.output] if debug: print(cmd) - for line in execute_command(cmd, gl_args.verbose): + for line in execute_command(cmd, not gl_args.quiet): print(line, end="\r") sys.exit(0) + if which_util == "compare-trees": + qs_parser = UtilParser(add_help=False) + qs_parser.add_argument("-i","--intrees", action="store", default="all") + qs_parser.add_argument("-o", "--outfile", action="store") + qs_parser.add_argument("-s", "--seed", action="store", default="random") + qs_parser.add_argument("-q", "--nquartets", action="store", default=None) + qs_parser.add_argument("-t", "--threads", action="store", default="1") + qs_parser.add_argument("-l", "--lineagefile", action="store") + qs_parser.add_argument("-b", "--stopby", action="store", default=None) + qs_parser.add_argument("--selecttaxa", action="store") + qs_parser.add_argument("--quiet", action="store_true", default=False) + qs_args = qs_parser.parse_args(args.arguments) +# print(gl_args) + if qs_args.help: + print("help") + sys.exit(0) + if not qs_args.outfile: + print(now(),"You need to specify an output file with -o/--outfile") + sys.exit(1) + if not qs_args.intrees: + print(now(),"No input trees specified") + sys.exit(1) + elif qs_args.intrees == "all": + print(now(), "Will compare all trees, this expects phylociraptor mltree and/or speciestree to be finished.") + iqtree_trees = glob.glob("results/phylogeny-*/iqtree/*/concat.contree") + raxml_trees = glob.glob("results/phylogeny-*/raxml/*/concat.tre") + astral_trees = glob.glob("results/phylogeny-*/astral/*/species_tree.tre") + all_trees = ",".join(iqtree_trees + raxml_trees + astral_trees) + else: + all_trees = qs_args.intrees + cmd = ["singularity", "exec", "docker://reslp/phylopy:2", "python", "bin/compare_trees.py", "-i", all_trees, "-o", qs_args.outfile, "-s", qs_args.seed, "-t", qs_args.threads] + + if qs_args.lineagefile and qs_args.selecttaxa: + cmd += ["-l", qs_args.lineagefile, "--selecttaxa", qs_args.selecttaxa] + if qs_args.nquartets: + cmd += ["-q", qs_args.nquartets] + elif qs_args.stopby: + cmd += ["-b", qs_args.stopby] + else: + print(now(), "You need to either specify a stopping criterion (-b) or the total number of quartets (-q).") + sys.exit(0) + + + if debug: + print(cmd) + for line in execute_command(cmd, not qs_args.quiet): + print(line, end="\r") else: From fbb5fabe07964fab06bc1fde1a98c4ca0b3bc69b Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Mon, 23 May 2022 21:31:26 +0200 Subject: [PATCH 04/26] util: Fix small bug compare_tree.py which lead to trees not being counted correctly --- bin/compare_trees.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/compare_trees.py b/bin/compare_trees.py index 419df74..5f29c9b 100755 --- a/bin/compare_trees.py +++ b/bin/compare_trees.py @@ -327,7 +327,8 @@ def compute_single_quat(tl,i, quat, combinations): with open(outfile + ".treelist.tsv", "w") as f: i = 1 for tree in treenames: - print("T"+str(i), tree, file=f) + print("T"+str(i), tree, file=f, sep="\t") + i += 1 end = time.time() print("Tree comparison is done.") print("\nThe following output has been created:") From 4c11e3613806353c5bea5436e9a5235189bc152a Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Tue, 24 May 2022 16:07:27 +0200 Subject: [PATCH 05/26] phylociraptor: add wd bindpoint to util calls to prevent singularity problems with phylociraptor-docker on Macs --- phylociraptor | 83 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/phylociraptor b/phylociraptor index 7d29592..82a0cac 100755 --- a/phylociraptor +++ b/phylociraptor @@ -211,14 +211,15 @@ Argumemts: """ util_help = """ -phylociraptor util - Utilities for a posteri analyses of phylociraptor results. +phylociraptor util - Utilities for a-posteriori analyses of phylociraptor results. Usage: phylociraptor util Argumemts: - get-lineage retreive full lineage information for all included samples - compare-trees visually compare multiple trees - plot-tree plot a tree + get-lineage retreive full lineage information for all included samples. + compare-trees compare trees based on the occurence of quartets. + plot-tree plot one or more trees. + plot-conflicts plot conflicts between two trees based on quartet comparison. -h, --help Display help. """ @@ -1179,7 +1180,7 @@ elif args.command == "util": if not os.path.isfile(gl_args.genomefile): print("Genome file not found") sys.exit(1) - cmd = ["singularity", "exec", "docker://reslp/biopython_plus:1.77", "python", "bin/get_lineage.py", gl_args.genomefile, gl_args.output] + cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/biopython_plus:1.77", "python", "bin/get_lineage.py", gl_args.genomefile, gl_args.output] if debug: print(cmd) for line in execute_command(cmd, not gl_args.quiet): @@ -1215,7 +1216,7 @@ elif args.command == "util": all_trees = ",".join(iqtree_trees + raxml_trees + astral_trees) else: all_trees = qs_args.intrees - cmd = ["singularity", "exec", "docker://reslp/phylopy:2", "python", "bin/compare_trees.py", "-i", all_trees, "-o", qs_args.outfile, "-s", qs_args.seed, "-t", qs_args.threads] + cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/phylopy:2", "python", "bin/compare_trees.py", "-i", all_trees, "-o", qs_args.outfile, "-s", qs_args.seed, "-t", qs_args.threads] if qs_args.lineagefile and qs_args.selecttaxa: cmd += ["-l", qs_args.lineagefile, "--selecttaxa", qs_args.selecttaxa] @@ -1232,8 +1233,76 @@ elif args.command == "util": print(cmd) for line in execute_command(cmd, not qs_args.quiet): print(line, end="\r") - + if which_util == "plot-tree": + qs_parser = UtilParser(add_help=False) + qs_parser.add_argument("-i","--intrees", action="store", default="all") + qs_parser.add_argument("-o", "--outprefix", action="store", default="tree-plot") + qs_parser.add_argument("-g", "--outgroup", action="store", default="none") + qs_parser.add_argument("-l", "--lineagefile", action="store", default="none") + qs_parser.add_argument("-e", "--level", action="store", default="none") + qs_parser.add_argument("-s", "--seed", action="store", default="random") + qs_parser.add_argument("--quiet", action="store_true", default=False) + qs_args = qs_parser.parse_args(args.arguments) + if qs_args.help: + print("help") + sys.exit(0) + if not qs_args.intrees: + print(now(),"No input trees specified") + sys.exit(1) + elif qs_args.intrees == "all": + print(now(), "Will compare all trees, this expects phylociraptor mltree and/or speciestree to be finished.") + iqtree_trees = glob.glob("results/phylogeny-*/iqtree/*/concat.contree") + raxml_trees = glob.glob("results/phylogeny-*/raxml/*/concat.tre") + astral_trees = glob.glob("results/phylogeny-*/astral/*/species_tree.tre") + all_trees = ",".join(iqtree_trees + raxml_trees + astral_trees) + else: + all_trees = qs_args.intrees + + cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/rphylogenetics:4.0.3", "Rscript", "bin/plot-tree.R"] + cmd += ["plot", os.getcwd(), qs_args.seed, all_trees, qs_args.outgroup, qs_args.lineagefile, qs_args.level, qs_args.outprefix] + + print(cmd) + if debug: + print(cmd) + for line in execute_command(cmd, not qs_args.quiet): + print(line, end="\r") + if which_util == "plot-conflicts": + qs_parser = UtilParser(add_help=False) + qs_parser.add_argument("-i","--intrees", action="store", default=None) + qs_parser.add_argument("-g", "--outgroup", action="store", default="none") + qs_parser.add_argument("-l", "--lineagefile", action="store", default="none") + qs_parser.add_argument("-q", "--quartetfile", action="store", default=None) + qs_parser.add_argument("-e", "--level", action="store", default="none") + qs_parser.add_argument("-s", "--seed", action="store", default="random") + qs_parser.add_argument("--treelist", action="store", default=None) + qs_parser.add_argument("--quiet", action="store_true", default=False) + qs_args = qs_parser.parse_args(args.arguments) + if qs_args.help: + print("help") + sys.exit(0) + if not qs_args.intrees: + print(now(),"No input trees specified") + sys.exit(0) + elif len(qs_args.intrees.split(",")) != 2: + print(now(), "Need exactly two trees for comparison, paths should be seperated by a comma (,).") + sys.exit(0) + else: + all_trees = qs_args.intrees + if not qs_args.quartetfile: + print(now(), "You need to provide a quartet conflict file.") + sys.exit(1) + if not qs_args.treelist: + print(now(), "You need to provide the treelist file.") + sys.exit(1) + + cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/rphylogenetics:4.0.3", "Rscript", "bin/plot-tree.R"] + cmd += ["conflicts", os.getcwd(), qs_args.seed, all_trees, qs_args.outgroup, qs_args.lineagefile, qs_args.level, qs_args.quartetfile, qs_args.treelist] + print(cmd) + if debug: + print(cmd) + for line in execute_command(cmd, not qs_args.quiet): + print(line, end="\r") else: print("Runmode not recognized: %s" % args.command) print("Please run phylociraptor -h to see avilable options") From 3f65b2248376650d277cefd724d310b538970fde Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Wed, 25 May 2022 11:09:52 +0200 Subject: [PATCH 06/26] Create phylociraptor_modules to as part of cleaning up code in phylociraptor script. Rename compare_tree.py script --- ...{compare_trees.py => estimate_conflict.py} | 0 bin/phylociraptor_modules/outfiles.py | 40 +++ bin/phylociraptor_modules/usageinfo.py | 306 ++++++++++++++++++ phylociraptor | 295 ++--------------- 4 files changed, 364 insertions(+), 277 deletions(-) rename bin/{compare_trees.py => estimate_conflict.py} (100%) create mode 100644 bin/phylociraptor_modules/outfiles.py create mode 100644 bin/phylociraptor_modules/usageinfo.py diff --git a/bin/compare_trees.py b/bin/estimate_conflict.py similarity index 100% rename from bin/compare_trees.py rename to bin/estimate_conflict.py diff --git a/bin/phylociraptor_modules/outfiles.py b/bin/phylociraptor_modules/outfiles.py new file mode 100644 index 0000000..79c6281 --- /dev/null +++ b/bin/phylociraptor_modules/outfiles.py @@ -0,0 +1,40 @@ + +outfile_dict = { + "setup": ["results/checkpoints/modes/phylogenomics_setup.done"], + "orthology": ["results/checkpoints/modes/phylogenomics_setup.done"], + "filter-orthology": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done"], + "align": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done"], + "filter-align": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done"], + "speciestree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], + "njtree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], + "mltree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], + "modeltest": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], + "report": ["results/checkpoints/modes/phylogenomics_setup.done"] + } + +steps_to_check = ["setup", "orthology", "filter-orthology", "align", "filter-align", "njtree", "modeltest", "mltree", "speciestree"] +checkpoint_file_dict = { + "setup": "results/checkpoints/modes/phylogenomics_setup.done", + "orthology": "results/checkpoints/modes/orthology.done", + "filter-orthology": "results/checkpoints/modes/filter_orthology.done", + "align": "results/checkpoints/modes/align.done", + "filter-align": "results/checkpoints/modes/filter_align.done", + "speciestree": "results/checkpoints/modes/speciestree.done", + "njtree": "results/checkpoints/modes/njtree.done", + "mltree": "results/checkpoints/modes/trees.done", + "modeltest": "results/checkpoints/modes/modeltest.done" +} + + +outdir_dict = { + "setup": ["results/orthology/busco/busco_set", "results/assemblies", "results/downloaded_genomes"], + "orthology": ["results/orthology/busco"], + "align": ["results/alignments"], + "filter-orthology": ["results"], + "align": ["results"], + "filter-align": ["results/alignments/trimmed", "results/alignments/filtered"], + "speciestree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step + "njtree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step + "mltree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step + "modeltest": ["results/modeltest"] +} diff --git a/bin/phylociraptor_modules/usageinfo.py b/bin/phylociraptor_modules/usageinfo.py new file mode 100644 index 0000000..0823bda --- /dev/null +++ b/bin/phylociraptor_modules/usageinfo.py @@ -0,0 +1,306 @@ + +def hi(): + print("hi") + +#some basic variables needed: +try: + with open(".version", "r") as file: + version = file.readline().strip("\n") +except FileNotFoundError: + version = "unkown" + +phylociraptor = """ + Welcome to + __ __ _ __ + ____ / /_ __ __/ /___ _____(_)________ _____ / /_____ _____ + / __ \/ __ \/ / / / / __ \/ ___/ / ___/ __ `/ __ \/ __/ __ \/ ___/ + / /_/ / / / / /_/ / / /_/ / /__/ / / / /_/ / /_/ / /_/ /_/ / / + / .___/_/ /_/\__, /_/\____/\___/_/_/ \__,_/ .___/\__/\____/_/ +/_/ /____/ /_/ + + the rapid phylogenomic tree calculator, v%s +""" % version + +default_help = phylociraptor + """ + +Usage: phylociraptor + +Commands: + setup Setup pipeline + orthology Infer orthologs in a set of genomes + filter-orthology Filter orthology results + align Create alignments for orthologous genes + filter-align Trim and filter alignments + modeltest Calculate gene-trees and perform modeltesting + mltree Calculate Maximum-Likelihood phylogenomic trees + speciestree Calculate species tree + njtree Calculate Neighbor-Joining tree + + report Create a HTML report of the run + check Quickly check status of the run + + -v, --version Print version + -h, --help Display help + +Examples: + To see options for the setup step: + ./phylociraptor setup -h + + To run orthology inferrence for a set of genomes on a SLURM cluster: + ./phylociraptor orthology -t slurm -c data/cluster-config-SLURM.yaml + + To filter alignments overwriting the number of parsimony informative sites set in the config file: + ./phylciraptor filter-align --npars_cutoff 12 + +""" +standard_arguments= """ +Argumemts: + -t, --cluster Specify cluster type. Options: slurm, sge, torque, serial. + -c, --cluster-config Specify Cluster config file path. Default: data/cluster-config-CLUSTERTYPE.yaml.template + -f, --force Soft force runmode which has already been run. + -F, --FORCE Hard force runmode recreating all output. + + --dry Make a dry run. + --verbose Display more output. + -h, --help Display help. +""" + +additional_arguments = """ + +Additonal customization (optional): + --singularity= Pass additional arguments to the singularity containers (eg. additional bindpoints). + Have to be put under quotes " or ' + --snakemake= Pass additional arguments to snakemake. + Have to be put under quotes " or ' + --rerun-incomplete This can be used if the analysis fails with an error indicating incomplete files. + Will be passed on to snakemake. Equivalent to --snakemake="--rerun-incomplete". """ + +setup_help = """ +phylociraptor setup - will prepare your analysis + +Usage: phylociraptor setup +""" + standard_arguments + additional_arguments + """ + --config-file Custom config-file path. (Default: data/config.yaml) + --busco_set BUSCO set to download. (Default: value from config.yaml) + --samples_csv Samples CSV file path. (Default: value from config.yaml) + --add_genomes Will only add additional genomes specified in th CSV file. +""" + +orthology_help = """ +phylociraptor orthology - Will infer orthologous genes in a set of genomes. + +Usage: phylociraptor orthology +""" + standard_arguments + additional_arguments + """ + --config-file Custom config-file path. (Default: data/config.yaml) + --busco_threads Number of threads for each BUSCO run. (Default: value from config.yaml) + --augustus_species Pretrained species for Augustus. (Deafult: value from config.yaml) + --additional_params Additional parameter passed on to BUSCO. Must be placed inside quotes. (Default: value from config.yaml) + +""" + +forthology_help = """ +phylociraptor filter-orthology - Will filter orthology results produced by phylociraptor orthology. + +Usage: phylociraptor filter-orthology + +""" + standard_arguments + additional_arguments + """ + --config-file Custom config-file path. (Default: data/config.yaml) + --dupseq Set how occasionally found duplicated sequences should be handled. + Options: persample; remove only samples with duplicated sequences + perfiler; remove complete file + (Default: value from config.yaml) + --cutoff Minimum BUSCO completeness for a sample to be kept. + (Default: value from config.yaml) + --minsp Mimimum number of species that need to have a BUSCO gene for it to be kept. + (Default: value from config.yaml) + --seq_type Type of sequence data to use. Options (aa, nu). + (Default: value from config.yaml) + +""" + +align_help = """ +phylociraptor align - Will create alignments for a set of single-copy orthologous genes. + +Usage: phylociraptor align + +""" + standard_arguments + additional_arguments + """ + --config-file Custom config-file path. (Default: data/config.yaml) + --method Alignment method. Options: mafft (Default: mafft; read from config.yaml) + --parameters Commandline arguments for alignment method. (Default: read from config.yaml) + --threads Number of threads for alignment step. (Default: read from config.yaml) + +""" + +falign_help = """ +phylociraptor filter-align - Will filter alignments. + +Usage: phylociraptor filter-align + +""" + standard_arguments + additional_arguments + """ + --config-file Custom config-file path. (Default: data/config.yaml) + --min_parsimony_sites Minimum number of parsimony informative sites in each alignments. + --method Trimming method. Options: trimal, aliscore (Default: read from config.yaml) + --parameters Commandline arguments for trimming method. (Default: read from config.yaml) + +""" + +tree_help = """ +phylociraptor tree - Will calculate phylogenomic trees based on supermatrix (concatenated alignment). + +Usage: phylociraptor tree + +""" + standard_arguments + additional_arguments + """ + --config-file Custom config-file path. (Default: data/config.yaml) +""" + +sptree_help = """ +phylociraptor speciestree - Will calculate single-gene trees and a species tree. + +Usage: phylociraptor speciestree + +""" + standard_arguments + additional_arguments + """ + --config-file Custom config-file path. (Default: data/config.yaml) +""" + +njtree_help = """ +phylociraptor njtree - Will calculate a NJ tree. + +Usage: phylociraptor njtree + +""" + standard_arguments + additional_arguments + """ + --config-file Custom config-file path. (Default: data/config.yaml) +""" + + +model_help = """ +phylociraptor modeltest - Will perform substitution model tests and calculate a gene tree for each alignment. + +Usage: phylociraptor modeltest + +""" + standard_arguments + additional_arguments + """ + --config-file Custom config-file path. (Default: data/config.yaml) +""" + +report_help = """ +phylociraptor report - Will create a HTML report of the run + +Usage: phylociraptor report + +Argumemts: + --verbose Display more output. + --figure Create a single figure report. + This only works after modeltest has finished. + --config-file Relative custom config-file path. Only required for --figure (Default: data/config.yaml) + -h, --help Display help. + +""" + +check_help = """ +phylociraptor check - Quickly check the status of the run. + +Usage: phylociraptor check + +Argumemts: + --verbose Display more output. + -h, --help Display help. + +""" + +util_help = """ +phylociraptor util - Utilities for a-posteriori analyses of phylociraptor results. + +Usage: phylociraptor util + +Argumemts: + get-lineage retreive full lineage information for all included samples. + estimate-conflict estimate conflict between trees based on the occurence of quartets. + plot-tree plot one or more trees. + plot-conflict plot conflicts between two trees based on quartet comparison. + -h, --help Display help. + +""" + +util_lineage_help = """ +phylociraptor util get-lineage - Download lineage information from NCBI for included samples. + +This is used in other phylociraptor utils to enhance functionality. + +Usage: phylociraptor util get-lineage + +Arguments: + -g, --genomefile Relative path to download_genomes_statistics.txt file + Default: results/statistics/downloaded_genomes_statistics.txt + -o, --outfile Output file name. + --quiet Suppress on-screen output. + +""" + +util_estimate_conflict_help = """ +phylociraptor util estimate-conflict - Estimates conflicts between trees based on quartets. + +Usage: phylociraptor util estimate-conflict + +Required Arguments: + -i, --intrees Relative paths to input trees in results folder. (Default: all) + Default: results/statistics/downloaded_genomes_statistics.txt + -o, --outprefix Output file name prefix. + -q, --nquartets Total number of quartets to be calculated. Use either this or --stopby. + -b, --stopby Stoping criterion. There are two options: + -b conflicts=100 stops as soon as 100 conflicts have been found. + --stopby tipcoverage=100 stops as soon as every tip is in at least 100 quartets. +Optional Arguments: + -s, --seed Random seed number for reproducibility. (Default: random) + -l, --lineagefile Lineagefile created with phylociraptor util get-lineage. + Mandatory when using-a/--selecttaxa. + -a, --selecttaxa Sample quartets only from tips beloning to specific taxa. + Examples: --selecttaxa order=Diperta,Hymenoptera + -t, --threads Number of threads. (Default: 1) + + --quiet Suppress on-screen output. + +""" + +util_plot_tree_help = """ +phylociraptor util plot-tree - Estimates conflicts between trees based on quartets. + +Usage: phylociraptor util plot-tree + +Required Arguments: + -i, --intrees Relative paths to input trees in results folder. (Default: all) + Default: results/statistics/downloaded_genomes_statistics.txt + -o, --outprefix Output file name prefix. + -l, --lineagefile Lineagefile created with phylociraptor util get-lineage. + -e, --level Taxonomic level in lineage file which should be plotted. + +Optional Arguments: + -s, --seed Random seed number for reproducibility. + -g, --outgroup Comma seperated list of tips which should be used as Outgroup. + Trees will be rerooted accordingly. + --quiet Suppress on-screen output. + +""" + +util_plot_conflict_help = """ +phylociraptor util plot-conflict - Visualizes conflicts between two trees. + +Usage: phylociraptor util plot-conflict + +Required Arguments: + -i, --intrees Two trees for which conflicts should be visualized. + Naming follows the first column in the *.treelist.csv file. + Example: -i T1-T5 will plot conflicts between T1 and T5 in the + treelist file. + -q, --quartetfile *.quartets.csv file from phylociraptor util estimate-conflicts. + -r, --treelist *.treelist.csv file from phylociraptor util estimate-conflicts. + +Optional Arguments: + -s, --seed Random seed number for reproducibility. + -l, --lineagefile Lineagefile created with phylociraptor util get-lineage. + Is required when -e/--level should be used. + -e, --level Taxonomic level in lineage file which should be plotted. + Has to be used together with -l/--lineagefile. + -g, --outgroup Comma seperated list of tips which should be used as Outgroup. + Trees will be rerooted accordingly. + --quiet Suppress on-screen output. + +""" diff --git a/phylociraptor b/phylociraptor index 82a0cac..6b8ff9c 100755 --- a/phylociraptor +++ b/phylociraptor @@ -1,229 +1,14 @@ #!/usr/bin/env python3 import sys, os, io +sys.path.insert(0, os.getcwd()+"/bin") +from phylociraptor_modules.usageinfo import * +from phylociraptor_modules.outfiles import * import time import subprocess import argparse import glob -#some basic variables needed: -try: - with open(".version", "r") as file: - version = file.readline().strip("\n") -except FileNotFoundError: - version = "unkown" - -phylociraptor = """ - Welcome to - __ __ _ __ - ____ / /_ __ __/ /___ _____(_)________ _____ / /_____ _____ - / __ \/ __ \/ / / / / __ \/ ___/ / ___/ __ `/ __ \/ __/ __ \/ ___/ - / /_/ / / / / /_/ / / /_/ / /__/ / / / /_/ / /_/ / /_/ /_/ / / - / .___/_/ /_/\__, /_/\____/\___/_/_/ \__,_/ .___/\__/\____/_/ -/_/ /____/ /_/ - - the rapid phylogenomic tree calculator, v%s -""" % version - -default_help = phylociraptor + """ - -Usage: phylociraptor - -Commands: - setup Setup pipeline - orthology Infer orthologs in a set of genomes - filter-orthology Filter orthology results - align Create alignments for orthologous genes - filter-align Trim and filter alignments - modeltest Calculate gene-trees and perform modeltesting - mltree Calculate Maximum-Likelihood phylogenomic trees - speciestree Calculate species tree - njtree Calculate Neighbor-Joining tree - - report Create a HTML report of the run - check Quickly check status of the run - - -v, --version Print version - -h, --help Display help - -Examples: - To see options for the setup step: - ./phylociraptor setup -h - - To run orthology inferrence for a set of genomes on a SLURM cluster: - ./phylociraptor orthology -t slurm -c data/cluster-config-SLURM.yaml - - To filter alignments overwriting the number of parsimony informative sites set in the config file: - ./phylciraptor filter-align --npars_cutoff 12 - -""" -standard_arguments= """ -Argumemts: - -t, --cluster Specify cluster type. Options: slurm, sge, torque, serial. - -c, --cluster-config Specify Cluster config file path. Default: data/cluster-config-CLUSTERTYPE.yaml.template - -f, --force Soft force runmode which has already been run. - -F, --FORCE Hard force runmode recreating all output. - - --dry Make a dry run. - --verbose Display more output. - -h, --help Display help. -""" - -additional_arguments = """ - -Additonal customization (optional): - --singularity= Pass additional arguments to the singularity containers (eg. additional bindpoints). - Have to be put under quotes " or ' - --snakemake= Pass additional arguments to snakemake. - Have to be put under quotes " or ' - --rerun-incomplete This can be used if the analysis fails with an error indicating incomplete files. - Will be passed on to snakemake. Equivalent to --snakemake="--rerun-incomplete". """ - -setup_help = """ -phylociraptor setup - will prepare your analysis - -Usage: phylociraptor setup -""" + standard_arguments + additional_arguments + """ - --config-file Custom config-file path. (Default: data/config.yaml) - --busco_set BUSCO set to download. (Default: value from config.yaml) - --samples_csv Samples CSV file path. (Default: value from config.yaml) - --add_genomes Will only add additional genomes specified in th CSV file. -""" - -orthology_help = """ -phylociraptor orthology - Will infer orthologous genes in a set of genomes. - -Usage: phylociraptor orthology -""" + standard_arguments + additional_arguments + """ - --config-file Custom config-file path. (Default: data/config.yaml) - --busco_threads Number of threads for each BUSCO run. (Default: value from config.yaml) - --augustus_species Pretrained species for Augustus. (Deafult: value from config.yaml) - --additional_params Additional parameter passed on to BUSCO. Must be placed inside quotes. (Default: value from config.yaml) - -""" - -forthology_help = """ -phylociraptor filter-orthology - Will filter orthology results produced by phylociraptor orthology. - -Usage: phylociraptor filter-orthology - -""" + standard_arguments + additional_arguments + """ - --config-file Custom config-file path. (Default: data/config.yaml) - --dupseq Set how occasionally found duplicated sequences should be handled. - Options: persample; remove only samples with duplicated sequences - perfiler; remove complete file - (Default: value from config.yaml) - --cutoff Minimum BUSCO completeness for a sample to be kept. - (Default: value from config.yaml) - --minsp Mimimum number of species that need to have a BUSCO gene for it to be kept. - (Default: value from config.yaml) - --seq_type Type of sequence data to use. Options (aa, nu). - (Default: value from config.yaml) - -""" - -align_help = """ -phylociraptor align - Will create alignments for a set of single-copy orthologous genes. - -Usage: phylociraptor align - -""" + standard_arguments + additional_arguments + """ - --config-file Custom config-file path. (Default: data/config.yaml) - --method Alignment method. Options: mafft (Default: mafft; read from config.yaml) - --parameters Commandline arguments for alignment method. (Default: read from config.yaml) - --threads Number of threads for alignment step. (Default: read from config.yaml) - -""" - -falign_help = """ -phylociraptor filter-align - Will filter alignments. - -Usage: phylociraptor filter-align - -""" + standard_arguments + additional_arguments + """ - --config-file Custom config-file path. (Default: data/config.yaml) - --min_parsimony_sites Minimum number of parsimony informative sites in each alignments. - --method Trimming method. Options: trimal, aliscore (Default: read from config.yaml) - --parameters Commandline arguments for trimming method. (Default: read from config.yaml) - -""" - -tree_help = """ -phylociraptor tree - Will calculate phylogenomic trees based on supermatrix (concatenated alignment). - -Usage: phylociraptor tree - -""" + standard_arguments + additional_arguments + """ - --config-file Custom config-file path. (Default: data/config.yaml) -""" - -sptree_help = """ -phylociraptor speciestree - Will calculate single-gene trees and a species tree. - -Usage: phylociraptor speciestree - -""" + standard_arguments + additional_arguments + """ - --config-file Custom config-file path. (Default: data/config.yaml) -""" - -njtree_help = """ -phylociraptor njtree - Will calculate a NJ tree. - -Usage: phylociraptor njtree - -""" + standard_arguments + additional_arguments + """ - --config-file Custom config-file path. (Default: data/config.yaml) -""" - - -model_help = """ -phylociraptor modeltest - Will perform substitution model tests and calculate a gene tree for each alignment. - -Usage: phylociraptor modeltest - -""" + standard_arguments + additional_arguments + """ - --config-file Custom config-file path. (Default: data/config.yaml) -""" - -report_help = """ -phylociraptor report - Will create a HTML report of the run - -Usage: phylociraptor report - -Argumemts: - --verbose Display more output. - --figure Create a single figure report. - This only works after modeltest has finished. - --config-file Relative custom config-file path. Only required for --figure (Default: data/config.yaml) - -h, --help Display help. - -""" - -check_help = """ -phylociraptor check - Quickly check the status of the run. - -Usage: phylociraptor check - -Argumemts: - --verbose Display more output. - -h, --help Display help. - -""" - -util_help = """ -phylociraptor util - Utilities for a-posteriori analyses of phylociraptor results. - -Usage: phylociraptor util - -Argumemts: - get-lineage retreive full lineage information for all included samples. - compare-trees compare trees based on the occurence of quartets. - plot-tree plot one or more trees. - plot-conflicts plot conflicts between two trees based on quartet comparison. - -h, --help Display help. - -""" - if sys.version_info[0] < 3: raise Exception("Must be using Python 3") exit(1) @@ -233,45 +18,6 @@ latency_wait = "10" #in seconds singularity_bindpoints = "-B $(pwd)/.usr_tmp/:/usertmp" debug = False #turn debugging mode on (True) and off (False) cluster_config_defaults= {"slurm": "data/cluster-config-SLURM.yaml.template", "sge":"data/cluster-config-SGE.yaml.template", "torque":"data/cluster-config-TORQUE.yaml.template"} -outfile_dict = { - "setup": ["results/checkpoints/modes/phylogenomics_setup.done"], - "orthology": ["results/checkpoints/modes/phylogenomics_setup.done"], - "filter-orthology": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done"], - "align": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done"], - "filter-align": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done"], - "speciestree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], - "njtree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], - "mltree": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], - "modeltest": ["results/checkpoints/modes/phylogenomics_setup.done", "results/checkpoints/modes/orthology.done", "results/checkpoints/modes/filter_orthology.done", "results/checkpoints/modes/align.done", "results/checkpoints/modes/filter_align.done"], - "report": ["results/checkpoints/modes/phylogenomics_setup.done"] - } - -steps_to_check = ["setup", "orthology", "filter-orthology", "align", "filter-align", "njtree", "modeltest", "mltree", "speciestree"] -checkpoint_file_dict = { - "setup": "results/checkpoints/modes/phylogenomics_setup.done", - "orthology": "results/checkpoints/modes/orthology.done", - "filter-orthology": "results/checkpoints/modes/filter_orthology.done", - "align": "results/checkpoints/modes/align.done", - "filter-align": "results/checkpoints/modes/filter_align.done", - "speciestree": "results/checkpoints/modes/speciestree.done", - "njtree": "results/checkpoints/modes/njtree.done", - "mltree": "results/checkpoints/modes/trees.done", - "modeltest": "results/checkpoints/modes/modeltest.done" -} - - -outdir_dict = { - "setup": ["results/orthology/busco/busco_set", "results/assemblies", "results/downloaded_genomes"], - "orthology": ["results/orthology/busco"], - "align": ["results/alignments"], - "filter-orthology": ["results"], - "align": ["results"], - "filter-align": ["results/alignments/trimmed", "results/alignments/filtered"], - "speciestree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step - "njtree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step - "mltree": [""], #donefile will have to do as check, because there are several possible output folder combinations for this step - "modeltest": ["results/modeltest"] -} def now(): @@ -1166,27 +912,25 @@ elif args.command == "util": if which_util == "get-lineage": gl_parser = UtilParser(add_help=False) gl_parser.add_argument("-g","--genomefile", action="store", default="results/statistics/downloaded_genomes_statistics.txt") - gl_parser.add_argument("-o", "--output", action="store") + gl_parser.add_argument("-o", "--outfile", action="store") gl_parser.add_argument("--quiet", action="store_true", default=False) gl_args = gl_parser.parse_args(args.arguments) -# print(gl_args) if gl_args.help: - print("help") - #print(help_message(getlineage_help)) + print(help_message(util_lineage_help)) sys.exit(0) - if not gl_args.output: - print("You need to specify an output file with -o/--output") + if not gl_args.outfile: + print(now(), "You need to specify an output file with -o/--outfile") sys.exit(1) if not os.path.isfile(gl_args.genomefile): - print("Genome file not found") + print(now(), "Genome file not found:",gl_args.genomefile) sys.exit(1) - cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/biopython_plus:1.77", "python", "bin/get_lineage.py", gl_args.genomefile, gl_args.output] + cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/biopython_plus:1.77", "python", "bin/get_lineage.py", gl_args.genomefile, gl_args.outfile] if debug: print(cmd) for line in execute_command(cmd, not gl_args.quiet): print(line, end="\r") sys.exit(0) - if which_util == "compare-trees": + if which_util == "estimate-conflict": qs_parser = UtilParser(add_help=False) qs_parser.add_argument("-i","--intrees", action="store", default="all") qs_parser.add_argument("-o", "--outfile", action="store") @@ -1195,12 +939,11 @@ elif args.command == "util": qs_parser.add_argument("-t", "--threads", action="store", default="1") qs_parser.add_argument("-l", "--lineagefile", action="store") qs_parser.add_argument("-b", "--stopby", action="store", default=None) - qs_parser.add_argument("--selecttaxa", action="store") + qs_parser.add_argument("-a", "--selecttaxa", action="store") qs_parser.add_argument("--quiet", action="store_true", default=False) qs_args = qs_parser.parse_args(args.arguments) -# print(gl_args) if qs_args.help: - print("help") + print(help_message(util_estimate_conflict_help)) sys.exit(0) if not qs_args.outfile: print(now(),"You need to specify an output file with -o/--outfile") @@ -1216,7 +959,7 @@ elif args.command == "util": all_trees = ",".join(iqtree_trees + raxml_trees + astral_trees) else: all_trees = qs_args.intrees - cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/phylopy:2", "python", "bin/compare_trees.py", "-i", all_trees, "-o", qs_args.outfile, "-s", qs_args.seed, "-t", qs_args.threads] + cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/phylopy:2", "python", "bin/estimate_conflict.py", "-i", all_trees, "-o", qs_args.outfile, "-s", qs_args.seed, "-t", qs_args.threads] if qs_args.lineagefile and qs_args.selecttaxa: cmd += ["-l", qs_args.lineagefile, "--selecttaxa", qs_args.selecttaxa] @@ -1244,7 +987,7 @@ elif args.command == "util": qs_parser.add_argument("--quiet", action="store_true", default=False) qs_args = qs_parser.parse_args(args.arguments) if qs_args.help: - print("help") + print(help_message(util_plot_tree_help)) sys.exit(0) if not qs_args.intrees: print(now(),"No input trees specified") @@ -1261,12 +1004,11 @@ elif args.command == "util": cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/rphylogenetics:4.0.3", "Rscript", "bin/plot-tree.R"] cmd += ["plot", os.getcwd(), qs_args.seed, all_trees, qs_args.outgroup, qs_args.lineagefile, qs_args.level, qs_args.outprefix] - print(cmd) if debug: print(cmd) for line in execute_command(cmd, not qs_args.quiet): print(line, end="\r") - if which_util == "plot-conflicts": + if which_util == "plot-conflict": qs_parser = UtilParser(add_help=False) qs_parser.add_argument("-i","--intrees", action="store", default=None) qs_parser.add_argument("-g", "--outgroup", action="store", default="none") @@ -1274,17 +1016,17 @@ elif args.command == "util": qs_parser.add_argument("-q", "--quartetfile", action="store", default=None) qs_parser.add_argument("-e", "--level", action="store", default="none") qs_parser.add_argument("-s", "--seed", action="store", default="random") - qs_parser.add_argument("--treelist", action="store", default=None) + qs_parser.add_argument("-r", "--treelist", action="store", default=None) qs_parser.add_argument("--quiet", action="store_true", default=False) qs_args = qs_parser.parse_args(args.arguments) if qs_args.help: - print("help") + print(help_message(util_plot_conflict_help)) sys.exit(0) if not qs_args.intrees: print(now(),"No input trees specified") sys.exit(0) elif len(qs_args.intrees.split(",")) != 2: - print(now(), "Need exactly two trees for comparison, paths should be seperated by a comma (,).") + print(now(), "Need exactly two trees for comparison, seperated by a comma (,).") sys.exit(0) else: all_trees = qs_args.intrees @@ -1298,7 +1040,6 @@ elif args.command == "util": cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/rphylogenetics:4.0.3", "Rscript", "bin/plot-tree.R"] cmd += ["conflicts", os.getcwd(), qs_args.seed, all_trees, qs_args.outgroup, qs_args.lineagefile, qs_args.level, qs_args.quartetfile, qs_args.treelist] - print(cmd) if debug: print(cmd) for line in execute_command(cmd, not qs_args.quiet): From f3fefb747cd771b81bcdf8106d9b2b6e199052ce Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Wed, 25 May 2022 11:10:49 +0200 Subject: [PATCH 07/26] update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 8087843..8ea8509 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,5 @@ numerical_underflow/ docs/_build/ docs/.DS_Store docs/*/.DS_Store +bin/phylociraptor_modules/__pycache__/ +*.pdf From 4d4552cf69e67ff67e38da16fb1d3ad977726160 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Wed, 25 May 2022 11:14:28 +0200 Subject: [PATCH 08/26] phylocriptor docker now by default updates its image as singularity images are pulled inside of it --- phylociraptor-docker | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/phylociraptor-docker b/phylociraptor-docker index 4db93ff..232eb61 100755 --- a/phylociraptor-docker +++ b/phylociraptor-docker @@ -46,7 +46,15 @@ fi TMPFS="-v $(pwd)/.tmp:/tmp" BINDPOINTS="-v $(pwd):/home/$US/phylociraptor" -docker run --rm -it $DOCKER_USER $BINDPOINTS --privileged reslp/phylociraptor:$VER phylociraptor $@ - +REUSE="TRUE" #when this is enabled, the container is continuously updated as singularity images are pulled. So they don't have to be pulled multiple times. +if [[ $REUSE == "TRUE" ]]; then + echo "docker run -it $DOCKER_USER $BINDPOINTS --privileged reslp/phylociraptor:$VER phylociraptor $@" + docker run -it $DOCKER_USER $BINDPOINTS --privileged reslp/phylociraptor:$VER phylociraptor $@ + newid=$(docker container ls -all | grep "reslp/phylociraptor:$VER" | cut -d" " -f 1) + docker commit $newid reslp/phylociraptor:$VER &> /dev/null + docker container rm $newid &> /dev/null +else + docker run --rm -it $DOCKER_USER $BINDPOINTS --privileged reslp/phylociraptor:$VER phylociraptor $@ +fi #Command will look something like this: #docker run --rm -it --user $(id -u):$(id -g) -v $(pwd):/home/$USER/phylociraptor --privileged reslp/phylociraptor:0.9.1 From 607db7388ec529a7446953bee30be3f4c20d8523 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Fri, 27 May 2022 11:54:36 +0200 Subject: [PATCH 09/26] Add R scripts used in phylociraptor util --- bin/plot-similarity-matrix.R | 105 ++++++ bin/plot-tree.R | 620 +++++++++++++++++++++++++++++++++++ 2 files changed, 725 insertions(+) create mode 100644 bin/plot-similarity-matrix.R create mode 100644 bin/plot-tree.R diff --git a/bin/plot-similarity-matrix.R b/bin/plot-similarity-matrix.R new file mode 100644 index 0000000..71dc367 --- /dev/null +++ b/bin/plot-similarity-matrix.R @@ -0,0 +1,105 @@ +library(ggplot2) +library(reshape2) +library(ape) +library(reshape2) +library(tidyverse) + +args <- commandArgs(trailingOnly=TRUE) + +wd <- args[1] +matrix_file <- args[2] +treelistfile <- args[3] +verbose <- TRUE + +setwd(wd) +cat(paste("Working directory: ", getwd(), "\n")) +data <- read.csv(matrix_file) + +data <- melt(data) +colnames(data) <- c("First", "Second", "Similarity") +data$First <- factor(data$First, levels=levels(data$Second)) + +pdf(file="similarity-matrix.pdf", width=20, height=10) +ggplot(data, aes(First, Second, fill=Similarity)) + geom_tile() + ggtitle("% similarity of pairs of trees") + geom_text(aes(label = format(round(Similarity, 4), nsmall=2))) +dev.off() +cat("Plot file: similarity-matrix.pdf created successfully.\n") + + +if (treelistfile != "none") { + ### taken from: https://github.com/andersgs/harrietr/blob/master/R/melt_distance.R + melt_dist <- function(dist, order = NULL, dist_name = 'dist') { + if(!is.null(order)){ + dist <- dist[order, order] + } else { + order <- row.names(dist) + } + diag(dist) <- NA + dist[upper.tri(dist)] <- NA + dist_df <- as.data.frame(dist) + dist_df$iso1 <- row.names(dist) + dist_df <- dist_df %>% + tidyr::gather_(key = "iso2", value = lazyeval::interp("dist_name", dist_name = as.name(dist_name)), order, na.rm = T) + return(dist_df) + } + ### + + + + treelist <- read.csv(treelistfile, header=F, check.names=FALSE, sep="\t") + colnames(treelist) <- c("tree", "path") + + cat("Reading trees from treelist file:\n") + trees <- list() + i <- 1 + for (treepath in treelist$path) { + + trees[[i]]<-read.tree(treepath) + i <- i + 1 + } + + all_tips <- c() + + cat("Extracting all tips from tree...") + for (i in 1:length(trees)) { + all_tips <- c(all_tips, trees[[i]]$tip.label) + } + all_tips <- unique(all_tips) + cat("done\n") + + dists <- list() + cat("Now calculating distances, please be patient...\n") + for (i in 1:length(trees)) { + cat(paste0("Calculating pairwise distances for tree ", i, "...\n")) + dist <- cophenetic.phylo(trees[[i]]) + melted_dist <- melt_dist(dist) + visited <- c() + remaining_tips <- all_tips + names <- c() + distances <- c() + for (sp1 in all_tips) { + visited <- c(visited, sp1) + remaining_tips <- remaining_tips[!remaining_tips %in% visited] + for (sp2 in remaining_tips) { + if (verbose == TRUE) { + cat(paste0("Tree ",i, ": ", sp1, " - ", sp2, "\n")) + } + if (sp1 == sp2) {next} + distance <- subset(melted_dist, (iso1==sp1 & iso2==sp2))$dist + if (length(distance) == 0) { + distance <- subset(melted_dist, (iso1==sp2 & iso2==sp1))$dist + } + if (length(distance) == 0) {distance <- NA} + distances <- c(distances, distance) + names <- c(names, paste0(sp1, "-",sp2)) + } + } + df <- data.frame(distances) + rownames(df) <- names + colnames(df) <- paste0("tree", i) + dists[[i]] <- df + } + complete_df <- do.call("cbind", dists) + complete_df <- t(complete_df) + write.csv(file="pairwise-distance-matrix.csv", complete_df, sep=",") +} + diff --git a/bin/plot-tree.R b/bin/plot-tree.R new file mode 100644 index 0000000..eea235f --- /dev/null +++ b/bin/plot-tree.R @@ -0,0 +1,620 @@ +library(phytools, quietly = T) +library(ape, quietly = T) +library(ggtree, quietly = T) +library(magrittr, quietly = T) +library(tidytree, quietly = T) +library(patchwork, quietly = T) +library(cowplot, quietly = T) +#library(stringr) +library(ggplot2, quietly = T) +library(reshape2, quietly = T) + +args <- commandArgs(trailingOnly=TRUE) + +# variables which need to be passed to the script from the command line +runmode <- args[1] # different run modes should be: plot, conflicts and +wd <- args[2] +setwd(wd) +print(wd) +print(args) +if (runmode == "plot") { + print("Will plot trees...") + seed <- args[3] + treenames <- args[4] + outgroup <- args[5] + lineage_file <- args[6] + level <- args[7] + outprefix <- args[8] + +} else if (runmode == "conflicts") { + cat("Will plot conflicts...\n") + seed <- args[3] + treenames <- args[4] + outgroup <- args[5] + lineage_file <- args[6] + level <- args[7] + conflictfile <- args[8] + treelistfile <- args[9] +} else { + cat("Runmode not recognized...\n") + quit(1) +} + + +# reformat commandline argument: +outgroup <- strsplit(outgroup,",")[[1]] +treenames <- strsplit(treenames,",")[[1]] + +print(outgroup) +print(treenames) +print(conflictfile) +print(lineage_file) + +#set seed if specified +if (seed != "random") { + set.seed(seed) +} + + +#quit(1) + +#the settings here are for testing +#level <- "order" +#runmode <- "plot" +#treenames <- c('results/phylogeny-80/iqtree/mafft-trimal/concat.contree', 'results/phylogeny-80/iqtree/clustalo-trimal/concat.contree', 'results/phylogeny-80/iqtree/clustalo-aliscore/concat.contree', 'results/phylogeny-80/iqtree/mafft-aliscore/concat.contree', 'results/phylogeny-85/iqtree/mafft-trimal/concat.contree', 'results/phylogeny-85/iqtree/clustalo-trimal/concat.contree', 'results/phylogeny-85/iqtree/clustalo-aliscore/concat.contree', 'results/phylogeny-85/iqtree/mafft-aliscore/concat.contree', 'results/phylogeny-75/iqtree/mafft-trimal/concat.contree', 'results/phylogeny-75/iqtree/clustalo-trimal/concat.contree', 'results/phylogeny-75/iqtree/clustalo-aliscore/concat.contree', 'results/phylogeny-75/iqtree/mafft-aliscore/concat.contree', 'results/phylogeny-80/astral/mafft-trimal/species_tree.tre', 'results/phylogeny-80/astral/clustalo-trimal/species_tree.tre', 'results/phylogeny-80/astral/clustalo-aliscore/species_tree.tre', 'results/phylogeny-80/astral/mafft-aliscore/species_tree.tre', 'results/phylogeny-85/astral/mafft-trimal/species_tree.tre', 'results/phylogeny-85/astral/clustalo-trimal/species_tree.tre', 'results/phylogeny-85/astral/clustalo-aliscore/species_tree.tre', 'results/phylogeny-85/astral/mafft-aliscore/species_tree.tre', 'results/phylogeny-75/astral/mafft-trimal/species_tree.tre', 'results/phylogeny-75/astral/clustalo-trimal/species_tree.tre', 'results/phylogeny-75/astral/clustalo-aliscore/species_tree.tre', 'results/phylogeny-75/astral/mafft-aliscore/species_tree.tre') + +#outgroup <- c("Ramazzottius_varieornatus", "Hypsibius_dujardini") +#lineage_file <- "lineage_arthropods.txt" +#conflictfile <- "output.txt" + +# load lineage information file and fill missing values +if (lineage_file == "none") { + cat("No lineage file found.\n") +} else { + lineages <- read.csv(lineage_file, header=T, sep="\t", na=c("")) + lineages[is.na(lineages)] <- "missing" +} + + +#load and format quartet conflicts file: +if (runmode == "conflicts") { + if (conflictfile != "none") { + conflicts <- read.csv(conflictfile, header=T, check.names=FALSE) + rownames(conflicts) <- conflicts$quartet + conflicts$quartet <- NULL + conflicts <- t(conflicts) + treelist <- read.csv(treelistfile, header=F, check.names=FALSE, sep="\t") + colnames(treelist) <- c("tree", "path") + #print(conflicts) + } else { + cat("Conflicts file not found. Will stop.\n") + quit(1) + } + +} + + + + +# select comparison for visualization: this is debug code! +#conflicts_t <- t(conflicts[13,]) +#table(conflicts_t) +#rownames(conflicts_t) +#conflicts_t["Drosophila_azteca,Drosophila_rufa-Musca_domestica,Phortica_variegata",] +#conflict_quartets <- conflicts_t +#conflict_quartets <- names(conflicts_t[conflicts_t == 0,]) + + +bs_support <- 90 +pb_support <- 0.95 + + +# the code to draw triangles comes from here: +# https://gist.github.com/jeanmanguy/fe6b1ee46476f29149455124e2a14529 + +get_offsprings <- function(node_to_collapse, phylo) { + # todo: assert that node is scalar + phylo %>% + tidytree::as_tibble() %>% + tidytree::offspring(.node = node_to_collapse) %>% + dplyr::pull(node) +} + +get_offspring_tips <- function(phylo, node_to_collapse) { + # todo: assert that node is scalar + phylo %>% + ggtree::fortify() %>% + tidytree::offspring(.node = node_to_collapse) %>% + dplyr::filter(isTip) %>% + dplyr::pull(node) +} + +remove_collapsed_nodes <- function(phylo, nodes_to_collapse) { + nodes <- purrr::map(nodes_to_collapse, get_offsprings, phylo = phylo) %>% unlist() + phylo %>% + ggtree::fortify() %>% + tibble::as_tibble() %>% + dplyr::filter(!node %in% nodes) +} + +get_collapsed_offspring_nodes_coordinates <- function(phylo, nodes) { + phylo %>% + ggtree::fortify() %>% + tibble::as_tibble() %>% + dplyr::filter(node %in% nodes) %>% + dplyr::summarise(xmax = max(x), xmin = min(x), ymax = max(y), ymin = min(y)) +} + +get_collapsed_node_coordinates <- function(phylo, node_to_collapse) { + # todo: assert that node is scalar + phylo %>% + ggtree::fortify() %>% + tibble::as_tibble() %>% + dplyr::filter(node == node_to_collapse) %>% + dplyr::select(x, y) +} + +get_triangle_coordinates_ <- function(node, phylo, mode = c("max", "min", "mixed")) { + mode <- match.arg(mode) + # for one + tips_to_collapse <- get_offspring_tips(phylo, node) + node_to_collapse_xy <- get_collapsed_node_coordinates(phylo, node) + tips_to_collapse_xy <- get_collapsed_offspring_nodes_coordinates(phylo, tips_to_collapse) + + triange_df <- mode %>% switch( + max = dplyr::data_frame( + x = c(node_to_collapse_xy$x, tips_to_collapse_xy$xmax, tips_to_collapse_xy$xmax), + y = c(node_to_collapse_xy$y, tips_to_collapse_xy$ymin, tips_to_collapse_xy$ymax) + ), + min = data_frame( + x = c(node_to_collapse_xy$x, tips_to_collapse_xy$xmin, tips_to_collapse_xy$xmin), + y = c(node_to_collapse_xy$y, tips_to_collapse_xy$ymin, tips_to_collapse_xy$ymax) + ), + mixed = data_frame( + x = c(node_to_collapse_xy$x, tips_to_collapse_xy$xmin, tips_to_collapse_xy$xmax), + y = c(node_to_collapse_xy$y, tips_to_collapse_xy$ymin, tips_to_collapse_xy$ymax) + ) + ) + return(triange_df) +} + +get_triangle_coordinates <- function(phylo, nodes, mode = c("max", "min", "mixed")) { + mode <- match.arg(mode) + # todo: make sure there is no conflict between nodes (nesting...) + purrr::map(nodes, get_triangle_coordinates_, phylo = phylo, mode = mode) %>% + dplyr::bind_rows(.id = "node_collapsed") +} + +# till here + +is_node_supported <- function(support) { + if (length(support) == 0) {return("no")} + if (support <= 1) { #we are dealing with posterior probabilities + if (support <= pb_support){ + return("no") # no support + } else {return("yes")} + } else { # we are dealing with bootstrap values + if (support <= bs_support){ + return("no") #no support + } else {return("yes")} + + } +} + +generate_color <- function(label) { + colors <- grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = T)] + +} + +get_conflicts <- function(tree, conflict_quartets) { + if (length(conflict_quartets) == 0) {print("WARNING: NO CONFLICTS IN SELECTION!")} + edge_thickness <- rep(1, length(tree$edge.length)+1) + for (quat in conflict_quartets) { + left <- strsplit(quat, "-")[[1]][1] + right <- strsplit(quat, "-")[[1]][2] + left1 <- strsplit(left, ",")[[1]][1] + left2 <- strsplit(left, ",")[[1]][2] + right1 <- strsplit(right, ",")[[1]][1] + right2 <- strsplit(right, ",")[[1]][2] + #print(paste0(left1, "->", left2)) + #print(paste0(right1, "->", right2)) + # npleft <- nodepath(tree, from=which(tree$tip.label == left1), to=which(tree$tip.label == left2)) + # npright <- nodepath(tree, from=which(tree$tip.label == right1), to=which(tree$tip.label == right2)) + #identify nodes between tips + mcanodel <- getMRCA(tree, c(which(tree$tip.label == left1), which(tree$tip.label == left2))) + mcanoder <- getMRCA(tree, c(which(tree$tip.label == right1), which(tree$tip.label == right2))) + pathl1 <- nodepath(tree, from=mcanodel, to=which(tree$tip.label == left1)) + pathl2 <- nodepath(tree, from=mcanodel, to=which(tree$tip.label == left2)) + pathr1 <- nodepath(tree, from=mcanoder, to=which(tree$tip.label == right1)) + pathr2 <- nodepath(tree, from=mcanoder, to=which(tree$tip.label == right2)) + #print(paste0(mcanodel, " ", mcanoder)) + npancestor <- nodepath(tree, from=mcanodel, to=mcanoder) + #print(paste0(mcanodel, " ", mcanoder)) + all_nodes_to_highlight <- c(pathl1, pathl2, pathr1, pathr2, npancestor) + #all_nodes_to_highlight <- c(npancestor) + + #print(all_nodes_to_highlight) + # now select which edges in the tree + p <- ggtree(tr = tree, ladderize = FALSE) + edgeorder <- data.frame(parent=p$data$parent, node=p$data$node) + df <- data.frame(one=edgeorder$parent %in% all_nodes_to_highlight, two=edgeorder$node %in% all_nodes_to_highlight) + + #df <- data.frame(one=tree$edge[,2] %in% all_nodes_to_highlight, two=tree$edge[,1] %in% all_nodes_to_highlight) + boolCols <- sapply(df, is.logical) + selected_rows <- rowSums(df[,boolCols]) == sum(boolCols) + #selected_rows <- rev(selected_rows) + edges_with_conflict <- which(selected_rows == TRUE) + for (edge in edges_with_conflict) { + edge_thickness[edge] = edge_thickness[edge] + 1 + } + + } + thickness <- data.frame(edge=1:length(edge_thickness), conflict=edge_thickness) + thickness$logthick <- log(thickness$conflict+1) + return(thickness) + +} + +get_conflicts_and_support <- function(tree, conflict_quartets) { + if (length(conflict_quartets) == 0) {print("WARNING: NO CONFLICTS IN SELECTION!")} + edge_thickness <- rep(1, length(tree$edge.length)+1) + for (quat in names(conflict_quartets)) { + if (quat == "") {next} # not sure why this happens, but apparently it does sometimes + if (length(strsplit(quat, ",")[[1]]) < 3) { + print(paste0("Not a proper Quartet. Will skip: ", quat)) + next + } + left <- strsplit(quat, "-")[[1]][1] + right <- strsplit(quat, "-")[[1]][2] + left1 <- strsplit(left, ",")[[1]][1] + left2 <- strsplit(left, ",")[[1]][2] + right1 <- strsplit(right, ",")[[1]][1] + right2 <- strsplit(right, ",")[[1]][2] + #print(paste0(left1, "->", left2)) + #print(paste0(right1, "->", right2)) + # npleft <- nodepath(tree, from=which(tree$tip.label == left1), to=which(tree$tip.label == left2)) + # npright <- nodepath(tree, from=which(tree$tip.label == right1), to=which(tree$tip.label == right2)) + #identify nodes between tips + mcanodel <- getMRCA(tree, c(which(tree$tip.label == left1), which(tree$tip.label == left2))) + mcanoder <- getMRCA(tree, c(which(tree$tip.label == right1), which(tree$tip.label == right2))) + pathl1 <- nodepath(tree, from=mcanodel, to=which(tree$tip.label == left1)) + pathl2 <- nodepath(tree, from=mcanodel, to=which(tree$tip.label == left2)) + pathr1 <- nodepath(tree, from=mcanoder, to=which(tree$tip.label == right1)) + pathr2 <- nodepath(tree, from=mcanoder, to=which(tree$tip.label == right2)) + #print(paste0(mcanodel, " ", mcanoder)) + npancestor <- nodepath(tree, from=mcanodel, to=mcanoder) + #print(paste0(mcanodel, " ", mcanoder)) + all_nodes_to_highlight <- c(pathl1, pathl2, pathr1, pathr2) + #all_nodes_to_highlight <- c(npancestor) + + #print(all_nodes_to_highlight) + # now select which edges in the tree + p <- ggtree(tr = tree, ladderize = FALSE) + edgeorder <- data.frame(parent=p$data$parent, node=p$data$node) + df <- data.frame(one=edgeorder$parent %in% all_nodes_to_highlight, two=edgeorder$node %in% all_nodes_to_highlight) + + #df <- data.frame(one=tree$edge[,2] %in% all_nodes_to_highlight, two=tree$edge[,1] %in% all_nodes_to_highlight) + boolCols <- sapply(df, is.logical) + selected_rows <- rowSums(df[,boolCols]) == sum(boolCols) + #selected_rows <- rev(selected_rows) + edges_with_conflict <- which(selected_rows == TRUE) + if (conflict_quartets[quat] == 0 ) { + for (edge in edges_with_conflict) { + edge_thickness[edge] = edge_thickness[edge] + 1 + } + } + #if (conflict_quartets[quat,] == 1 ) { + # for (edge in edges_with_conflict) { + # edge_thickness[edge] = edge_thickness[edge] - 1 + # } + #} + + + } + thickness <- data.frame(edge=1:length(edge_thickness), conflict=edge_thickness) + thickness$logthick <- log(thickness$conflict+1) + return(thickness) + +} + +#################### +# CODE STARTS HERE # +#################### +# TODO: +# - Create Order annotations for singletons +# - Add additional support categories + + + +# generate some colors + + +if (runmode == "plot") { + if (level == "none") { + cat("Plotting tree(s) without lineage information.\n") + all_supports_list <- list() + all_names <- c() + single <- FALSE + for (ntree in 1:length(treenames)) { + #extract filename information: + treename <- treenames[ntree] + bs_cutoff <- strsplit(strsplit(treename,"/")[[1]][2],"-")[[1]][2] + algorithm <- strsplit(treename,"/")[[1]][3] + alitrim <- strsplit(treename,"/")[[1]][4] + prefix <- paste( algorithm, alitrim, bs_cutoff, sep="-") + + #reroot tree + tree <- read.tree(treename) + if (outgroup != "none") { #reroot tree in case an outgroup was specified + rootnode <- getMRCA(tree, outgroup) + tree <- root(tree, node=rootnode, resolve.root = TRUE) + } + #plot(tree) + ntips <- length(tree$tip.label) + + + # this is where we decide how to plot (conflicts or not). Maybe this will be refactored later... + cat(paste0("Plot tree: ", prefix, "\n")) + if (runmode == "conflicts") { + print("Extracting Conflicts:") + conflicts_info <- get_conflicts_and_support(tree, conflict_quartets) + print(conflicts_info) + + t2 <- ggtree(tree, branch.length='none', aes(size=conflicts_info$conflict)) +theme(legend.position = c("none")) +geom_tiplab() + + } else { + cat(" plotting without conflicts...\n") + t2 <- ggtree(tree, branch.length='none') + theme(legend.position = c("none")) +geom_tiplab() + + } + t2 <- t2 + coord_cartesian(clip = 'off') + minx <- ggplot_build(t2)$layout$panel_params[[1]]$x.range[1] + maxx <- ggplot_build(t2)$layout$panel_params[[1]]$x.range[2] + t2 <- t2+xlim(minx, maxx+40) # to create space for the labels + # now we create clade labels for the tree + #clade_label_df <- as.data.frame(nodes_to_collapse) + #clade_label_df$name <- rownames(clade_label_df) + #colnames(clade_label_df) <- c("node", "name") + + #t2 <- t2 + geom_cladelab(data = clade_label_df, mapping = aes(node = node, label = name, color = name), fontsize = 2, offset=27, offset.text=0.3) + #ggplot_build(t2) + #factor(lineages["order"][,1]) + + # extract legend then remove it + + + + cat(paste0(" write PDF: ",prefix,"-",level,"-tree.pdf\n")) + pdf(file = paste0(prefix,"-",level,"-tree.pdf"), width = 10, height=70) + print(t2 + theme(legend.position="none"))#+ plot_layout(guides = 'none')# & theme(legend.position='bottom') + #plot.new() + #print(legend) + dev.off() + if (single == TRUE) {break} + } + } else { + if (length(na.omit(lineages[,level])) <= 11) { + # prettier colors when there are not too many different models + cols <- brewer.pal(length(node_names), "BrBG") + } else { + # we need more colors + color <- grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = T)] + if (length(na.omit(unique(lineages[,level]))) > length(color)){ + # if there are more taxa than available colors sample colors with replacement, this could result in some groups having the same color + cols <- sample(color, length(na.omit(unique(lineages[,level]))), replace=T) + } else {cols <- sample(color, length(na.omit(unique(lineages[,level]))))} + } + + names(cols) <- na.omit(unique(lineages[,level])) + + cat("The awesome code for plotting subtrees as triangles comes from here: https://jean.manguy.eu/post/subtrees-as-triangles-with-ggtree/\n") + all_supports_list <- list() + all_names <- c() + single <- FALSE + for (ntree in 1:length(treenames)) { + #extract filename information: + treename <- treenames[ntree] + bs_cutoff <- strsplit(strsplit(treename,"/")[[1]][2],"-")[[1]][2] + algorithm <- strsplit(treename,"/")[[1]][3] + alitrim <- strsplit(treename,"/")[[1]][4] + prefix <- paste( algorithm, alitrim, bs_cutoff, sep="-") + cat(paste0("Plot tree: ", prefix, "\n")) + #reroot tree + tree <- read.tree(treename) + if (outgroup != "none") { #reroot tree in case an outgroup was specified + rootnode <- getMRCA(tree, outgroup) + tree <- root(tree, node=rootnode, resolve.root = TRUE) + } + #plot(tree) + ntips <- length(tree$tip.label) + + node_names <- c() + node_names_support <- c() + nodes_to_collapse <- c() + node_supports <- c() + for (name in unique(lineages[,level])) { + which_tips <- lineages$name[lineages[level] == name][lineages$name[lineages[level] == name] %in% tree$tip.label] + #print(which_tips) + node <- getMRCA(tree, c(which_tips)) + #print(node) + if (length(node) != 0) { + #check if this taxon level is monophyletic + descendants <- tree$tip.label[getDescendants(tree=tree, node=node)] + descendants <- descendants[!is.na(descendants)] + tiptax <- lineages[level][,1][lineages$name %in% descendants] + + # the next check is true if all tips have the same taxonomic level + if (length(unique(tiptax)) == 1){ + cat(paste0(" OK ", name, "\n")) + #implement check if all descendents have the same label + nodes_to_collapse <- c(nodes_to_collapse, node) + node_supports <- c(node_supports, is_node_supported(as.double(tree$node.label[node-ntips]))) + node_names <- c(node_names, name) + node_names_support <- c(node_names_support, name) + } else {cat(paste0(" PARAPHYLETIC ", name, "\n")) + node_supports <-c(node_supports, "notmono") + node_names_support <- c(node_names_support, name) + } + } else { + cat(paste0(" SINGLETON ", name, "\n")) + } + } + + #gather support values + cat(" Gather support values...\n") + names(node_supports) <- node_names + node_supports <- node_supports[!is.na(names(node_supports))] # get rid of missing values from NAs when taxon level is missing + all_supports_list[[ntree]] <- node_supports + all_names <- c(all_names, prefix) + + cat(" Collapse tree...\n") + names(nodes_to_collapse) <- node_names + collapsed_tree_df <- tree %>% + remove_collapsed_nodes(nodes = nodes_to_collapse) + + triangles_df <- tree %>% + get_triangle_coordinates(nodes_to_collapse) + + cat(" Plot collapsed tree...\n") + t1 <- ggtree::ggtree(collapsed_tree_df) + + geom_polygon( + data = triangles_df, + mapping = aes(group = node_collapsed, fill = node_collapsed), + color = "#333333" + ) + + #scale_fill_brewer(palette = "Set1") + + scale_fill_manual(values = cols) + + theme( + strip.background = element_blank() + ) + t1 <- t1 + scale_x_reverse() + + #need to create a dataframe with just the right taxonomic level to be used for coloring the tree... + simpdf <- lineages[c("name",level)] + colnames(simpdf) <- c("name", "lineage") + + + cat(" Plot second tree without conflicts...\n") + t2 <- ggtree(tree, branch.length='none') %<+% simpdf + geom_tiplab(aes(color = factor(lineage)),size=2, align=TRUE, geom="text") +scale_color_manual(values=cols) +theme(legend.position = c("none")) + + t2 <- t2 + coord_cartesian(clip = 'off') + minx <- ggplot_build(t2)$layout$panel_params[[1]]$x.range[1] + maxx <- ggplot_build(t2)$layout$panel_params[[1]]$x.range[2] + t2 <- t2+xlim(minx, maxx+40) # to create space for the labels + # now we create clade labels for the tree + clade_label_df <- as.data.frame(nodes_to_collapse) + clade_label_df$name <- rownames(clade_label_df) + colnames(clade_label_df) <- c("node", "name") + + t2 <- t2 + geom_cladelab(data = clade_label_df, mapping = aes(node = node, label = name, color = name), fontsize = 2, offset=27, offset.text=0.3) + + # extract legend then remove it + legend <- get_legend(t1) + t1 <- t1 + theme(legend.position="none") + + layout <- " + AABB + " + + cat(paste0(" Write PDF...",prefix,"-",level,"-tree.pdf\n")) + pdf(file = paste0(prefix,"-",level,"-tree.pdf"), width = 10, height=70) + print(t2 + t1 + theme(legend.position="none") + plot_layout(design = layout))#+ plot_layout(guides = 'none')# & theme(legend.position='bottom') + dev.off() + if (single == TRUE) {break} + } + + cat("Generating overview support plot now...\n") + support_df <- as.data.frame(do.call(cbind, all_supports_list)) + support_df + colnames(support_df) <- all_names + support_df + support_df[all_names] <- lapply(support_df[all_names] , factor) + + support_cols <- c("#d53e4f", "#ffffbf","#3288bd") + names(support_cols) <- c("no", "notmono", "yes") + sdf <- melt(t(support_df)) + colnames(sdf) <- c("tree", "group", "supported") + pdf(file=paste0("compare-", level, ".pdf"), width=10, height=10) + print(ggplot(sdf, aes(x = tree, y = group)) + geom_tile(aes(fill=supported),colour = "white") + scale_fill_manual(values=support_cols) +theme_minimal()+theme(axis.title.x=element_blank(),axis.title.y=element_blank(),axis.text.x = element_text(angle = 45, vjust = 0, hjust=0))+scale_x_discrete(position = "top")) + dev.off() + } +}else if (runmode == "conflicts") { + print("plotting conflicts") + treepath1 <- treelist$path[treelist["tree"] == treenames[1]] + bs_cutoff <- strsplit(strsplit(treepath1,"/")[[1]][2],"-")[[1]][2] + algorithm <- strsplit(treepath1,"/")[[1]][3] + alitrim <- strsplit(treepath1,"/")[[1]][4] + prefix1 <- paste( algorithm, alitrim, bs_cutoff, sep="-") + + treepath2 <- treelist$path[treelist["tree"] == treenames[2]] + bs_cutoff <- strsplit(strsplit(treepath2,"/")[[1]][2],"-")[[1]][2] + algorithm <- strsplit(treepath2,"/")[[1]][3] + alitrim <- strsplit(treepath2,"/")[[1]][4] + prefix2 <- paste( algorithm, alitrim, bs_cutoff, sep="-") + + tree1 <- read.tree(treepath1) + tree2 <- read.tree(treepath2) + + if (outgroup != "none") { #reroot tree in case an outgroup was specified + rootnode <- getMRCA(tree1, outgroup) + tree1 <- root(tree1, node=rootnode, resolve.root = TRUE) + rootnode <- getMRCA(tree2, outgroup) + tree2 <- root(tree2, node=rootnode, resolve.root = TRUE) + } else {cat("No outgroup was set. The plotted tree comparison may look weird.\n")} + + conflicts_t <- conflicts[paste0(treenames[1], "-", treenames[2]),] + if (length(names(conflicts_t[conflicts_t == 0])) == 0) { + print("No conflicts found. Nothing to plot.") + quit() + } + + cat(paste0("Plot will be based on ", as.character(length(names(conflicts_t[conflicts_t == 0]))), " conflicting quartets.\n")) + + cat("Extracting Conflicts:\n") + conflicts_info1 <- get_conflicts_and_support(tree1, conflicts_t) + conflicts_info2 <- get_conflicts_and_support(tree2, conflicts_t) + + cat("Plot conflitcs between trees...\n") + if (lineage_file != "none") { + cat("Will add lineage information...\n") + simpdf <- lineages[c("name",level)] + colnames(simpdf) <- c("name", "lineage") + t1 <- ggtree(tree1, branch.length='none', aes(size=conflicts_info1$conflict)) %<+% simpdf + geom_tiplab(aes(color = factor(lineage)),size=4, hjust=0, geom="text") +theme(legend.position = c("none")) + scale_size_continuous(range = c(0.2, 5)) + minx <- ggplot_build(t1)$layout$panel_params[[1]]$x.range[1] + maxx <- ggplot_build(t1)$layout$panel_params[[1]]$x.range[2] + t1 <- t1+xlim(minx, maxx+40) + + t2 <- ggtree(tree2, branch.length='none', aes(size=conflicts_info2$conflict)) %<+% simpdf + geom_tiplab(aes(color = factor(lineage)),size=4, offset=-40, geom="text")+theme(legend.position = c("none")) + scale_size_continuous(range = c(0.2, 5)) + #reverse coordinates and create space for labels + minx <- ggplot_build(t2)$layout$panel_params[[1]]$x.range[1] + maxx <- ggplot_build(t2)$layout$panel_params[[1]]$x.range[2] + t2 <- t2+xlim(maxx+40, minx) + } else { + cat("Plotting without lineage information...\n") + t1 <- ggtree(tree1, branch.length='none', aes(size=conflicts_info1$conflict)) +theme(legend.position = c("none")) + scale_size_continuous(range = c(0.2, 5)) + minx <- ggplot_build(t1)$layout$panel_params[[1]]$x.range[1] + maxx <- ggplot_build(t1)$layout$panel_params[[1]]$x.range[2] + t1 <- t1+xlim(minx, maxx+40) +geom_tiplab(size=4, hjust=0) + + t2 <- ggtree(tree2, branch.length='none', aes(size=conflicts_info2$conflict)) +theme(legend.position = c("none")) + scale_size_continuous(range = c(0.2, 5)) + #reverse coordinates and create space for labels + minx <- ggplot_build(t2)$layout$panel_params[[1]]$x.range[1] + maxx <- ggplot_build(t2)$layout$panel_params[[1]]$x.range[2] + t2 <- t2+xlim(maxx+40, minx) +geom_tiplab(size=4, offset=-40) + + } + layout <- " + AABB + " + pdf(file=paste0("conflicts-", treenames[1],"-",treenames[2], ".pdf"), width=10, height=70) + print(t1 + t2 + theme(legend.position="none") + plot_layout(design = layout))#+ plot_layout(guides = 'none')# & theme(legend.position='bottom') + dev.off() + + +} + + + + + + From 40e63c1a5db3bd8217afd90d5981f5f7371d3927 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Fri, 27 May 2022 11:55:01 +0200 Subject: [PATCH 10/26] Remove printing of docker command --- phylociraptor-docker | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylociraptor-docker b/phylociraptor-docker index 232eb61..af2d153 100755 --- a/phylociraptor-docker +++ b/phylociraptor-docker @@ -48,7 +48,7 @@ TMPFS="-v $(pwd)/.tmp:/tmp" BINDPOINTS="-v $(pwd):/home/$US/phylociraptor" REUSE="TRUE" #when this is enabled, the container is continuously updated as singularity images are pulled. So they don't have to be pulled multiple times. if [[ $REUSE == "TRUE" ]]; then - echo "docker run -it $DOCKER_USER $BINDPOINTS --privileged reslp/phylociraptor:$VER phylociraptor $@" +# echo "docker run -it $DOCKER_USER $BINDPOINTS --privileged reslp/phylociraptor:$VER phylociraptor $@" docker run -it $DOCKER_USER $BINDPOINTS --privileged reslp/phylociraptor:$VER phylociraptor $@ newid=$(docker container ls -all | grep "reslp/phylociraptor:$VER" | cut -d" " -f 1) docker commit $newid reslp/phylociraptor:$VER &> /dev/null From 390c53eea66c225fd90b08bafb44839135410e32 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Fri, 27 May 2022 11:56:06 +0200 Subject: [PATCH 11/26] Add new util plot-similarity to plot a similarity matrix of quartet differences and prepare for PCA plotting --- phylociraptor | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/phylociraptor b/phylociraptor index 6b8ff9c..6e9d525 100755 --- a/phylociraptor +++ b/phylociraptor @@ -929,7 +929,6 @@ elif args.command == "util": print(cmd) for line in execute_command(cmd, not gl_args.quiet): print(line, end="\r") - sys.exit(0) if which_util == "estimate-conflict": qs_parser = UtilParser(add_help=False) qs_parser.add_argument("-i","--intrees", action="store", default="all") @@ -969,7 +968,7 @@ elif args.command == "util": cmd += ["-b", qs_args.stopby] else: print(now(), "You need to either specify a stopping criterion (-b) or the total number of quartets (-q).") - sys.exit(0) + sys.exit(1) if debug: @@ -1024,10 +1023,10 @@ elif args.command == "util": sys.exit(0) if not qs_args.intrees: print(now(),"No input trees specified") - sys.exit(0) + sys.exit(1) elif len(qs_args.intrees.split(",")) != 2: print(now(), "Need exactly two trees for comparison, seperated by a comma (,).") - sys.exit(0) + sys.exit(1) else: all_trees = qs_args.intrees if not qs_args.quartetfile: @@ -1040,6 +1039,23 @@ elif args.command == "util": cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/rphylogenetics:4.0.3", "Rscript", "bin/plot-tree.R"] cmd += ["conflicts", os.getcwd(), qs_args.seed, all_trees, qs_args.outgroup, qs_args.lineagefile, qs_args.level, qs_args.quartetfile, qs_args.treelist] + if debug: + print(cmd) + for line in execute_command(cmd, not qs_args.quiet): + print(line, end="\r") + if which_util == "plot-similarity": + qs_parser = UtilParser(add_help=False) + qs_parser.add_argument("-s","--simmatrix", action="store", default=None) + qs_parser.add_argument("-r", "--treelist", action="store", default="none") + qs_parser.add_argument("--quiet", action="store_true", default=False) + qs_args = qs_parser.parse_args(args.arguments) + if qs_args.help: + print(help_message(util_plot_similarity_help)) + sys.exit(0) + if not qs_args.simmatrix: + print(now(), "You have to the similarity matrix file created with phylociraptor util estimate-conflict") + sys.exit(1) + cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/rphylogenetics:4.0.3", "Rscript", "bin/plot-similarity-matrix.R", os.getcwd(), qs_args.simmatrix, qs_args.treelist] if debug: print(cmd) for line in execute_command(cmd, not qs_args.quiet): @@ -1048,6 +1064,4 @@ else: print("Runmode not recognized: %s" % args.command) print("Please run phylociraptor -h to see avilable options") - - print(now(), "DONE") From ffd78500f60fdef7a4cfeac55c4f8751290e96e2 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Fri, 27 May 2022 11:56:31 +0200 Subject: [PATCH 12/26] Add help for new plot-similarity util --- bin/phylociraptor_modules/usageinfo.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/bin/phylociraptor_modules/usageinfo.py b/bin/phylociraptor_modules/usageinfo.py index 0823bda..40a81bb 100644 --- a/bin/phylociraptor_modules/usageinfo.py +++ b/bin/phylociraptor_modules/usageinfo.py @@ -290,8 +290,8 @@ def hi(): Naming follows the first column in the *.treelist.csv file. Example: -i T1-T5 will plot conflicts between T1 and T5 in the treelist file. - -q, --quartetfile *.quartets.csv file from phylociraptor util estimate-conflicts. - -r, --treelist *.treelist.csv file from phylociraptor util estimate-conflicts. + -q, --quartetfile *.quartets.csv file from phylociraptor util estimate-conflict. + -r, --treelist *.treelist.csv file from phylociraptor util estimate-conflict. Optional Arguments: -s, --seed Random seed number for reproducibility. @@ -304,3 +304,16 @@ def hi(): --quiet Suppress on-screen output. """ +util_plot_similarity_help = """ +phylociraptor util plot-similarity - Create similarity heatmap of trees. + +Usage: phylociraptor util plot-similarity + +Required Arguments: + -q, --simmatrix *.similarity_matrix.csv file from phylociraptor util estimate-conflict. + +Optional Arguments: + -r, --treelist *.treelist.tsv file from phylociraptor util estimate-conflict to create tree comparison PCA. + --quiet Suppress on-screen output. + +""" From 16b3965bf35518fcaceb3d53f82b8ff22adeb471 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Mon, 30 May 2022 09:12:33 +0200 Subject: [PATCH 13/26] Add additional usage info --- bin/phylociraptor_modules/usageinfo.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/phylociraptor_modules/usageinfo.py b/bin/phylociraptor_modules/usageinfo.py index 40a81bb..cae093d 100644 --- a/bin/phylociraptor_modules/usageinfo.py +++ b/bin/phylociraptor_modules/usageinfo.py @@ -38,6 +38,7 @@ def hi(): report Create a HTML report of the run check Quickly check status of the run + util Utilities for a posteriori analyses of trees -v, --version Print version -h, --help Display help @@ -216,6 +217,7 @@ def hi(): estimate-conflict estimate conflict between trees based on the occurence of quartets. plot-tree plot one or more trees. plot-conflict plot conflicts between two trees based on quartet comparison. + plot-similarity plot heatmap of quartet similarity. -h, --help Display help. """ @@ -310,7 +312,7 @@ def hi(): Usage: phylociraptor util plot-similarity Required Arguments: - -q, --simmatrix *.similarity_matrix.csv file from phylociraptor util estimate-conflict. + -s, --simmatrix *.similarity_matrix.csv file from phylociraptor util estimate-conflict. Optional Arguments: -r, --treelist *.treelist.tsv file from phylociraptor util estimate-conflict to create tree comparison PCA. From 39d0af2805b37dcf7c63c4e193902a5108617825 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Mon, 30 May 2022 09:13:12 +0200 Subject: [PATCH 14/26] Refine script. Add possibility to have detailed tree information --- bin/plot-similarity-matrix.R | 94 ++++++++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 21 deletions(-) diff --git a/bin/plot-similarity-matrix.R b/bin/plot-similarity-matrix.R index 71dc367..2056210 100644 --- a/bin/plot-similarity-matrix.R +++ b/bin/plot-similarity-matrix.R @@ -18,11 +18,32 @@ data <- read.csv(matrix_file) data <- melt(data) colnames(data) <- c("First", "Second", "Similarity") data$First <- factor(data$First, levels=levels(data$Second)) - -pdf(file="similarity-matrix.pdf", width=20, height=10) -ggplot(data, aes(First, Second, fill=Similarity)) + geom_tile() + ggtitle("% similarity of pairs of trees") + geom_text(aes(label = format(round(Similarity, 4), nsmall=2))) -dev.off() +if (treelistfile == "none") { + cat("Plotting without detailed tree name information.\n") + pdf(file="similarity-matrix.pdf", width=20, height=10) + p <- ggplot(data, aes(First, Second, fill=Similarity)) + geom_tile() + scale_fill_gradient(low = "red", high = "white") + ggtitle("% similarity of pairs of trees") + geom_text(aes(label = format(round(Similarity, 4), nsmall=2))) + print(p) + dev.off() +} else { + cat("Using detailed tree name information for plotting.\n") + treelist <- read.csv(treelistfile, sep="\t", header=F) + newtreenames <- c() + for (names in strsplit(treelist$V2,"/")){ + newtreenames <-c(newtreenames, paste0(names[3], "-", names[4], "-", strsplit(names[2], "-")[[1]][2])) + } + treelist$V2 <- newtreenames + data$First <- treelist$V2[match(data$First, treelist$V1)] + data$Second <- treelist$V2[match(data$Second, treelist$V1)] + pdf(file="similarity-matrix.pdf", width=20, height=10) + p <- ggplot(data, aes(First, Second, fill=Similarity)) + geom_tile() + scale_fill_gradient(low = "red", high = "white") + ggtitle("% similarity of pairs of trees") + geom_text(aes(label = format(round(Similarity, 4), nsmall=2)))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + print(p) + dev.off() + +} cat("Plot file: similarity-matrix.pdf created successfully.\n") +quit() + + if (treelistfile != "none") { @@ -43,8 +64,6 @@ if (treelistfile != "none") { } ### - - treelist <- read.csv(treelistfile, header=F, check.names=FALSE, sep="\t") colnames(treelist) <- c("tree", "path") @@ -68,38 +87,71 @@ if (treelistfile != "none") { dists <- list() cat("Now calculating distances, please be patient...\n") - for (i in 1:length(trees)) { +system.time( + for (i in 1:1) { cat(paste0("Calculating pairwise distances for tree ", i, "...\n")) dist <- cophenetic.phylo(trees[[i]]) melted_dist <- melt_dist(dist) + print(nrow(melted_dist)) visited <- c() remaining_tips <- all_tips names <- c() distances <- c() - for (sp1 in all_tips) { - visited <- c(visited, sp1) - remaining_tips <- remaining_tips[!remaining_tips %in% visited] - for (sp2 in remaining_tips) { - if (verbose == TRUE) { - cat(paste0("Tree ",i, ": ", sp1, " - ", sp2, "\n")) - } - if (sp1 == sp2) {next} + all_combinations <- t(combn(all_tips, 2)) + #nrow(all_combinations) + for (j in 1:100) { + sp1 <- all_combinations[j, 1] + sp2 <- all_combinations[j, 2] + if (verbose == TRUE) { + cat(paste0("Tree ",i, ": ", sp1, " - ", sp2, "\n")) + } + if (sp1 %in% melted_dist$iso1 && sp2 %in% melted_dist$iso2) { distance <- subset(melted_dist, (iso1==sp1 & iso2==sp2))$dist - if (length(distance) == 0) { - distance <- subset(melted_dist, (iso1==sp2 & iso2==sp1))$dist - } - if (length(distance) == 0) {distance <- NA} - distances <- c(distances, distance) - names <- c(names, paste0(sp1, "-",sp2)) + } else if (sp2 %in% melted_dist$iso1 && sp1 %in% melted_dist$iso2) { + distance <- subset(melted_dist, (iso1==sp2 & iso2==sp1))$dist + } else { + distance <- NA } + distances <- c(distances, distance) + names <- c(names, paste0(sp1, "-",sp2)) } df <- data.frame(distances) rownames(df) <- names colnames(df) <- paste0("tree", i) dists[[i]] <- df } +) + + +sp2 %in% melted_dist$iso1 +sp1 %in% melted_dist$iso1 +df + # for (sp1 in all_tips[1:10]) { + # visited <- c(visited, sp1) + # remaining_tips <- remaining_tips[!remaining_tips %in% visited] + # for (sp2 in remaining_tips[1:10]) { + # if (verbose == TRUE) { + # cat(paste0("Tree ",i, ": ", sp1, " - ", sp2, "\n")) + # } + # if (sp1 == sp2) {next} + # distance <- subset(melted_dist, (iso1==sp1 & iso2==sp2))$dist + # if (length(distance) == 0) { + # distance <- subset(melted_dist, (iso1==sp2 & iso2==sp1))$dist + # } + # if (length(distance) == 0) {distance <- NA} + # distances <- c(distances, distance) + # names <- c(names, paste0(sp1, "-",sp2)) + # } + # } + # df <- data.frame(distances) + # rownames(df) <- names + # colnames(df) <- paste0("tree", i) + # dists[[i]] <- df + } + dists complete_df <- do.call("cbind", dists) complete_df <- t(complete_df) + complete_df write.csv(file="pairwise-distance-matrix.csv", complete_df, sep=",") } From 604cb2dfda49ed95106eb7ffc17bfd94b58ca6d5 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Mon, 30 May 2022 11:13:18 +0200 Subject: [PATCH 15/26] Update .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 8ea8509..92f1a3b 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ docs/.DS_Store docs/*/.DS_Store bin/phylociraptor_modules/__pycache__/ *.pdf +*.csv +*.txt +*.tsv From 36e20c28b34253d69f9eb1b12a2e1eec2ba7a640 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Wed, 1 Jun 2022 09:46:49 +0200 Subject: [PATCH 16/26] Extend plot similarity functionality. Now tip2tip distances of trees can be calculated and plotted as PCA --- bin/phylociraptor_modules/usageinfo.py | 11 +- bin/plot-similarity-matrix.R | 210 ++++++++++++++++--------- phylociraptor | 10 +- 3 files changed, 154 insertions(+), 77 deletions(-) diff --git a/bin/phylociraptor_modules/usageinfo.py b/bin/phylociraptor_modules/usageinfo.py index cae093d..74cca35 100644 --- a/bin/phylociraptor_modules/usageinfo.py +++ b/bin/phylociraptor_modules/usageinfo.py @@ -312,10 +312,17 @@ def hi(): Usage: phylociraptor util plot-similarity Required Arguments: - -s, --simmatrix *.similarity_matrix.csv file from phylociraptor util estimate-conflict. + -m, --simmatrix *.similarity_matrix.csv file from phylociraptor util estimate-conflict. Optional Arguments: - -r, --treelist *.treelist.tsv file from phylociraptor util estimate-conflict to create tree comparison PCA. + -r, --treelist *.treelist.tsv file from phylociraptor util estimate-conflict. + When this file is provided, a tip2tip distance analysis will be performed. + The results will be plotted as PCA and provided as CSV file. + -s, --seed Random seed for reproducibility. + -n, --ndistances Number of tip2tip distances to be calculated. + Only meaningful when combined with -r/--treelist + -t, --threads Number of threads for tip2tip distance analysis. + Only meaningful when combined with -r/--treelist --quiet Suppress on-screen output. """ diff --git a/bin/plot-similarity-matrix.R b/bin/plot-similarity-matrix.R index 2056210..890149b 100644 --- a/bin/plot-similarity-matrix.R +++ b/bin/plot-similarity-matrix.R @@ -1,15 +1,28 @@ -library(ggplot2) -library(reshape2) -library(ape) -library(reshape2) -library(tidyverse) +options(warn = -1) +suppressMessages(library(ggplot2)) +suppressMessages(library(reshape2)) +suppressMessages(library(ape)) +suppressMessages(library(reshape2)) +suppressMessages(library(tidyverse)) +suppressMessages(library(parallel)) args <- commandArgs(trailingOnly=TRUE) wd <- args[1] matrix_file <- args[2] treelistfile <- args[3] -verbose <- TRUE +threads <- strtoi(args[4]) +ndistances <- args[5] +randomseed <- args[6] + +verbose <- FALSE +#if (quiet == "true") { +# verbose <- FALSE +#} +if (randomseed != "random"){ + cat(paste0("Random seed: ", randomseed, "\n")) + set.seed(strtoi(randomseed)) +} setwd(wd) cat(paste("Working directory: ", getwd(), "\n")) @@ -34,20 +47,18 @@ if (treelistfile == "none") { treelist$V2 <- newtreenames data$First <- treelist$V2[match(data$First, treelist$V1)] data$Second <- treelist$V2[match(data$Second, treelist$V1)] - pdf(file="similarity-matrix.pdf", width=20, height=10) + p <- ggplot(data, aes(First, Second, fill=Similarity)) + geom_tile() + scale_fill_gradient(low = "red", high = "white") + ggtitle("% similarity of pairs of trees") + geom_text(aes(label = format(round(Similarity, 4), nsmall=2)))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + + pdf(file=paste0("quartet-similarity-heatmap-",length(unique(data$First)),"-trees.pdf"), width=20, height=10) print(p) dev.off() } -cat("Plot file: similarity-matrix.pdf created successfully.\n") -quit() - - - +cat("\nA treelist file was specified. Will therefore also calculate tip 2 tip distances as a measure of tree similarity.\n") if (treelistfile != "none") { - ### taken from: https://github.com/andersgs/harrietr/blob/master/R/melt_distance.R + ### the melt_dist function is from: https://github.com/andersgs/harrietr/blob/master/R/melt_distance.R melt_dist <- function(dist, order = NULL, dist_name = 'dist') { if(!is.null(order)){ dist <- dist[order, order] @@ -67,7 +78,7 @@ if (treelistfile != "none") { treelist <- read.csv(treelistfile, header=F, check.names=FALSE, sep="\t") colnames(treelist) <- c("tree", "path") - cat("Reading trees from treelist file:\n") + cat("Reading trees from treelist file.\n") trees <- list() i <- 1 for (treepath in treelist$path) { @@ -75,83 +86,138 @@ if (treelistfile != "none") { trees[[i]]<-read.tree(treepath) i <- i + 1 } + cat(paste0("Found ", length(trees), " trees.\n")) all_tips <- c() - cat("Extracting all tips from tree...") + cat("Extracting all tips from all trees...\n") for (i in 1:length(trees)) { all_tips <- c(all_tips, trees[[i]]$tip.label) } all_tips <- unique(all_tips) - cat("done\n") + cat("Creating pairwise combinations of tips...\n") + all_combinations <- t(combn(all_tips, 2)) - dists <- list() - cat("Now calculating distances, please be patient...\n") -system.time( - for (i in 1:1) { - cat(paste0("Calculating pairwise distances for tree ", i, "...\n")) + for (k in 1:nrow(all_combinations)){ #sort species pairs alphabetically + sp1 <- all_combinations[k, 1] + sp2 <- all_combinations[k, 2] + all_combinations[k, 1] <- paste0(sort(c(sp1, sp2))[1], "-", sort(c(sp1, sp2))[2]) + #all_combinations[k, 2] <- sort(c(sp1, sp2))[2] + } + if (ndistances != "all") { + cat("Will choose a random sample of tip-combinations.\n") + all_combinations <- all_combinations[sample(nrow(all_combinations), strtoi(ndistances)),] + } + cat(paste0("Number of tip-pairs used to calculate distances: ", length(all_combinations[,1]), "\n")) + + + + cat("Now calculating tip2tip distances, please be patient this could take a few minutes per tree...\n") + cat(paste0("Will be using ", threads, " threads for this step.\n")) + + dist_single_tree <- function(i) { + cat(paste0(Sys.time(), " Calculating pairwise distances for tree ", i, "...\n")) + + trees[[i]]$edge.length[trees[[i]]$edge.length == "NaN"] <- 0 dist <- cophenetic.phylo(trees[[i]]) - melted_dist <- melt_dist(dist) - print(nrow(melted_dist)) - visited <- c() - remaining_tips <- all_tips + + melted_dist <- melt(dist) + melted_dist <- as.matrix(melted_dist) #convert to matrix; this is faster than df + combined <- c() + combined[1:nrow(melted_dist)] <- 1:nrow(melted_dist) + for (k in 1:nrow(melted_dist)){ #sort species pairs alphabetically + sp1 <- melted_dist[k, 1] # matrix is faster than df + sp2 <- melted_dist[k, 2] + combined[k] <- paste(sort(c(sp1, sp2)), collapse="-") + } + melted_dist <- as.data.frame(melted_dist) + melted_dist$combined <- combined + #remove duplicated rows for faster processing + melted_dist$Var1 <- NULL + melted_dist$Var2 <- NULL + melted_dist <- unique(melted_dist) + + cat(paste0(Sys.time()," Checking tip2tip distances from tree ", i, " against all combinations from all trees...\n")) names <- c() distances <- c() - all_combinations <- t(combn(all_tips, 2)) - #nrow(all_combinations) - for (j in 1:100) { - sp1 <- all_combinations[j, 1] - sp2 <- all_combinations[j, 2] + names[1:nrow(all_combinations)] <- 1:nrow(all_combinations) + distances[1:nrow(all_combinations)] <- 1:nrow(all_combinations) + + for (j in 1:nrow(all_combinations)) { + comb <- all_combinations[j, 1] if (verbose == TRUE) { - cat(paste0("Tree ",i, ": ", sp1, " - ", sp2, "\n")) + cat(paste0("Tree ", i, " Combination: ", j, "/", nrow(all_combinations))) } - if (sp1 %in% melted_dist$iso1 && sp2 %in% melted_dist$iso2) { - distance <- subset(melted_dist, (iso1==sp1 & iso2==sp2))$dist - } else if (sp2 %in% melted_dist$iso1 && sp1 %in% melted_dist$iso2) { - distance <- subset(melted_dist, (iso1==sp2 & iso2==sp1))$dist - } else { + distance <- melted_dist$value[melted_dist$combined == comb] + if (length(distance) == 0) { distance <- NA } - distances <- c(distances, distance) - names <- c(names, paste0(sp1, "-",sp2)) + distances[j] <- distance + names[j] <- comb } - df <- data.frame(distances) + cat(paste0(Sys.time(), " Distances extracted for tree ", i, ": ", length(distances), "\n")) + df <- data.frame(as.numeric(distances)) rownames(df) <- names colnames(df) <- paste0("tree", i) - dists[[i]] <- df - } -) - - -sp2 %in% melted_dist$iso1 -sp1 %in% melted_dist$iso1 -df - # for (sp1 in all_tips[1:10]) { - # visited <- c(visited, sp1) - # remaining_tips <- remaining_tips[!remaining_tips %in% visited] - # for (sp2 in remaining_tips[1:10]) { - # if (verbose == TRUE) { - # cat(paste0("Tree ",i, ": ", sp1, " - ", sp2, "\n")) - # } - # if (sp1 == sp2) {next} - # distance <- subset(melted_dist, (iso1==sp1 & iso2==sp2))$dist - # if (length(distance) == 0) { - # distance <- subset(melted_dist, (iso1==sp2 & iso2==sp1))$dist - # } - # if (length(distance) == 0) {distance <- NA} - # distances <- c(distances, distance) - # names <- c(names, paste0(sp1, "-",sp2)) - # } - # } - # df <- data.frame(distances) - # rownames(df) <- names - # colnames(df) <- paste0("tree", i) - # dists[[i]] <- df + return(df) } - dists + Sys.time() + dists <- list() + dists <- mclapply(1:length(trees), dist_single_tree, mc.cores=threads) + Sys.time() + complete_df <- do.call("cbind", dists) - complete_df <- t(complete_df) - complete_df - write.csv(file="pairwise-distance-matrix.csv", complete_df, sep=",") + complete_df <- t(complete_df) + + #complete_df <- t(complete_df) +# for (i in 1:12) { +# complete_df[,i] <- as.numeric(complete_df[,i]) +# } + + cat("Calculating PCA of tip2tip distances now.\n") + pca <-prcomp(complete_df,na.action = na.omit) + sumpca <- summary(pca) + PC1Variance <- sumpca$importance[2,1]*100 + PC2Variance <- sumpca$importance[2,2]*100 + + plotdf <- as.data.frame(pca$x) + tnames <- rownames(plotdf) + plotdf["treenames"] <- tnames + + treebuilders <- c() + for (x in strsplit(newtreenames, "-")){ + treebuilders <- c(treebuilders, x[1]) + + } + alitrim <- c() + for (x in strsplit(newtreenames, "-")){ + alitrim <- c(alitrim, paste0(x[2],"-",x[3])) + + } + bootstrapcutoffs <- c() + for (x in strsplit(newtreenames, "-")){ + bootstrapcutoffs <- c(bootstrapcutoffs, x[4]) + + } + plotdf$treebuilder <- treebuilders + plotdf$bscutoff <- bootstrapcutoffs + plotdf$alitrim <- alitrim + + p <- ggplot(data = plotdf, aes(x=PC1, y=PC2, label=treenames, color=alitrim, shape=treebuilder, size=bscutoff)) + geom_point() +xlab(paste0("PC1 (", PC1Variance, "%)")) +ylab(paste0("PC2 (", PC2Variance,"%)")) +ggtitle(paste0("Similarity of trees based on tip to tip distance matrices.\nBased on ", ndistances, " distances between tips."))+ scale_shape_manual(values = c(15, 16, 17, 18))+ scale_color_brewer(palette="Set1") + + pdf(file=paste0("tip2tip-PCA-",ndistances,".pdf")) + print(p) + dev.off() + write.csv(file=paste0("pairwise-distance-matrix-",length(trees),"-trees.csv"), complete_df, sep=",", quote=FALSE) + +} # if statement end + +cat("Similarity plotting is done.\n") +cat('Output: quartet-similarity-heatmap-*trees.pdf - Heatmap of % similarity based on quartets of tips. * is the number of trees in comparison.\n') +if (treelistfile != "none") { + cat('Output: tip2tip-PCA-*.pdf - PCA of tip to tip distances in trees. * will be the number of distances specified with --ndistances\n') + cat('Output: pairwise-distance-matrix-*-trees.csv - tip2tip distance matrix used to create the PCA. * will be the number of trees in comparison.\n') } + + diff --git a/phylociraptor b/phylociraptor index 6e9d525..a2b47e6 100755 --- a/phylociraptor +++ b/phylociraptor @@ -1045,17 +1045,21 @@ elif args.command == "util": print(line, end="\r") if which_util == "plot-similarity": qs_parser = UtilParser(add_help=False) - qs_parser.add_argument("-s","--simmatrix", action="store", default=None) + qs_parser.add_argument("-m","--simmatrix", action="store", default=None) qs_parser.add_argument("-r", "--treelist", action="store", default="none") - qs_parser.add_argument("--quiet", action="store_true", default=False) + qs_parser.add_argument("--quiet", action="store_true", default=False) + qs_parser.add_argument("-t", "--threads", action="store", default="1") + qs_parser.add_argument("-n", "--ndistances", action="store", default="all") + qs_parser.add_argument("-s", "--seed", action="store", default="random") qs_args = qs_parser.parse_args(args.arguments) + print(qs_args) if qs_args.help: print(help_message(util_plot_similarity_help)) sys.exit(0) if not qs_args.simmatrix: print(now(), "You have to the similarity matrix file created with phylociraptor util estimate-conflict") sys.exit(1) - cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/rphylogenetics:4.0.3", "Rscript", "bin/plot-similarity-matrix.R", os.getcwd(), qs_args.simmatrix, qs_args.treelist] + cmd = ["singularity", "exec", "-B", os.getcwd(), "docker://reslp/rphylogenetics:4.0.3", "Rscript", "bin/plot-similarity-matrix.R", os.getcwd(), qs_args.simmatrix, qs_args.treelist, qs_args.threads, qs_args.ndistances, qs_args.seed] if debug: print(cmd) for line in execute_command(cmd, not qs_args.quiet): From e059c57a16a4c39e3c03a467c2424fe63023ae77 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Wed, 1 Jun 2022 15:45:29 +0200 Subject: [PATCH 17/26] Update documentation to include information about phylociraptor util --- docs/indepth/util.rst | 100 +++++++++++++++++++++++++++++ docs/index.rst | 1 + docs/introduction/installation.rst | 1 + 3 files changed, 102 insertions(+) create mode 100644 docs/indepth/util.rst diff --git a/docs/indepth/util.rst b/docs/indepth/util.rst new file mode 100644 index 0000000..9f03084 --- /dev/null +++ b/docs/indepth/util.rst @@ -0,0 +1,100 @@ + +.. role:: bash(code) + :language: bash + +============================================= +A posteriori analyses with phylociraptor util +============================================= + +Phylociraptor can create lots of trees quickly. Which tree is the correct one? There is no good answer to this question, however with functions provided by :bash:`phylociraptor util` it is possible to compare similarity of trees and impose lineage information from NCBI on tree plots. + +What is examplained here requires phylociraptor to have successfully produced trees. The provided utilities aim to give an overview about tree similarity and should help to identify problematic or unstable areas in the trees. However they are not thorough comparisons of trees. There are great dedicated software packages for this task. For example: https://ms609.github.io/TreeDist/. + +--------------------------------------------------- +Download taxonomic lineage information from NCBI +--------------------------------------------------- + +It is possible to download lineage information with :bash:`phylociraptor util get-lineage` for all previously downloaded genomes. You can look at the options for this command with :bash:`./phylociraptor util get-lineage -h`. + +The output of this command can later be used when plotting trees and to estimate which taxonomic groups have been recovered as monophyletic in different trees. + +Here is a full example command: + +.. code-block:: bash + + $ ./phylociraptor util get-lineage --genomefile results/statistics/downloaded_genomes_statistics.txt --outfile lineage_information.txt + +----------------------------------------------------------------- +Estimate conflicts between trees based on subtrees with four tips +----------------------------------------------------------------- + +It can be difficult to compare topological differences between large phylogenetic trees and there are several reasons for that. Trees can differ in the sets of included taxa and many commonly used metrics reduce tree-similarity to a single value. While there is nothing wrong with this approach, it would also be nice to be able to compare where in the tree conflicts are located. Here we adopt a strategy to be able to estimate conflict between trees despite possible differences in the number of tips while also being able to get an idea where the conflicting positions of the tree are. The method is based on comparing trees based on quartets of tips. + +Here are a few example commands: + +.. code-block:: bash + + $ ./phylociraptor util estimate-conflict --outprefix quartet-5000 -n 5000 + $ ./phylociraptor util estimate-conflict --outprefix quartet-100 --stopby conflicts=100 --seed 120 + $ ./phylociraptor util estimate-conflict --outprefix quartets-conflict --stopby tipcoverage=200 -l lineages.txt --selecttaxa order=Diperta,Hymenoptera -t 20 + +The first command will estimate conflicts based on 5000 random quartets. Output files will have the prefix quartet-5000. + +The second command will estimate conflicts until 100 conflicting quartets have been identified. It also uses a random seed so that the same quartets will be compared during each run. + +The third example samples quartets till each tip has been included in quartets at least 200 times. It also uses the lineage information file and only calculates quartets from the orders Diperta and Hymenoptera. It runs this analysis using 20 threads. + +----------------------- +Plot phylogenomic trees +----------------------- + +With :bash:`phylociraptor util plot-tree` you can visualize your trees. This utility output PDF files with tree images. + +Here are some example commands: + +.. code-block:: bash + + $ ./phylociraptor util plot-tree + $ ./phylociraptor util plot-tree -i results/phylogeny-75/iqtree/mafft-aliscore/concat.contree -l lineage_arthropoda.txt -e family + +The first command is the simplest plotting command. It create plots for all trees it finds in the :bash:`results/` directory. + +The second command will create only a single plot for the iqtree tree with mafft and aliscore alignments. It will highlight different families in color by using the information from the lineage file. When a lineage file and a taxonomic level are specified then a second plot will be produced indicating if the different groups are recovered as monophyletic in the tree. + + + +----------------------------------------------- +Plotting conflicts based on quartet differences +----------------------------------------------- + +With this utility it is possible to get a visual idea about conflicts between to trees based on quartets. The plotted tree will look similar to trees plotted with :bash:`phylociraptor util ploz-tree`. The difference is that in this case two trees are plotted instead of one and the conflicts between them are highlighted by thicker branches. The thickness of a branch indicates in how many conflicting quartets this branch is included in. Thicker branches thus mean more conflict. + +Again, here are several example commands: + +.. code-block:: bash + + $ ./phylociraptor util plot-conflict -i T1,T10 --quartetfile quartet-5000.quartets.csv -r quartet-5000.treelist.tsv + $ ./phylociraptor util plot-conflict -i T2,T23 --quartetfile quartet-5000.quartets.csv -r quartet-5000.treelist.tsv --seed 123 --outgroup Ramazzottius_varieornatus,Hypsibius_dujardini + +The first example compares trees 1 and 10. The two parameters --quartetfile and -r are mandatory. + +The second command compares trees 2 and 23. It uses a random seed for reproducibility and the trees will be rooted with Ramazzottius_varieornatus and Hypsibius_dujardini before they are plotted. + +-------------------------------------------------------------------------------- +Plotting similarity of trees based on quartet occurences and tip 2 tip distances +-------------------------------------------------------------------------------- + +This creates an overview heatmap of quartet conflicts. Additionally this utility can also calculate tip2tip distances for all trees and plot the differences as an annotated PCA for quick comparison of trees. + +A few example commands should make it more clear: + +.. code-block:: bash + + $ ./phylociraptor util plot-similarity -m quartet-5000.similarity_matrix.csv + $ ./phylociraptor util plot-similarity -m quartet-5000.similarity_matrix.csv -r quartet-5000.treelist.tsv -s 123 --ndistances 100 -t 8 + +The first command will only create a heatmap of overall similarity of quartets in percent between trees. + +The second command will create the heatmap but will also compute tip2tip distances for 100 pairs of tips using eight threads and a random seed 123. These tip2tip distance matrices will be plotted as a PCA. + + diff --git a/docs/index.rst b/docs/index.rst index 50d9fdb..a835f00 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -67,6 +67,7 @@ If you run into problems with phylociraptor please follow these steps: indepth/config_files.rst indepth/indepth.rst indepth/output.rst + indepth/util.rst .. toctree:: :caption: Getting help diff --git a/docs/introduction/installation.rst b/docs/introduction/installation.rst index 8d65dd0..969feba 100644 --- a/docs/introduction/installation.rst +++ b/docs/introduction/installation.rst @@ -51,6 +51,7 @@ The probably best way is to clone the repository directly using git (if availabl report Create a HTML report of the run check Quickly check status of the run + util Utilities for a posteriori analyses of trees -v, --version Print version -h, --help Display help From b24b8afde8c0cae29c1bf7eae022da30900ca127 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Wed, 1 Jun 2022 15:46:27 +0200 Subject: [PATCH 18/26] Improvements for usage information of phylociraptor util --- bin/phylociraptor_modules/usageinfo.py | 40 ++++++++++++++------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/bin/phylociraptor_modules/usageinfo.py b/bin/phylociraptor_modules/usageinfo.py index 74cca35..7b10044 100644 --- a/bin/phylociraptor_modules/usageinfo.py +++ b/bin/phylociraptor_modules/usageinfo.py @@ -213,7 +213,7 @@ def hi(): Usage: phylociraptor util Argumemts: - get-lineage retreive full lineage information for all included samples. + get-lineage retrieve full lineage information for all included samples. estimate-conflict estimate conflict between trees based on the occurence of quartets. plot-tree plot one or more trees. plot-conflict plot conflicts between two trees based on quartet comparison. @@ -243,10 +243,9 @@ def hi(): Usage: phylociraptor util estimate-conflict Required Arguments: - -i, --intrees Relative paths to input trees in results folder. (Default: all) - Default: results/statistics/downloaded_genomes_statistics.txt + -i, --intrees Relative paths to input trees in results folder, seperated by commas. (Default: all) -o, --outprefix Output file name prefix. - -q, --nquartets Total number of quartets to be calculated. Use either this or --stopby. + -n, --nquartets Total number of quartets to be calculated. Use either this or --stopby. -b, --stopby Stoping criterion. There are two options: -b conflicts=100 stops as soon as 100 conflicts have been found. --stopby tipcoverage=100 stops as soon as every tip is in at least 100 quartets. @@ -254,7 +253,8 @@ def hi(): -s, --seed Random seed number for reproducibility. (Default: random) -l, --lineagefile Lineagefile created with phylociraptor util get-lineage. Mandatory when using-a/--selecttaxa. - -a, --selecttaxa Sample quartets only from tips beloning to specific taxa. + -a, --selecttaxa Sample quartets only from tips belonging to specific taxa. + Requires output from phylociraptor util get-lineage. Examples: --selecttaxa order=Diperta,Hymenoptera -t, --threads Number of threads. (Default: 1) @@ -263,21 +263,21 @@ def hi(): """ util_plot_tree_help = """ -phylociraptor util plot-tree - Estimates conflicts between trees based on quartets. +phylociraptor util plot-tree - Plot one or more phylogenomic trees. Usage: phylociraptor util plot-tree Required Arguments: - -i, --intrees Relative paths to input trees in results folder. (Default: all) - Default: results/statistics/downloaded_genomes_statistics.txt + -i, --intrees Relative paths to input trees in results folder, separated by commas. (Default: all) + +Optional Arguments: -o, --outprefix Output file name prefix. -l, --lineagefile Lineagefile created with phylociraptor util get-lineage. -e, --level Taxonomic level in lineage file which should be plotted. - -Optional Arguments: - -s, --seed Random seed number for reproducibility. + -s, --seed Random seed number for reproducibility. (Default: random) -g, --outgroup Comma seperated list of tips which should be used as Outgroup. Trees will be rerooted accordingly. + (Default: none) --quiet Suppress on-screen output. """ @@ -288,26 +288,29 @@ def hi(): Usage: phylociraptor util plot-conflict Required Arguments: - -i, --intrees Two trees for which conflicts should be visualized. + -i, --intrees Two trees for which conflicts should be visualized, separated by a dash. Naming follows the first column in the *.treelist.csv file. - Example: -i T1-T5 will plot conflicts between T1 and T5 in the + Example: -i T1,T5 will plot conflicts between T1 and T5 in the treelist file. -q, --quartetfile *.quartets.csv file from phylociraptor util estimate-conflict. -r, --treelist *.treelist.csv file from phylociraptor util estimate-conflict. Optional Arguments: - -s, --seed Random seed number for reproducibility. + -s, --seed Random seed number for reproducibility. (Default: random) -l, --lineagefile Lineagefile created with phylociraptor util get-lineage. Is required when -e/--level should be used. + (Default: none) -e, --level Taxonomic level in lineage file which should be plotted. Has to be used together with -l/--lineagefile. + (Default: none) -g, --outgroup Comma seperated list of tips which should be used as Outgroup. Trees will be rerooted accordingly. + (Default: none) --quiet Suppress on-screen output. """ util_plot_similarity_help = """ -phylociraptor util plot-similarity - Create similarity heatmap of trees. +phylociraptor util plot-similarity - Create similarity heatmap for trees and calculate tip2tip distances. Usage: phylociraptor util plot-similarity @@ -318,10 +321,11 @@ def hi(): -r, --treelist *.treelist.tsv file from phylociraptor util estimate-conflict. When this file is provided, a tip2tip distance analysis will be performed. The results will be plotted as PCA and provided as CSV file. - -s, --seed Random seed for reproducibility. - -n, --ndistances Number of tip2tip distances to be calculated. + (Default: none) + -s, --seed Random seed for reproducibility. (Default: random) + -n, --ndistances Number of tip2tip distances to be calculated. (Default: all) Only meaningful when combined with -r/--treelist - -t, --threads Number of threads for tip2tip distance analysis. + -t, --threads Number of threads for tip2tip distance analysis. (Default: 1) Only meaningful when combined with -r/--treelist --quiet Suppress on-screen output. From eaa8cb0e38ab795955cb999a9fe292c0b9aff61e Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Wed, 1 Jun 2022 16:02:10 +0200 Subject: [PATCH 19/26] Code cleanup for plot-tree --- bin/plot-tree.R | 101 ++++++++++++++---------------------------------- 1 file changed, 28 insertions(+), 73 deletions(-) diff --git a/bin/plot-tree.R b/bin/plot-tree.R index eea235f..279bd7f 100644 --- a/bin/plot-tree.R +++ b/bin/plot-tree.R @@ -1,13 +1,13 @@ -library(phytools, quietly = T) -library(ape, quietly = T) -library(ggtree, quietly = T) -library(magrittr, quietly = T) -library(tidytree, quietly = T) -library(patchwork, quietly = T) -library(cowplot, quietly = T) -#library(stringr) -library(ggplot2, quietly = T) -library(reshape2, quietly = T) +options(warn=-1) +suppressMessages(library(phytools)) +suppressMessages(library(ape)) +suppressMessages(library(ggtree)) +suppressMessages(library(magrittr)) +suppressMessages(library(tidytree)) +suppressMessages(library(patchwork)) +suppressMessages(library(cowplot)) +suppressMessages(library(ggplot2)) +suppressMessages(library(reshape2)) args <- commandArgs(trailingOnly=TRUE) @@ -15,10 +15,8 @@ args <- commandArgs(trailingOnly=TRUE) runmode <- args[1] # different run modes should be: plot, conflicts and wd <- args[2] setwd(wd) -print(wd) -print(args) if (runmode == "plot") { - print("Will plot trees...") + cat("Will plot trees...\n") seed <- args[3] treenames <- args[4] outgroup <- args[5] @@ -45,31 +43,18 @@ if (runmode == "plot") { outgroup <- strsplit(outgroup,",")[[1]] treenames <- strsplit(treenames,",")[[1]] -print(outgroup) -print(treenames) -print(conflictfile) -print(lineage_file) +#print(outgroup) +#print(treenames) +#print(lineage_file) #set seed if specified if (seed != "random") { set.seed(seed) } - -#quit(1) - -#the settings here are for testing -#level <- "order" -#runmode <- "plot" -#treenames <- c('results/phylogeny-80/iqtree/mafft-trimal/concat.contree', 'results/phylogeny-80/iqtree/clustalo-trimal/concat.contree', 'results/phylogeny-80/iqtree/clustalo-aliscore/concat.contree', 'results/phylogeny-80/iqtree/mafft-aliscore/concat.contree', 'results/phylogeny-85/iqtree/mafft-trimal/concat.contree', 'results/phylogeny-85/iqtree/clustalo-trimal/concat.contree', 'results/phylogeny-85/iqtree/clustalo-aliscore/concat.contree', 'results/phylogeny-85/iqtree/mafft-aliscore/concat.contree', 'results/phylogeny-75/iqtree/mafft-trimal/concat.contree', 'results/phylogeny-75/iqtree/clustalo-trimal/concat.contree', 'results/phylogeny-75/iqtree/clustalo-aliscore/concat.contree', 'results/phylogeny-75/iqtree/mafft-aliscore/concat.contree', 'results/phylogeny-80/astral/mafft-trimal/species_tree.tre', 'results/phylogeny-80/astral/clustalo-trimal/species_tree.tre', 'results/phylogeny-80/astral/clustalo-aliscore/species_tree.tre', 'results/phylogeny-80/astral/mafft-aliscore/species_tree.tre', 'results/phylogeny-85/astral/mafft-trimal/species_tree.tre', 'results/phylogeny-85/astral/clustalo-trimal/species_tree.tre', 'results/phylogeny-85/astral/clustalo-aliscore/species_tree.tre', 'results/phylogeny-85/astral/mafft-aliscore/species_tree.tre', 'results/phylogeny-75/astral/mafft-trimal/species_tree.tre', 'results/phylogeny-75/astral/clustalo-trimal/species_tree.tre', 'results/phylogeny-75/astral/clustalo-aliscore/species_tree.tre', 'results/phylogeny-75/astral/mafft-aliscore/species_tree.tre') - -#outgroup <- c("Ramazzottius_varieornatus", "Hypsibius_dujardini") -#lineage_file <- "lineage_arthropods.txt" -#conflictfile <- "output.txt" - # load lineage information file and fill missing values if (lineage_file == "none") { - cat("No lineage file found.\n") + cat("No lineage file specified. Will not plot lineage information.\n") } else { lineages <- read.csv(lineage_file, header=T, sep="\t", na=c("")) lineages[is.na(lineages)] <- "missing" @@ -207,7 +192,7 @@ generate_color <- function(label) { } get_conflicts <- function(tree, conflict_quartets) { - if (length(conflict_quartets) == 0) {print("WARNING: NO CONFLICTS IN SELECTION!")} + if (length(conflict_quartets) == 0) {cat("WARNING: NO CONFLICTS IN SELECTION!\n")} edge_thickness <- rep(1, length(tree$edge.length)+1) for (quat in conflict_quartets) { left <- strsplit(quat, "-")[[1]][1] @@ -216,10 +201,6 @@ get_conflicts <- function(tree, conflict_quartets) { left2 <- strsplit(left, ",")[[1]][2] right1 <- strsplit(right, ",")[[1]][1] right2 <- strsplit(right, ",")[[1]][2] - #print(paste0(left1, "->", left2)) - #print(paste0(right1, "->", right2)) - # npleft <- nodepath(tree, from=which(tree$tip.label == left1), to=which(tree$tip.label == left2)) - # npright <- nodepath(tree, from=which(tree$tip.label == right1), to=which(tree$tip.label == right2)) #identify nodes between tips mcanodel <- getMRCA(tree, c(which(tree$tip.label == left1), which(tree$tip.label == left2))) mcanoder <- getMRCA(tree, c(which(tree$tip.label == right1), which(tree$tip.label == right2))) @@ -227,13 +208,10 @@ get_conflicts <- function(tree, conflict_quartets) { pathl2 <- nodepath(tree, from=mcanodel, to=which(tree$tip.label == left2)) pathr1 <- nodepath(tree, from=mcanoder, to=which(tree$tip.label == right1)) pathr2 <- nodepath(tree, from=mcanoder, to=which(tree$tip.label == right2)) - #print(paste0(mcanodel, " ", mcanoder)) npancestor <- nodepath(tree, from=mcanodel, to=mcanoder) - #print(paste0(mcanodel, " ", mcanoder)) all_nodes_to_highlight <- c(pathl1, pathl2, pathr1, pathr2, npancestor) #all_nodes_to_highlight <- c(npancestor) - #print(all_nodes_to_highlight) # now select which edges in the tree p <- ggtree(tr = tree, ladderize = FALSE) edgeorder <- data.frame(parent=p$data$parent, node=p$data$node) @@ -256,24 +234,22 @@ get_conflicts <- function(tree, conflict_quartets) { } get_conflicts_and_support <- function(tree, conflict_quartets) { - if (length(conflict_quartets) == 0) {print("WARNING: NO CONFLICTS IN SELECTION!")} + if (length(conflict_quartets) == 0) {cat("WARNING: NO CONFLICTS IN SELECTION!\n")} edge_thickness <- rep(1, length(tree$edge.length)+1) + i <- 1 for (quat in names(conflict_quartets)) { if (quat == "") {next} # not sure why this happens, but apparently it does sometimes if (length(strsplit(quat, ",")[[1]]) < 3) { - print(paste0("Not a proper Quartet. Will skip: ", quat)) + cat(paste0("Not a proper Quartet. Will skip: ", quat, "\n")) next } + cat(paste0("\rQuartet ", i, "/", length(names(conflict_quartets)), ": ", quat)) left <- strsplit(quat, "-")[[1]][1] right <- strsplit(quat, "-")[[1]][2] left1 <- strsplit(left, ",")[[1]][1] left2 <- strsplit(left, ",")[[1]][2] right1 <- strsplit(right, ",")[[1]][1] right2 <- strsplit(right, ",")[[1]][2] - #print(paste0(left1, "->", left2)) - #print(paste0(right1, "->", right2)) - # npleft <- nodepath(tree, from=which(tree$tip.label == left1), to=which(tree$tip.label == left2)) - # npright <- nodepath(tree, from=which(tree$tip.label == right1), to=which(tree$tip.label == right2)) #identify nodes between tips mcanodel <- getMRCA(tree, c(which(tree$tip.label == left1), which(tree$tip.label == left2))) mcanoder <- getMRCA(tree, c(which(tree$tip.label == right1), which(tree$tip.label == right2))) @@ -281,13 +257,9 @@ get_conflicts_and_support <- function(tree, conflict_quartets) { pathl2 <- nodepath(tree, from=mcanodel, to=which(tree$tip.label == left2)) pathr1 <- nodepath(tree, from=mcanoder, to=which(tree$tip.label == right1)) pathr2 <- nodepath(tree, from=mcanoder, to=which(tree$tip.label == right2)) - #print(paste0(mcanodel, " ", mcanoder)) npancestor <- nodepath(tree, from=mcanodel, to=mcanoder) - #print(paste0(mcanodel, " ", mcanoder)) all_nodes_to_highlight <- c(pathl1, pathl2, pathr1, pathr2) - #all_nodes_to_highlight <- c(npancestor) - #print(all_nodes_to_highlight) # now select which edges in the tree p <- ggtree(tr = tree, ladderize = FALSE) edgeorder <- data.frame(parent=p$data$parent, node=p$data$node) @@ -303,32 +275,15 @@ get_conflicts_and_support <- function(tree, conflict_quartets) { edge_thickness[edge] = edge_thickness[edge] + 1 } } - #if (conflict_quartets[quat,] == 1 ) { - # for (edge in edges_with_conflict) { - # edge_thickness[edge] = edge_thickness[edge] - 1 - # } - #} - + i <- i + 1 } thickness <- data.frame(edge=1:length(edge_thickness), conflict=edge_thickness) thickness$logthick <- log(thickness$conflict+1) return(thickness) - } -#################### -# CODE STARTS HERE # -#################### -# TODO: -# - Create Order annotations for singletons -# - Add additional support categories - - - # generate some colors - - if (runmode == "plot") { if (level == "none") { cat("Plotting tree(s) without lineage information.\n") @@ -356,9 +311,8 @@ if (runmode == "plot") { # this is where we decide how to plot (conflicts or not). Maybe this will be refactored later... cat(paste0("Plot tree: ", prefix, "\n")) if (runmode == "conflicts") { - print("Extracting Conflicts:") + cat("Extracting Conflicts:\n") conflicts_info <- get_conflicts_and_support(tree, conflict_quartets) - print(conflicts_info) t2 <- ggtree(tree, branch.length='none', aes(size=conflicts_info$conflict)) +theme(legend.position = c("none")) +geom_tiplab() @@ -539,7 +493,7 @@ if (runmode == "plot") { dev.off() } }else if (runmode == "conflicts") { - print("plotting conflicts") + cat("Plotting conflicts...") treepath1 <- treelist$path[treelist["tree"] == treenames[1]] bs_cutoff <- strsplit(strsplit(treepath1,"/")[[1]][2],"-")[[1]][2] algorithm <- strsplit(treepath1,"/")[[1]][3] @@ -569,10 +523,11 @@ if (runmode == "plot") { } cat(paste0("Plot will be based on ", as.character(length(names(conflicts_t[conflicts_t == 0]))), " conflicting quartets.\n")) - - cat("Extracting Conflicts:\n") - conflicts_info1 <- get_conflicts_and_support(tree1, conflicts_t) - conflicts_info2 <- get_conflicts_and_support(tree2, conflicts_t) + + cat("Extracting Conflicts for first tree:\n") + conflicts_info1 <- get_conflicts_and_support(tree1, conflicts_t[conflicts_t == 0]) + cat("Extracting Conflicts for second tree:\n") + conflicts_info2 <- get_conflicts_and_support(tree2, conflicts_t[conflicts_t == 0]) cat("Plot conflitcs between trees...\n") if (lineage_file != "none") { From 1c57aae3289683239bcf5e8befd1d74985cacbce Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Wed, 1 Jun 2022 16:02:53 +0200 Subject: [PATCH 20/26] Rename flag to specify quartets --- phylociraptor | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phylociraptor b/phylociraptor index a2b47e6..612bc71 100755 --- a/phylociraptor +++ b/phylociraptor @@ -934,7 +934,7 @@ elif args.command == "util": qs_parser.add_argument("-i","--intrees", action="store", default="all") qs_parser.add_argument("-o", "--outfile", action="store") qs_parser.add_argument("-s", "--seed", action="store", default="random") - qs_parser.add_argument("-q", "--nquartets", action="store", default=None) + qs_parser.add_argument("-n", "--nquartets", action="store", default=None) qs_parser.add_argument("-t", "--threads", action="store", default="1") qs_parser.add_argument("-l", "--lineagefile", action="store") qs_parser.add_argument("-b", "--stopby", action="store", default=None) @@ -967,7 +967,7 @@ elif args.command == "util": elif qs_args.stopby: cmd += ["-b", qs_args.stopby] else: - print(now(), "You need to either specify a stopping criterion (-b) or the total number of quartets (-q).") + print(now(), "You need to either specify a stopping criterion (-b) or the total number of quartets (-n).") sys.exit(1) From c7629f9f71fce22f8f66dec0465446ef286599d4 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Wed, 1 Jun 2022 16:03:15 +0200 Subject: [PATCH 21/26] Update README with util information. --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 2e0a222..3ed137c 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,7 @@ Commands: report Create a HTML report of the run check Quickly check status of the run + util Utilities for a posteriori analyses of trees -v, --version Print version -h, --help Display help @@ -226,6 +227,10 @@ $ ./phylociraptor speciestree -t sge -c data/cluster_config-SGE.yaml $ ./phylociraptor report ``` +## A posteriori analyses of trees calculated using phylociraptor + +Phylociraptor provides several utilities to investigate tree similarity and plot trees. Please refer to the [documentation](https://phylociraptor.readthedocs.io) for additional details. + ## Using Docker It is also possible to run phylociraptor inside a docker container. This container has singularity and conda installed so the only requirement is that Docker is properly set up. This is still an experimental feature and we have not tested this extensively. We still recommend using phylociraptor as it is described above, especially when you work on a HPC cluster. From 543da3e2510fdda377aeb03e54084b2d2224a9d2 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Thu, 2 Jun 2022 09:40:27 +0200 Subject: [PATCH 22/26] Add missing muscle_parameters in config files of test cases --- data/test_cases/arthropoda/config.yaml | 1 + data/test_cases/fungi/config.yaml | 1 + data/test_cases/minimal/config.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/data/test_cases/arthropoda/config.yaml b/data/test_cases/arthropoda/config.yaml index c1933c5..dfe9b13 100644 --- a/data/test_cases/arthropoda/config.yaml +++ b/data/test_cases/arthropoda/config.yaml @@ -37,6 +37,7 @@ alignment: threads: 8 mafft_parameters: "--quiet --auto" #in case no additional parameters should be used, empty quotes need to be specified. clustalo_parameters: + muscle_parameters: # possible trimming options: trimal, aliscore trimming: diff --git a/data/test_cases/fungi/config.yaml b/data/test_cases/fungi/config.yaml index 0d24fb0..96bcf76 100644 --- a/data/test_cases/fungi/config.yaml +++ b/data/test_cases/fungi/config.yaml @@ -37,6 +37,7 @@ alignment: threads: 8 mafft_parameters: "--quiet --auto" #in case no additional parameters should be used, empty quotes need to be specified. clustalo_parameters: + muscle_parameters: # possible trimming options: trimal, aliscore trimming: method: ["trimal", "aliscore"] diff --git a/data/test_cases/minimal/config.yaml b/data/test_cases/minimal/config.yaml index 1e14988..5bc6ca1 100644 --- a/data/test_cases/minimal/config.yaml +++ b/data/test_cases/minimal/config.yaml @@ -38,6 +38,7 @@ alignment: threads: 8 mafft_parameters: "--quiet --auto" #in case no additional parameters should be used, empty quotes need to be specified. clustalo_parameters: + muscle_parameters: # possible trimming options: trimal, aliscore trimming: method: ["trimal", "aliscore"] From 1f09208bda7d972bdfe591e31368994f5437ce08 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Thu, 2 Jun 2022 09:41:39 +0200 Subject: [PATCH 23/26] estimate_conflict.py: Code cleanup, add warning when nquartets exceeds total number of possible quartets. Adjust similarity calculation accordingly --- bin/estimate_conflict.py | 43 +++++++++------------------------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/bin/estimate_conflict.py b/bin/estimate_conflict.py index 5f29c9b..6bdf983 100755 --- a/bin/estimate_conflict.py +++ b/bin/estimate_conflict.py @@ -12,6 +12,7 @@ import argparse import glob import os +import math from rpy2.robjects.packages import importr from rpy2 import robjects as ro @@ -19,22 +20,7 @@ def reduce_trees(treelist, quat): sorted_trees = [] i = 1 for tree in treelist: - #try: - # tree = Tree(tree.retain_taxa_with_labels(quat).as_string(schema="newick")) - #tree2 = Tree(tree.as_string(schema="newick")) - #except dp.utility.error.SeedNodeDeletionException: - # print("SeedNodeDeletionException: Error trying to reduce the tree, will skip this quartet:") - # print("Tree:",treelist.index(tree)+1, quat) - # return 0 - #tree.rx[4] = ro.r("NULL") tiplabels = list(tree.rx2("tip.label")) - - # # debug code: - # if not any(x in tiplabels for x in quat): - # print(quat, tiplabels) - # else: - # print("Quartet tips contained in tree.") - tf_list = [] for tip in tiplabels: if tip in quat: @@ -45,9 +31,6 @@ def reduce_trees(treelist, quat): redtree = ape.drop_tip(tree, tipss) treestr2 = ape.write_tree(ape.drop_tip(tree, tipss)) treestr = treestr2[0].strip(";") - #treestr = tree.write(format=9).strip(";") - #treestr = tree.prune(quat) - #print(tree2.write(format=9)) if len(list(redtree.rx2("tip.label"))) != 4: missing = [tip for tip in quat if tip not in list(redtree.rx2("tip.label"))] print(quat, "is not properly represented in tree", i, "Missing tips:", missing) @@ -69,11 +52,9 @@ def reduce_trees(treelist, quat): second_element = 1 sorted_trees.append("-".join(sorted(sorted_tree))) i += 1 -# return [Tree(t.extract_tree_with_taxa_labels(quat).as_string(schema="newick")) for t in treelist] return sorted_trees def reformat_quat(quat): -# sorted(quat) return quat[0] + "," + quat[1] + "-" + quat[2] + "," + quat[3] def compare_trees(treelist, combinations): @@ -128,9 +109,6 @@ def compute_single_quat(tl,i, quat, combinations): tl2 = reduce_trees(tl, quat) if tl2 == 0: return 0 -# print(tl2) -# reformatted_quat = reformat_quat(quat) -# result = float(compare_trees(tl2, reformatted_quat)) result = compare_trees(tl2, combinations) return (tl2[0], result) @@ -160,7 +138,7 @@ def compute_single_quat(tl,i, quat, combinations): ncpus = int(args.threads) if args.intrees == "all": iqtree_trees = glob.glob("results/phylogeny-*/iqtree/*/concat.contree") - raxml_trees = glob.glob("results/phylogeny-*/raxml/*/concat.tre") + raxml_trees = glob.glob("results/phylogeny-*/raxml/*/raxmlng.raxml.support") astral_trees = glob.glob("results/phylogeny-*/astral/*/species_tree.tre") #all_trees = ",".join(iqtree_trees + raxml_trees + astral_trees) treenames = iqtree_trees +raxml_trees + astral_trees @@ -197,11 +175,6 @@ def compute_single_quat(tl,i, quat, combinations): sys.exit(0) else: taxon_list = [taxon for taxon in set(all_tips)] - #selected_tips = [tip for tree in tl for tip in list(tree.rx2("tip.label"))] - #taxon_list = [] - #for taxon in set(all_tips): #only use tips present in all trees to avoid spurious comparisons - # if all_tips.count(taxon) == len(tl): - # taxon_list += taxon random.shuffle(taxon_list) @@ -218,6 +191,11 @@ def compute_single_quat(tl,i, quat, combinations): if args.nquartets: # 1. by specifying a max number input_list=[] nquartets = int(args.nquartets) + total_quartets = math.factorial(len(taxon_list)) / (math.factorial(len(taxon_list) - 4) * math.factorial(4)) + if total_quartets < nquartets: + print("WARNING: The total number of possible quartets (",total_quartets,") is smaller than the specified nquartets.") + #nquartets = int(total_quartets) + print("No. of quartets to be calculated:", nquartets) for i in range(nquartets): quartet = next(random_combination(taxon_list, 4)) @@ -294,9 +272,6 @@ def compute_single_quat(tl,i, quat, combinations): results_dict[result[0]] = result[1] i += 1 df = pd.DataFrame.from_dict(results_dict) - #print(df) - #conflicts= df.loc[:, df.sum() < len(tl)] - #print(conflicts) print("Calculating sampling coverage of tips:") with open(outfile +".tip_sampling_coverage.tsv", "w") as f: for tip in taxon_list: @@ -304,10 +279,10 @@ def compute_single_quat(tl,i, quat, combinations): print("Saving results. Prefix:", outfile) df.transpose().to_csv(outfile + ".quartets.csv", index_label="quartet", quoting=False) - #print("No. of quartets contributing to conflicts:", len(conflicts.columns.values.tolist())) print("Calculating pairwise comparison of trees (% similarity based on quartetts)...") - df_comp = df.sum(axis=1).sort_index()/nquartets + dif = len(df.columns.to_list()) # get total number of actually computed quartets. This can be smaller as nquartets eg. when there are few tips! + df_comp = df.sum(axis=1).sort_index()/dif df_comp = df_comp.sort_index() combinations = df_comp.index.to_list() index_names = ["T"+ str(i+1) for i in range(0,len(tl))] From 25e5490134ce1669570a5e1299d925b00482e088 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Thu, 2 Jun 2022 09:42:41 +0200 Subject: [PATCH 24/26] plot-tree.R: Fix color generation for < 11 colors) --- bin/plot-tree.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/plot-tree.R b/bin/plot-tree.R index 279bd7f..6f7e59e 100644 --- a/bin/plot-tree.R +++ b/bin/plot-tree.R @@ -8,6 +8,7 @@ suppressMessages(library(patchwork)) suppressMessages(library(cowplot)) suppressMessages(library(ggplot2)) suppressMessages(library(reshape2)) +suppressMessages(library(RColorBrewer)) args <- commandArgs(trailingOnly=TRUE) @@ -349,7 +350,7 @@ if (runmode == "plot") { } else { if (length(na.omit(lineages[,level])) <= 11) { # prettier colors when there are not too many different models - cols <- brewer.pal(length(node_names), "BrBG") + cols <- brewer.pal(length(na.omit(lineages[,level])), "BrBG") } else { # we need more colors color <- grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = T)] From 8aa7a12f019d046bc778da5787300cc658bdcfee Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Thu, 2 Jun 2022 09:43:28 +0200 Subject: [PATCH 25/26] phylociraptor: Fix raxml tree path searching for util --- phylociraptor | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phylociraptor b/phylociraptor index 612bc71..66a0856 100755 --- a/phylociraptor +++ b/phylociraptor @@ -953,7 +953,7 @@ elif args.command == "util": elif qs_args.intrees == "all": print(now(), "Will compare all trees, this expects phylociraptor mltree and/or speciestree to be finished.") iqtree_trees = glob.glob("results/phylogeny-*/iqtree/*/concat.contree") - raxml_trees = glob.glob("results/phylogeny-*/raxml/*/concat.tre") + raxml_trees = glob.glob("results/phylogeny-*/raxml/*/raxmlng.raxml.support") astral_trees = glob.glob("results/phylogeny-*/astral/*/species_tree.tre") all_trees = ",".join(iqtree_trees + raxml_trees + astral_trees) else: @@ -994,7 +994,7 @@ elif args.command == "util": elif qs_args.intrees == "all": print(now(), "Will compare all trees, this expects phylociraptor mltree and/or speciestree to be finished.") iqtree_trees = glob.glob("results/phylogeny-*/iqtree/*/concat.contree") - raxml_trees = glob.glob("results/phylogeny-*/raxml/*/concat.tre") + raxml_trees = glob.glob("results/phylogeny-*/raxml/*/raxmlng.raxml.support") astral_trees = glob.glob("results/phylogeny-*/astral/*/species_tree.tre") all_trees = ",".join(iqtree_trees + raxml_trees + astral_trees) else: From 0017e9e62a72e8f612ae65957dd991daec05a1e7 Mon Sep 17 00:00:00 2001 From: Philipp Resl Date: Tue, 7 Jun 2022 10:53:19 +0200 Subject: [PATCH 26/26] Bump version to 0.9.7 --- .version | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.version b/.version index 85b7c69..c81aa44 100644 --- a/.version +++ b/.version @@ -1 +1 @@ -0.9.6 +0.9.7 diff --git a/README.md b/README.md index 3ed137c..d5ac04e 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ $ ./phylociraptor / .___/_/ /_/\__, /_/\____/\___/_/_/ \__,_/ .___/\__/\____/_/ /_/ /____/ /_/ - the rapid phylogenomic tree calculator, ver.0.9.0 + the rapid phylogenomic tree calculator, ver.0.9.7 Usage: phylociraptor