From b51959d1c8de985f5ce52cf66cc56eb44622498b Mon Sep 17 00:00:00 2001
From: kopardev
Date: Mon, 15 Mar 2021 15:33:16 -0400
Subject: [PATCH] Complying with new snakemake folder structure

---
 README.md                                     |  55 +---
 config/config.yaml                            |   2 +-
 config/tools.yaml                             |   2 +
 ...mpliconSeq.bash => run_circRNA_AmpliconSeq |   2 +-
 .../Snakefile                                 |   3 +
 workflow/envs/dummy_env.yaml                  |   0
 workflow/rules/align.smk                      |  90 ++++++
 workflow/rules/init.smk                       |  96 +++++++
 workflow/rules/quant.smk                      |  53 ++++
 workflow/rules/trim.smk                       |  51 ++++
 workflow/scripts/build_primer_bed.py          |  35 +++
 workflow/scripts/gather_cluster_stats.sh      |  67 +++++
 .../generate_bsj_fasta_from_primer_bed.py     | 106 +++++++
 .../get_donor_acceptor_from_salmon_quant.py   |  34 +++
 workflow/scripts/pie_plots.R                  |  95 +++++++
 workflow/scripts/run_wrapper.bash             | 268 ++++++++++++++++++
 16 files changed, 916 insertions(+), 43 deletions(-)
 rename run_circRNA_AmpliconSeq.bash => run_circRNA_AmpliconSeq (99%)
 mode change 100644 => 100755
 rename circRNA_ampliconSeq.snakefile => workflow/Snakefile (91%)
 create mode 100644 workflow/envs/dummy_env.yaml
 create mode 100644 workflow/rules/align.smk
 create mode 100644 workflow/rules/init.smk
 create mode 100644 workflow/rules/quant.smk
 create mode 100644 workflow/rules/trim.smk
 create mode 100644 workflow/scripts/build_primer_bed.py
 create mode 100644 workflow/scripts/gather_cluster_stats.sh
 create mode 100644 workflow/scripts/generate_bsj_fasta_from_primer_bed.py
 create mode 100644 workflow/scripts/get_donor_acceptor_from_salmon_quant.py
 create mode 100644 workflow/scripts/pie_plots.R
 create mode 100644 workflow/scripts/run_wrapper.bash

diff --git a/README.md b/README.md
index 04f0b58..33948c1 100644
--- a/README.md
+++ b/README.md
@@ -1,43 +1,16 @@
-# CCBR Snakemake Pipeline Cookiecutter
-This is a dummy folder framework for CCBR snakemake workflows.
-New workflows can be started using this repository as a template.
+# CCBR circRNA AmpliconSeq Snakemake Workflow
+This is a Snakemake workflow for processing circRNA AmpliconSeq datasets generated with divergent primers to study circRNAs in KSHV and human hosts. Basic usage instructions are as follows:
 
-## Creating PAT for GH
-This is a prerequisite for the next step. You will need [gh cli](https://cli.github.com/) installed on your laptop or use `/data/CCBR_Pipeliner/db/PipeDB/bin/gh_1.7.0_linux_amd64/bin/gh` on biowulf. Skip if can access github in an automated way already.
-
-Personal Access Token (PAT) is required to access GitHub (GH) without having to authenticate by other means (like password) every single time. You can create a PAT by going [here](https://github.com/settings/tokens). Then you can copy the PAT and save it into a file on biowulf (say `~/gh_token`). Next, you can run the following command to set everything up correctly on biowulf (or your laptop)
-```
-gh auth login --with-token < ~/git_token
-```
-
-## Creating new repository
-You can use [gh cli](https://cli.github.com/) to
- * create a new repository under CCBR, and
- * copy over the template code from CCBR_SnakemakePipelineCookiecutter
-with the following command
-```
-gh repo create CCBR/ \
---description "" \
---public \
---template CCBR/CCBR_SnakemakePipelineCookiecutter \
---confirm
-```
-On biowulf, you may have to specify the full path of the `gh` executable is located here: `/data/CCBR_Pipeliner/db/PipeDB/bin/gh_1.7.0_linux_amd64/bin/gh`
-
-Then you can clone a local copy of the new repository:
-```
-git clone https://github.com/CCBR/.git
-```
-
-If you drop the `CCBR/` from the `gh` command above, then the new repo is created under your username. The commands would then look like this:
-```
-gh repo create \
---description "" \
---public \
---template CCBR/CCBR_SnakemakePipelineCookiecutter \
---confirm
-
-git clone https://github.com//.git
-```
-
-You can change `--public` to `--private` in the above `gh` command to make the newly created repository private.
+```
+USAGE:
+  bash ./run_circRNA_AmpliconSeq -m/--runmode=<RUNMODE> -w/--workdir=<WORKDIR>
+Required Arguments:
+1. RUNMODE: [Type: String] Valid options:
+    *) init     : initialize workdir
+    *) run      : run with slurm
+    *) reset    : DELETE workdir and re-init it
+    *) dryrun   : dry run snakemake to generate DAG
+    *) unlock   : unlock workdir if locked by snakemake
+    *) runlocal : run without submitting to sbatch
+2. WORKDIR: [Type: String]: Absolute or relative path to the output folder with write permissions.
+```
\ No newline at end of file
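A typical session with this usage, assuming the wrapper accepts the long-option form shown in the usage string (the workdir path here is illustrative):

```
bash ./run_circRNA_AmpliconSeq --runmode=init --workdir=/scratch/$USER/circRNA_test    # create and populate the workdir
bash ./run_circRNA_AmpliconSeq --runmode=dryrun --workdir=/scratch/$USER/circRNA_test  # sanity-check the DAG
bash ./run_circRNA_AmpliconSeq --runmode=run --workdir=/scratch/$USER/circRNA_test     # submit to slurm
```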
diff --git a/config/config.yaml b/config/config.yaml
index 866c093..bd2cb79 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -28,7 +28,7 @@ aggregate_quant_rowsum_filter: 50    # aggregate quant.sf across samples and fi
 #
 #
 ## you most probably dont need to change these
-scriptsdir: "PIPELINE_HOME/scripts"
+scriptsdir: "PIPELINE_HOME/workflow/scripts"
 resourcesdir: "PIPELINE_HOME/resources"
 tools: "PIPELINE_HOME/config/tools.yaml"
 cluster: "PIPELINE_HOME/config/cluster.json"
diff --git a/config/tools.yaml b/config/tools.yaml
index ca27fdb..4cb565b 100644
--- a/config/tools.yaml
+++ b/config/tools.yaml
@@ -16,6 +16,8 @@ salmon:
   version: "salmon/1.4.0"
 sambamba:
   version: "sambamba/0.8.0"
+R:
+  version: "R/4.0.3"
 # samtools:
 #   version: "samtools/1.11"
 # star:
diff --git a/run_circRNA_AmpliconSeq.bash b/run_circRNA_AmpliconSeq
old mode 100644
new mode 100755
similarity index 99%
rename from run_circRNA_AmpliconSeq.bash
rename to run_circRNA_AmpliconSeq
index 1994d61..6274795
--- a/run_circRNA_AmpliconSeq.bash
+++ b/run_circRNA_AmpliconSeq
@@ -28,7 +28,7 @@ function get_git_commitid_tag() {
 PIPELINE_HOME=$(readlink -f $(dirname "$0"))
 echo "Pipeline Dir: $PIPELINE_HOME"
 # set snakefile
-SNAKEFILE="${PIPELINE_HOME}/circRNA_ampliconSeq.snakefile"
+SNAKEFILE="${PIPELINE_HOME}/workflow/Snakefile"
 # get github commit tag
 GIT_COMMIT_TAG=$(get_git_commitid_tag $PIPELINE_HOME)
diff --git a/circRNA_ampliconSeq.snakefile b/workflow/Snakefile
similarity index 91%
rename from circRNA_ampliconSeq.snakefile
rename to workflow/Snakefile
index 6537c2e..f99ab8b 100644
--- a/circRNA_ampliconSeq.snakefile
+++ b/workflow/Snakefile
@@ -1,3 +1,6 @@
+from snakemake.utils import min_version
+min_version("5.24.1")
+
 from os.path import join
 
 include: join("rules/init.smk")
diff --git a/workflow/envs/dummy_env.yaml b/workflow/envs/dummy_env.yaml
new file mode 100644
index 0000000..e69de29
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
new file mode 100644
index 0000000..70e09e1
--- /dev/null
+++ b/workflow/rules/align.smk
@@ -0,0 +1,90 @@
+# if config['bsjfa']:
+#     BSJFA=config['bsjfa']
+# else:
+#     BSJFA=config['bsjfadefault']
+# check_readaccess(BSJFA)
+PRIMERINFO=config['primertsv']
+check_readaccess(PRIMERINFO)
+REFFA=config['reffa']
+check_readaccess(REFFA)
+
+rule create_primerbed:
+    input:
+        primertsv=PRIMERINFO
+    output:
+        primerbed=join(WORKDIR,"bowtie_index","primers.bed")
+    params:
+        script=join(SCRIPTSDIR,"build_primer_bed.py")
+    envmodules: TOOLS["python37"]["version"]
+    shell:"""
+python {params.script} {input.primertsv} > {output.primerbed}
+"""
+
+rule create_bsjfa:
+    input:
+        bed=rules.create_primerbed.output.primerbed,
+        reffa=REFFA
+    output:
+        bsjfa=join(WORKDIR,"bowtie_index","bsj.fa")
+    params:
+        script=join(SCRIPTSDIR,"generate_bsj_fasta_from_primer_bed.py")
+    container: "docker://nciccbr/ccbr_htseq_0.13.5:latest"
+    shell:"""
+python {params.script} \
+--bed {input.bed} \
+--reffa {input.reffa} \
+--outfa {output.bsjfa}
+"""
+
+rule build_index:
+    input:
+        bsjfa=rules.create_bsjfa.output.bsjfa,
+    output:
+        join(WORKDIR,"bowtie_index","bsj.1.bt2")
+    params:
+        outdir=join(WORKDIR,"bowtie_index")
+    envmodules: TOOLS["bowtie2"]["version"]
+    threads: 2
+    shell: """
+# cp {input.bsjfa} {params.outdir}
+cd {params.outdir}
+bsjfa=$(basename {input.bsjfa})
+bowtie2-build $bsjfa bsj
+# gzip -n -f {input.bsjfa}
+"""
+
+rule align:
+    input:
+        r1=rules.cutadapt.output.of1,
+        r2=rules.cutadapt.output.of2,
+        bt2_index=rules.build_index.output,
+    output:
+        bam=join(WORKDIR,"results","{sample}","bam","{sample}.bam")
+    params:
+        sample="{sample}",
+        workdir=WORKDIR,
+        outdir=join(WORKDIR,"results","{sample}","bowtie2"),
+        peorse=get_peorse,
+        bowtie2_alignment_parameters=config["bowtie2_alignment_parameters"],
+        mem=MEMORY
+    threads: 56
+    envmodules: TOOLS["bowtie2"]["version"], TOOLS["sambamba"]["version"]
+    shell:"""
+bt2_index=$(echo "{input.bt2_index}"|awk -F".1.bt2" '{{print $1}}')
+if [ "{params.peorse}" == "PE" ];then
+    bowtie2 \
+        --threads {threads} \
+        {params.bowtie2_alignment_parameters} \
+        -x $bt2_index \
+        -U {input.r1},{input.r2}
+else
+    bowtie2 \
+        --threads {threads} \
+        {params.bowtie2_alignment_parameters} \
+        -x $bt2_index \
+        -U {input.r1}
+fi \
+| awk -F"\\t" '{{if ($6 ~ /60M/ || $1 ~ /^@/){{print}}}}' \
+| sambamba view --nthreads={threads} -S --format=bam -o=/dev/stdout /dev/stdin \
+| sambamba sort --memory-limit={params.mem}G --tmpdir=/dev/shm --nthreads={threads} --out={output.bam} /dev/stdin
+"""
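Note that paired-end reads are deliberately aligned as two unpaired pools (`-U {input.r1},{input.r2}`), presumably because each mate of a divergent-primer amplicon can cross the junction independently. The awk stage after bowtie2 then keeps SAM header lines plus only those alignments whose CIGAR string (field 6) contains `60M`, i.e. 60 contiguous matched bases — twice the default 30-nt flank, so the match necessarily spans the back-splice junction (being a substring match, any CIGAR containing `60M` passes). A toy illustration with two fabricated SAM records:

```
printf '@HD\tVN:1.6\nread1\t0\tbsj1\t271\t42\t60M\t*\t0\t0\t*\t*\nread2\t0\tbsj1\t280\t42\t30M2I28M\t*\t0\t0\t*\t*\n' \
| awk -F"\t" '{if ($6 ~ /60M/ || $1 ~ /^@/){print}}'
# keeps @HD and read1 (CIGAR 60M); drops read2 (gapped CIGAR 30M2I28M)
```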
permissions!".format(filename)) + return True + + +def check_writeaccess(filename): + """Checks permissions to see if user can write to a file + :param filename : Name of file to check + """ + filename=filename.strip() + check_existence(filename) + if not os.access(filename,os.W_OK): + sys.exit("File: {} exists, but user cannot write to file due to permissions!".format(filename)) + return True + + +# +MEMORY="100" +# get working dir from config +WORKDIR = config["workdir"] + +# get resources folder +try: + RESOURCESDIR = config["resourcesdir"] +except KeyError: + RESOURCESDIR = join(WORKDIR,"resources") + +# get scripts folder +try: + SCRIPTSDIR = config["scriptsdir"] +except KeyError: + SCRIPTSDIR = join(WORKDIR,"scripts") + +## Load tools from YAML file +with open(config["tools"]) as f: + TOOLS = yaml.safe_load(f) + +if not os.path.exists(join(WORKDIR,"fastqs")): + os.mkdir(join(WORKDIR,"fastqs")) +if not os.path.exists(join(WORKDIR,"results")): + os.mkdir(join(WORKDIR,"results")) +for f in ["samples", "tools", "cluster"]: + check_readaccess(config[f]) + +SAMPLESDF = pd.read_csv(config["samples"],sep="\t",header=0,index_col="sampleName") +SAMPLES = list(SAMPLESDF.index) +SAMPLESDF["R1"]=join(RESOURCESDIR,"dummy") +SAMPLESDF["R2"]=join(RESOURCESDIR,"dummy") +SAMPLESDF["PEorSE"]="PE" + +for sample in SAMPLES: + R1file=SAMPLESDF["path_to_R1_fastq"][sample] + R2file=SAMPLESDF["path_to_R2_fastq"][sample] + # print(sample,R1file,R2file) + check_readaccess(R1file) + R1filenewname=join(WORKDIR,"fastqs",sample+".R1.fastq.gz") + if not os.path.exists(R1filenewname): + os.symlink(R1file,R1filenewname) + SAMPLESDF.loc[[sample],"R1"]=R1filenewname + if str(R2file)!='nan': + check_readaccess(R2file) + R2filenewname=join(WORKDIR,"fastqs",sample+".R2.fastq.gz") + if not os.path.exists(R2filenewname): + os.symlink(R2file,R2filenewname) + SAMPLESDF.loc[[sample],"R2"]=R2filenewname + else: + SAMPLESDF.loc[[sample],"PEorSE"]="SE" +# print(SAMPLESDF) +# sys.exit() + + diff --git a/workflow/rules/quant.smk b/workflow/rules/quant.smk new file mode 100644 index 0000000..0e85878 --- /dev/null +++ b/workflow/rules/quant.smk @@ -0,0 +1,53 @@ +BSJFA=rules.create_bsjfa.output.bsjfa + +rule salmon: + input: + bam=rules.align.output.bam, + reffa=BSJFA, + output: + sf=join(WORKDIR,"results","{sample}","salmon","quant.sf") + params: + salmon_parameters=config["salmon_parameters"] + envmodules: TOOLS["salmon"]["version"] + shell:""" +outdir=$(dirname {output.sf}) +salmon quant \ +--threads {threads} \ +{params.salmon_parameters} \ +-t {input.reffa} \ +-a {input.bam} \ +--output $outdir +""" + +rule aggregate_quant: + input: + expand(join(WORKDIR,"results","{sample}","salmon","quant.sf"),sample=SAMPLES) + output: + mergedquant=join(WORKDIR,"results","mergedquant.tsv"), + filteredmergedquant=join(WORKDIR,"results","mergedquant.filtered.tsv") + params: + rowsumfilter=config['aggregate_quant_rowsum_filter'], + plotsdir=join(WORKDIR,"results","pieplots"), + rscript=join(SCRIPTSDIR,"pie_plots.R") + envmodules: TOOLS["salmon"]["version"],TOOLS["R"]["version"] + shell:""" +names="" +quants="" +for i in {input};do + outdir=$(dirname $i) + quants="$quants $outdir" + samplename=$(echo $outdir|awk -F"/" '{{print $(NF-1)}}') + names="$names $samplename" +done +salmon quantmerge --quants $quants --names $names --column numreads -o {output.mergedquant} +head -n1 {output.mergedquant} > {output.filteredmergedquant} +tail -n +2 {output.mergedquant} |\ +awk -F"\\t" -v r={params.rowsumfilter} -v OFS="\\t" 
'{{for(i=2;i<=NF;i++){{sum[NR]=sum[NR]+$i}};if (sum[NR] >= r) {{print sum[NR],$0}}}}' |\ +sort -k1,1gr |\ +cut -f2- >> {output.filteredmergedquant} +if [ -d {params.plotsdir} ];then rm -rf {params.plotsdir};fi && mkdir {params.plotsdir} && cd {params.plotsdir} && cp {output.filteredmergedquant} . && \ +Rscript {params.rscript} $(basename {output.filteredmergedquant}) && rm -f $(basename {output.filteredmergedquant}) +""" + + + \ No newline at end of file diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk new file mode 100644 index 0000000..9fe2ff5 --- /dev/null +++ b/workflow/rules/trim.smk @@ -0,0 +1,51 @@ +def get_fastqs(wildcards): + d=dict() + d["R1"]=SAMPLESDF["R1"][wildcards.sample] + d["R2"]=SAMPLESDF["R2"][wildcards.sample] + return d + +def get_peorse(wildcards): + return SAMPLESDF["PEorSE"][wildcards.sample] + +rule cutadapt: + input: + unpack(get_fastqs) + output: + of1=join(WORKDIR,"results","{sample}","trim","{sample}.R1.trim.fastq.gz"), + of2=join(WORKDIR,"results","{sample}","trim","{sample}.R2.trim.fastq.gz") + params: + sample="{sample}", + workdir=WORKDIR, + outdir=join(WORKDIR,"results","{sample}"), + peorse=get_peorse, + adapters=join(RESOURCESDIR,"TruSeq_and_nextera_adapters.consolidated.fa") + envmodules: TOOLS["cutadapt"]["version"] + threads: 56 + shell:""" +if [ ! -d {params.outdir} ];then mkdir {params.outdir};fi +if [ "{params.peorse}" == "PE" ];then + ## Paired-end + cutadapt --pair-filter=any \ + --nextseq-trim=2 \ + --trim-n \ + -n 5 -O 5 \ + -q 10,10 -m 35:35 \ + -b file:{params.adapters} \ + -B file:{params.adapters} \ + -j {threads} \ + -o {output.of1} -p {output.of2} \ + {input.R1} {input.R2} +else + ## Single-end + cutadapt \ + --nextseq-trim=2 \ + --trim-n \ + -n 5 -O 5 \ + -q 10,10 -m 35 \ + -b file:{params.adapters} \ + -j {threads} \ + -o {output.of1} \ + {input.R1} + touch {output.of2} +fi +""" \ No newline at end of file diff --git a/workflow/scripts/build_primer_bed.py b/workflow/scripts/build_primer_bed.py new file mode 100644 index 0000000..bebc7f2 --- /dev/null +++ b/workflow/scripts/build_primer_bed.py @@ -0,0 +1,35 @@ +import sys +primerstsv=open(sys.argv[1]).readlines() +primerstsv.pop(0) +primers=dict() +for i in primerstsv: + i=i.strip().split("\t") + circRNAnamesplit=i[0].split("_") + circRNAname="_".join(circRNAnamesplit[:-1]) + if not circRNAname in primers: + primers[circRNAname]=dict() + primers[circRNAname]["coordinates"]=list() + ForR=circRNAnamesplit[-1][0] + primers[circRNAname]["chrom"]=i[1] + strand=i[4] + primers[circRNAname]["coordinates"].append(int(i[2])) + primers[circRNAname]["coordinates"].append(int(i[3])) + primers[circRNAname]["coordinates"].sort() + if ForR == "F": + primers[circRNAname]["strand"]=strand + primers[circRNAname]["ftype"]=circRNAnamesplit[-1] + if strand == "+": + primers[circRNAname]["ASorSS"]="SS" + elif strand == "-": + primers[circRNAname]["ASorSS"]="AS" + elif ForR == "R": + primers[circRNAname]["rtype"]=circRNAnamesplit[-1] +for primer in primers: + sname=primer+"_"+primers[primer]["ftype"]+primers[primer]["rtype"]+"##"+primers[primer]["ASorSS"] + #sname="_".join([primer,primers[primer]["ASorSS"]) + #print(sname) + chrom=primers[primer]["chrom"] + start=str(primers[primer]["coordinates"][0]) + end=str(primers[primer]["coordinates"][-1]) + strand=primers[primer]["strand"] + print("\t".join([chrom,start,end,sname,".",strand])) diff --git a/workflow/scripts/gather_cluster_stats.sh b/workflow/scripts/gather_cluster_stats.sh new file mode 100644 index 0000000..49c326e --- /dev/null +++ 
diff --git a/workflow/scripts/gather_cluster_stats.sh b/workflow/scripts/gather_cluster_stats.sh
new file mode 100644
index 0000000..49c326e
--- /dev/null
+++ b/workflow/scripts/gather_cluster_stats.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+## AUTHOR : Vishal N. Koparde, Ph.D., CCBR, NCI
+## DATE : Feb 2021
+## This script gathers cluster-related statistics for jobs run on Biowulf using Snakemake by:
+## > extracting "external" jobids from snakemake.log files
+## > gathering cluster stats for each job using the "jobdata" and "jobhist" commands
+## > sorting the output by job submission time
+## > writing a TSV file to stdout
+##
+
+function node2runpartition {
+    node=$1
+    partitions_requested=$2
+    run_partition=$(for p in `echo $partitions_requested|awk '{print $NF}'|tr "," " "`;do if [ "$(freen -N $node|grep $p|awk '{print $1}'|grep $p|wc -l)" == "1" ]; then echo $p;break 1;fi;done)
+    if [ "$run_partition" == "" ];then
+        echo "unknown"
+    else
+        echo "$run_partition"
+    fi
+}
+
+
+function get_jobid_stats {
+    jobid=$1
+    declare -A jobdataarray
+    notaccountablejob=$(jobhist $jobid|grep "No accounting"|wc -l)
+    if [ "$notaccountablejob" == "1" ];then
+        jobdataarray["submit_time"]="JOBNOTACCOUNTABLE"
+        jobdataarray["jobid"]="$jobid"
+    else
+        jobdata $jobid > ${jobid}.tmp
+        awk -F"\t" '{if (NF==2) {print}}' ${jobid}.tmp > ${jobid}.data && rm -f ${jobid}.tmp
+        while read a b;do
+            jobdataarray["$a"]="$b"
+        done < ${jobid}.data
+        rm -f ${jobid}.data
+        st=${jobdataarray["submit_time"]}
+        jobdataarray["human_submit_time"]=$(date -d @$st|sed "s/ /_/g")
+        jobdataarray["alloc_node_partition"]=$(node2runpartition ${jobdataarray["alloc_node"]} ${jobdataarray["partition"]})
+        jobdataarray["run_node_partition"]=$(node2runpartition ${jobdataarray["node_list"]} ${jobdataarray["partition"]})
+    fi
+    echo -ne "${jobdataarray["submit_time"]}\t"
+    echo -ne "${jobdataarray["human_submit_time"]}\t"
+    echo -ne "${jobdataarray["jobid"]}:${jobdataarray["state"]}:${jobdataarray["job_name"]}\t"
+    echo -ne "${jobdataarray["alloc_node"]}:${jobdataarray["alloc_node_partition"]}:${jobdataarray["node_list"]}:${jobdataarray["run_node_partition"]}\t"
+    echo -ne "${jobdataarray["queued"]}:${jobdataarray["elapsed"]}:${jobdataarray["time_limit"]}\t"
+    echo -ne "${jobdataarray["avg_cpus"]}:${jobdataarray["max_cpu_used"]}:${jobdataarray["cpus_per_task"]}\t"
+    echo -ne "${jobdataarray["avg_mem"]}:${jobdataarray["max_mem_used"]}:${jobdataarray["total_mem"]}\t"
+    echo -ne "${jobdataarray["partition"]}:${jobdataarray["qos"]}\t"
+    echo -ne "${jobdataarray["username"]}:${jobdataarray["groupname"]}:${jobdataarray["account"]}\t"
+    echo -ne "${jobdataarray["work_dir"]}\t"
+    echo -ne "${jobdataarray["std_out"]}\t"
+    echo -ne "${jobdataarray["std_err"]}\n"
+}
+
+if [ "$#" != "1" ];then
+    echo " bash $0 <snakemakelogfile>"
+    exit 1
+fi
+
+snakemakelogfile=$1
+grep "with external jobid" $snakemakelogfile | awk '{print $NF}' | sed "s/['.]//g" | sort | uniq > ${snakemakelogfile}.jobids.lst
+echo -ne "##SubmitTime\tHumanSubmitTime\tJobID:JobState:JobName\tAllocNode:AllocNodePartition:RunNode:RunNodePartition\tQueueTime:RunTime:TimeLimit\tAvgCPU:MaxCPU:CPULimit\tAvgMEM:MaxMEM:MEMLimit\tPartition:QOS\tUsername:Group:Account\tWorkdir\tStdOut\tStdErr\n"
+while read jid;do
+    get_jobid_stats $jid
+done < ${snakemakelogfile}.jobids.lst |sort -k1,1n
+rm -f ${snakemakelogfile}.jobids.lst
\ No newline at end of file
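run_wrapper.bash (below) calls this script at the end of a run, but it can also be invoked by hand on any Snakemake log containing `with external jobid` lines. Note that `jobdata`, `jobhist`, and `freen` are Biowulf-specific commands, so it only works on that cluster; the paths here are illustrative:

```
bash workflow/scripts/gather_cluster_stats.sh /path/to/workdir/snakemake.log > HPC_summary.tsv
```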
diff --git a/workflow/scripts/generate_bsj_fasta_from_primer_bed.py b/workflow/scripts/generate_bsj_fasta_from_primer_bed.py
new file mode 100644
index 0000000..f164634
--- /dev/null
+++ b/workflow/scripts/generate_bsj_fasta_from_primer_bed.py
@@ -0,0 +1,106 @@
+import HTSeq
+import sys
+import argparse
+import os
+
+def read_bed_file(filename):
+    bedfile=open(filename,'r')
+    primers=dict()
+    for f in bedfile.readlines():
+        f=f.strip().split("\t")
+        primer=f[3]
+        primers[primer]=dict()
+        primers[primer]["chrom"]=f[0]
+        primers[primer]["start"]=int(f[1])
+        primers[primer]["end"]=int(f[2])
+        primers[primer]["strand"]=f[5]
+    return primers
+
+def complement(c):
+    if c=="A":
+        return "T"
+    elif c=="C":
+        return "G"
+    elif c=="G":
+        return "C"
+    elif c=="T":
+        return "A"
+    elif c=="N":
+        return "N"
+    else:
+        print("Unknown char %s. expecting A,C,G,T or N"%(c))
+        exit()
+
+def revcom(seq):
+    rc=seq[::-1].upper()
+    rc="".join(list(map(lambda x:complement(x),rc)))
+    return rc
+
+def convertnt(c):
+    if c=="A":
+        return "1"
+    elif c=="C":
+        return "4"
+    elif c=="G":
+        return "3"
+    elif c=="T":
+        return "2"
+    elif c=="N":
+        return "0"
+    else:
+        return "-1"
+
+parser = argparse.ArgumentParser(description='Create fasta file from divergent primers bed file')
+parser.add_argument('--bed', dest='primerbed', type=str, required=True,
+                    help='Divergent primers bed file with min. 4 columns (chr,start,end,name); name is expected to have a ##AS/##SS suffix')
+parser.add_argument('--reffa', dest='reffa', type=str, required=True,
+                    help='reference fasta')
+parser.add_argument('--outfa', dest='outfa', type=str, required=True,
+                    help='output fasta')
+parser.add_argument('--scanlength', dest='scanlen', type=int, required=False, default=200,
+                    help='scan length...even positive number(default 200)')
+parser.add_argument('--flankmax', dest='flankmax', type=int, required=False, default=30,
+                    help='flankmax (default 30)')
+args = parser.parse_args()
+# sequences = dict( (s.name, s) for s in HTSeq.FastaReader(args.reffa) )
+sequences = dict( (s[1], s[0]) for s in HTSeq.FastaReader(args.reffa, raw_iterator=True) )
+# sequences = dict( (s[1], s[0]) for s in HTSeq.FastaReader(args.reffa) )
+primers = read_bed_file(args.primerbed)
+# print(primers)
+# for primer in primers:
+#     print(primers[primer]["chrom"])
+outfastafile = open( args.outfa, "w" )
+offset = "N"*300
+for primer in primers:
+    ASorSS=primer.split("##")[-1]
+    seq=sequences[primers[primer]["chrom"]]
+    for i in range(primers[primer]["start"]-args.scanlen,primers[primer]["start"],1):
+        for j in range(primers[primer]["end"],primers[primer]["end"]+args.scanlen,1):
+            k=int((j-i)/2)
+            if k >= args.flankmax:
+                bsjflanka=seq[ j - args.flankmax : j ]
+                bsjflankb=seq[ i : i + args.flankmax ]
+            else:
+                bsjflanka=seq[ j-k : j ]
+                bsjflankb=seq[ i : i+k ]
+            if ASorSS == "SS":
+                five_da=bsjflanka[-2:]+"-"+seq[j:j+2]
+                three_da=seq[i-2:i]+"-"+bsjflankb[:2]
+                da=seq[j:j+2]+"-"+seq[i-2:i]
+            elif ASorSS == "AS":
+                five_da=revcom(seq[i-2:i])+"-"+revcom(bsjflankb[:2])
+                three_da=revcom(bsjflanka[-2:])+"-"+revcom(seq[j:j+2])
+                da=revcom(seq[i-2:i])+"-"+revcom(seq[j:j+2])
+            else:
+                print("Primer %s does not have AS or SS suffix"%(primer))
+                exit()
+            bsjflank=bsjflanka+bsjflankb
+            bsjflank=bsjflank.upper()
+            bsjflank_nt="".join(list(map(lambda x:convertnt(x),bsjflank)))
+            sname = "##".join([primer,primers[primer]["chrom"],str(i),str(j),five_da,three_da,da,bsjflank_nt])
+            # print(sname)
+            # print(bsjflank)
+            myseq = HTSeq.Sequence( bytes(offset+bsjflank+offset, 'utf-8'), sname)
+            myseq.write_to_fasta_file(outfastafile)
+outfastafile.close()
+
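Run standalone (the `create_bsjfa` rule passes the same flags minus the two optional ones), with illustrative file names:

```
python workflow/scripts/generate_bsj_fasta_from_primer_bed.py \
    --bed primers.bed \
    --reffa genome.fa \
    --outfa bsj.fa \
    --scanlength 200 \
    --flankmax 30
```

For every primer-pair interval this writes one contig per candidate junction (i,j) in the scanlength windows upstream of the start and downstream of the end: a 2×flankmax (60 nt by default) junction sequence padded with 300 Ns on each side, with the donor/acceptor dinucleotides and a digit-encoded copy of the flank embedded in the `##`-separated FASTA name.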
diff --git a/workflow/scripts/get_donor_acceptor_from_salmon_quant.py b/workflow/scripts/get_donor_acceptor_from_salmon_quant.py
new file mode 100644
index 0000000..7640899
--- /dev/null
+++ b/workflow/scripts/get_donor_acceptor_from_salmon_quant.py
@@ -0,0 +1,34 @@
+import HTSeq
+import sys
+import argparse
+import os
+
+def read_bed_file(filename):
+    bedfile=open(filename,'r')
+    primers=dict()
+    for f in bedfile.readlines():
+        f=f.strip().split("\t")
+        primer=f[3]
+        primers[primer]=dict()
+        primers[primer]["chrom"]=f[0]
+        primers[primer]["start"]=int(f[1])
+        primers[primer]["end"]=int(f[2])
+        primers[primer]["strand"]=f[5]
+    return primers
+
+parser = argparse.ArgumentParser(description='Create fasta file from divergent primers bed file')
+parser.add_argument('--bed', dest='primerbed', type=str, required=True,
+                    help='Divergent primers bed file with min. 4 columns (chr,start,end,name); name is expected to have a ##AS/##SS suffix')
+parser.add_argument('--reffa', dest='reffa', type=str, required=True,
+                    help='reference fasta')
+parser.add_argument('--outfa', dest='outfa', type=str, required=True,
+                    help='output fasta')
+parser.add_argument('--scanlength', dest='scanlen', type=int, required=False, default=200,
+                    help='scan length...even positive number(default 200)')
+parser.add_argument('--flankmax', dest='flankmax', type=int, required=False, default=30,
+                    help='flankmax (default 30)')
+args = parser.parse_args()
+# sequences = dict( (s.name, s) for s in HTSeq.FastaReader(args.reffa) )
+sequences = dict( (s[1], s[0]) for s in HTSeq.FastaReader(args.reffa, raw_iterator=True) )
+# sequences = dict( (s[1], s[0]) for s in HTSeq.FastaReader(args.reffa) )
+primers = read_bed_file(args.primerbed)
\ No newline at end of file
diff --git a/workflow/scripts/pie_plots.R b/workflow/scripts/pie_plots.R
new file mode 100644
index 0000000..2f32303
--- /dev/null
+++ b/workflow/scripts/pie_plots.R
@@ -0,0 +1,95 @@
+
+rm(list=ls())
+library("tidyverse")
+library("RColorBrewer")
+n <- 101
+qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
+col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
+
+read_data_table <- function(fname){
+  d=read.csv(fname,sep="\t",header=TRUE)
+  samplelist=colnames(d)[2:length(colnames(d))]
+  d %>% separate(col="Name",into=c("circRNA","ASorSS","chrom","start","end","fiveDA","threeDA","DA","NT"),sep="##") -> d
+  d$fiveDA=NULL
+  d$threeDA=NULL
+  x=list()
+  x[["table"]]=d
+  x[["samplelist"]]=samplelist
+  x[["circRNAlist"]]=unique(d$circRNA)
+  return(x)
+}
+filter_table_by_cname <- function(d,circRNA){
+  k=d$circRNA==circRNA
+  d=d[k,]
+  return(d)
+}
+filter_table_by_sname <- function(d,samplename){
+  d=data.frame(DA=d$DA,counts=d[[samplename]])
+  d=d[order(d$counts,decreasing = TRUE),]
+  d=d[!(d$counts==0),]
+  return(d)
+}
+add_perc_sign <- function(s){
+  result=""
+  if (as.character(s)!=""){
+    result=paste0(as.character(s),"%")
+  }
+  return(result)
+}
+
+create_perc_table <- function(df){
+  table_percent <- df %>% mutate(DA=DA,perc = round((counts/ sum(counts)) * 100, 1)) %>%
+    mutate(DA=DA,labels=perc,y_text=cumsum(perc)-perc/2)
+  nr=nrow(table_percent)
+  if (nr>4){
+    table_percent[5:nrow(table_percent),][["labels"]]=""
+  }
+  # min_labels=min(5,nrow(table_percent))
+  # if (sum(!table_percent$labels<20)0){
+  #   table_percent[table_percent$labels<20,][["labels"]]=""
+  # }
+  # }
+  table_percent[table_percent$labels!="",]$labels=paste(table_percent[table_percent$labels!="",]$DA,table_percent[table_percent$labels!="",]$labels)
+  table_percent$labels=lapply(table_percent$labels,add_perc_sign)
+  table_percent[table_percent$labels!="",]$labels=paste0(table_percent[table_percent$labels!="",]$labels,"(",table_percent[table_percent$labels!="",]$counts,")")
+  return(table_percent)
+}
+
+
+args = commandArgs(trailingOnly=TRUE)
+if (length(args)==0) args[1] = "mergedquant.filtered.tsv"   # fall back to the default filename only when no argument is given
+d=read_data_table(args[1])
+samplelist=d$samplelist
+circRNAlist=d$circRNAlist
+d=d$table
+
+for(i in 1:length(circRNAlist)) {
+  cname=circRNAlist[[i]][1]
+  for (j in 1:length(samplelist)){
+    sname=samplelist[[j]][1]
+    df=filter_table_by_sname(filter_table_by_cname(d,cname),sname)
+    if(nrow(df)>0){
+      df=create_perc_table(df)
+      fname=paste0(cname,"-",sname,".png")
+      # print(fname)
+      png(fname)
+      # print(ggplot(df, aes(x = "", y = perc, fill = DA)) +
+      #   geom_bar(width = 1,stat = "identity") +
+      #   geom_label_repel(aes(label = labels, y = y_text)) +
+      #   scale_fill_manual(values=sample(col_vector,100,replace=TRUE))+
+      #   coord_polar(theta = "y", start = 0) +
+      #   theme_light()+
+      #   theme(legend.position="none")+
+      #   labs(x = "", y = "", title = paste(cname,sname,sep="-")))
+      pie(df$perc,labels = df$labels,main= paste0(cname,"-",sname))
+      dev.off()
+    }
+  }
+}
+
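The `aggregate_quant` rule copies the filtered matrix into the pie-plots directory and calls this script with just the file's basename; run standalone it is simply:

```
Rscript workflow/scripts/pie_plots.R mergedquant.filtered.tsv
# writes one <circRNA>-<sample>.png pie chart per circRNA/sample pair with nonzero counts
```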
"mergedquant.filtered.tsv" +d=read_data_table(args[1]) +samplelist=d$samplelist +circRNAlist=d$circRNAlist +d=d$table + +for(i in 1:length(circRNAlist)) { + cname=circRNAlist[[i]][1] + for (j in 1:length(samplelist)){ + sname=samplelist[[j]][1] + df=filter_table_by_sname(filter_table_by_cname(d,cname),sname) + if(nrow(df)>0){ + df=create_perc_table(df) + fname=paste0(cname,"-",sname,".png") + # print(fname) + png(fname) + # print(ggplot(df, aes(x = "", y = perc, fill = DA)) + + # geom_bar(width = 1,stat = "identity") + + # geom_label_repel(aes(label = labels, y = y_text)) + + # scale_fill_manual(values=sample(col_vector,100,replace=TRUE))+ + # coord_polar(theta = "y", start = 0) + + # theme_light()+ + # theme(legend.position="none")+ + # labs(x = "", y = "", title = paste(cname,sname,sep="-"))) + pie(df$perc,labels = df$labels,main= paste0(cname,"-",sname)) + dev.off() + } + } +} + diff --git a/workflow/scripts/run_wrapper.bash b/workflow/scripts/run_wrapper.bash new file mode 100644 index 0000000..155e223 --- /dev/null +++ b/workflow/scripts/run_wrapper.bash @@ -0,0 +1,268 @@ +#!/usr/bin/env bash +# Author: Vishal Koparde, Ph.D. +# CCBR, NCI +# (c) 2021 +# +# wrapper script to run the TOBIAS snakemake workflow +# run tobias +# https://github.com/loosolab/tobias/ +# ## clone the pipeline to a folder +# ## git clone https://github.com/loosolab/TOBIAS.git + +set -eo pipefail +module purge + +SINGULARITY_BINDS="-B ${PIPELINE_HOME}:${PIPELINE_HOME} -B ${WORKDIR}:${WORKDIR}" + +function get_git_commitid_tag() { + cd $1 + gid=$(git rev-parse HEAD) + tag=$(git describe --tags $gid 2>/dev/null) + echo -ne "$gid\t$tag" +} + +# ## setting PIPELINE_HOME +PIPELINE_HOME=$(readlink -f $(dirname "$0")) +echo "Pipeline Dir: $PIPELINE_HOME" +SNAKEFILE="${PIPELINE_HOME}/Snakefile" +# get github commit tag +GIT_COMMIT_TAG=$(get_git_commitid_tag $PIPELINE_HOME) +echo "Git Commit/Tag: $GIT_COMMIT_TAG" + +function usage() { cat << EOF +run_tobias.sh: run TOBIAS for ATAC seq data +USAGE: + bash run_tobias.sh +Required Positional Argument: + MODE: [Type: Str] Valid options: + a) init : initialize workdir + b) run : run with slurm + c) reset : DELETE workdir dir and re-init it + e) dryrun : dry run snakemake to generate DAG + f) unlock : unlock workdir if locked by snakemake + g) runlocal : run without submitting to sbatch +EOF +} + +function err() { cat <<< " +# +# +# + $@ +# +# +# +" && usage && exit 1 1>&2; } + +function init() { + +if [ "$#" -eq "1" ]; then err "init needs an absolute path to the working dir"; fi +if [ "$#" -gt "2" ]; then err "init takes only one more argument"; fi +WORKDIR=$2 +x=$(echo $WORKDIR|awk '{print substr($1,1,1)}') +if [ "$x" != "/" ]; then err "working dir should be supplied as an absolute path"; fi +echo "Working Dir: $WORKDIR" +if [ -d $WORKDIR ];then err "Folder $WORKDIR already exists!"; exit 1; fi +mkdir -p $WORKDIR +sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" -e "s/WORKDIR/${WORKDIR//\//\\/}/g" ${PIPELINE_HOME}/config/config.yaml > $WORKDIR/config.yaml + +#create log and stats folders +if [ ! -d $WORKDIR/logs ]; then mkdir -p $WORKDIR/logs;echo "Logs Dir: $WORKDIR/logs";fi +if [ ! -d $WORKDIR/stats ];then mkdir -p $WORKDIR/stats;echo "Stats Dir: $WORKDIR/stats";fi + +echo "Done Initializing $WORKDIR. 
+function runcheck(){
+    if [ "$#" -eq "1" ]; then err "absolute path to the working dir needed"; usage; exit 1; fi
+    if [ "$#" -gt "2" ]; then err "too many arguments"; usage; exit 1; fi
+    WORKDIR=$2
+    echo "Working Dir: $WORKDIR"
+    if [ ! -d $WORKDIR ];then err "Folder $WORKDIR does not exist!"; exit 1; fi
+    # set singularity bind paths here, only after WORKDIR is known; setting them earlier would expand to empty strings
+    SINGULARITY_BINDS="-B ${PIPELINE_HOME}:${PIPELINE_HOME} -B ${WORKDIR}:${WORKDIR}"
+    module load python/3.7
+    module load snakemake/5.24.1
+}
+
+function dryrun() {
+    runcheck "$@"
+    run "--dry-run"
+}
+
+function unlock() {
+    runcheck "$@"
+    run "--unlock"
+}
+
+function runlocal() {
+    runcheck "$@"
+    if [ "$SLURM_JOB_ID" == "" ];then err "runlocal can only be done on an interactive node"; exit 1; fi
+    module load singularity
+    run "local"
+}
+
+function runslurm() {
+    runcheck "$@"
+    run "slurm"
+}
+
+function preruncleanup() {
+    echo "Running..."
+
+    cd $WORKDIR
+    ## check if initialized
+    for f in config.yaml samples.tsv; do
+        if [ ! -f $WORKDIR/$f ]; then err "Error: '${f}' file not found in workdir ... initialize first!";usage && exit 1;fi
+    done
+    ## Archive previous run files
+    if [ -f ${WORKDIR}/snakemake.log ];then
+        modtime=$(stat ${WORKDIR}/snakemake.log |grep Modify|awk '{print $2,$3}'|awk -F"." '{print $1}'|sed "s/ //g"|sed "s/-//g"|sed "s/://g")
+        mv ${WORKDIR}/snakemake.log ${WORKDIR}/stats/snakemake.${modtime}.log
+        if [ -f ${WORKDIR}/snakemake.log.HPC_summary.txt ];then
+            mv ${WORKDIR}/snakemake.log.HPC_summary.txt ${WORKDIR}/stats/snakemake.${modtime}.log.HPC_summary.txt
+        fi
+        if [ -f ${WORKDIR}/snakemake.stats ];then
+            mv ${WORKDIR}/snakemake.stats ${WORKDIR}/stats/snakemake.${modtime}.stats
+        fi
+    fi
+    nslurmouts=$(find ${WORKDIR} -maxdepth 1 -name "slurm-*.out" |wc -l)
+    if [ "$nslurmouts" != "0" ];then
+        for f in $(ls ${WORKDIR}/slurm-*.out);do gzip -n $f;mv ${f}.gz ${WORKDIR}/logs/;done
+    fi
+
+}
+
+function postrun() {
+    bash ${PIPELINE_HOME}/workflow/scripts/gather_cluster_stats.sh ${WORKDIR}/snakemake.log > ${WORKDIR}/snakemake.log.HPC_summary.txt
+}
+
+function run() {
+
+    if [ "$1" == "local" ];then
+
+        preruncleanup
+
+        snakemake -s $SNAKEFILE \
+        --directory $WORKDIR \
+        --printshellcmds \
+        --use-singularity \
+        --singularity-args "$SINGULARITY_BINDS" \
+        --use-envmodules \
+        --latency-wait 120 \
+        --configfile ${WORKDIR}/config.yaml \
+        --cores all \
+        --stats ${WORKDIR}/snakemake.stats \
+        2>&1|tee ${WORKDIR}/snakemake.log
+
-eq "0" ];then + snakemake -s ${PIPELINE_HOME}/$SNAKEFILE \ + --report ${WORKDIR}/runlocal_snakemake_report.html \ + --directory $WORKDIR \ + --configfile ${WORKDIR}/config.yaml + fi + + postrun + + elif [ "$1" == "slurm" ];then + + preruncleanup + + cat > ${WORKDIR}/submit_script.sbatch << EOF +#!/bin/bash +#SBATCH --job-name="insert_jobname_here" +#SBATCH --mem=10g +#SBATCH --partition="ccr,norm" +#SBATCH --time=96:00:00 +#SBATCH --cpus-per-task=2 + +module load python/3.7 +module load snakemake/5.24.1 +module load singularity + +cd \$SLURM_SUBMIT_DIR + +snakemake -s $SNAKEFILE \ +--directory $WORKDIR \ +--use-singularity \ +--singularity-args $SINGULARITY_BINDS \ +--use-envmodules \ +--printshellcmds \ +--latency-wait 120 \ +--configfile ${WORKDIR}/config.yaml \ +--cluster-config ${PIPELINE_HOME}/config/cluster.json \ +--cluster "sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}" \ +-j 500 \ +--rerun-incomplete \ +--keep-going \ +--stats ${WORKDIR}/snakemake.stats \ +2>&1|tee ${WORKDIR}/snakemake.log + +if [ "\$?" -eq "0" ];then + snakemake -s $SNAKEFILE \ + --directory $WORKDIR \ + --report ${WORKDIR}/runslurm_snakemake_report.html \ + --configfile ${WORKDIR}/config.yaml +fi + +bash ${PIPELINE_HOME}/scripts/gather_cluster_stats.sh ${WORKDIR}/snakemake.log > ${WORKDIR}/snakemake.log.HPC_summary.txt + +EOF + + sbatch ${WORKDIR}/submit_script.sbatch + + else + +snakemake $1 -s ${SNAKEFILE} \ +--directory $WORKDIR \ +--use-envmodules \ +--printshellcmds \ +--latency-wait 120 \ +--configfile ${WORKDIR}/config.yaml \ +--cluster-config ${PIPELINE_HOME}/config/cluster.json \ +--cluster "sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}" \ +-j 500 \ +--rerun-incomplete \ +--keep-going \ +--stats ${WORKDIR}/snakemake.stats + + fi + +} + +function reset() { +if [ "$#" -eq "1" ]; then err "cleanup needs an absolute path to the existing working dir"; usage; fi +if [ "$#" -gt "2" ]; then err "cleanup takes only one more argument"; usage; fi +WORKDIR=$2 +echo "Working Dir: $WORKDIR" +if [ ! -d $WORKDIR ];then err "Folder $WORKDIR does not exist!";fi +echo "Deleting $WORKDIR" +rm -rf $WORKDIR +echo "Re-Initializing $WORKDIR" +init "$@" +} + + +function main(){ + + if [ $# -eq 0 ]; then usage; exit 1; fi + + case $1 in + init) init "$@" && exit 0;; + dryrun) dryrun "$@" && exit 0;; + unlock) unlock "$@" && exit 0;; + run) runslurm "$@" && exit 0;; + runlocal) runlocal "$@" && exit 0;; + reset) reset "$@" && exit 0;; + -h | --help | help) usage && exit 0;; + -* | --*) err "Error: Failed to provide mode: ."; usage && exit 1;; + *) err "Error: Failed to provide mode: . '${1}' is not supported."; usage && exit 1;; + esac +} + +main "$@" + + + + +