Complying with new snakemake folder structure

CCBR · Mar 15, 2021 · b51959d · b51959d
1 parent 8ab6d2d
commit b51959d
Show file tree

Hide file tree

Showing 16 changed files with 916 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -1,43 +1,16 @@
-# CCBR Snakemake Pipeline Cookiecutter
-This is a dummy folder framework for CCBR snakemake workflows.
-New workflows can be started using this repository as a template.
+# CCBR circRNA AmpliconSeq Snakemake Workflow
+This is a snakemake workflow to process circRNA AmpliconSeq datasets generated to study circRNAs in KSHV and human hosts using divergent primers. Some basic usage instructions are as follows:
 
-## Creating PAT for GH 
-This is a prerequisite for the next step. You will need [gh cli](https://cli.github.com/) installed on your laptop or use `/data/CCBR_Pipeliner/db/PipeDB/bin/gh_1.7.0_linux_amd64/bin/gh` on biowulf. Skip if can access github in an automated way already.
-
-Personal Access Token (PAT) is required to access GitHub (GH) without having to authenticate by other means (like password) every single time. You can create a PAT by going [here](https://github.com/settings/tokens). Then you can copy the PAT and save it into a file on biowulf (say `~/gh_token`). Next, you can run the following command to set everything up correctly on biowulf (or your laptop)
-```
-gh auth login --with-token < ~/git_token
-```
-
-## Creating new repository
-You can use [gh cli](https://cli.github.com/) to
- * create a new repository under CCBR, and
- * copy over the template code from CCBR_SnakemakePipelineCookiecutter
-with the following command
-```
-gh repo create CCBR/<reponame> \
---description "<repo description>" \
---public \
---template CCBR/CCBR_SnakemakePipelineCookiecutter \
---confirm
-```
-On biowulf, you may have to specify the full path of the `gh` executable is located here: `/data/CCBR_Pipeliner/db/PipeDB/bin/gh_1.7.0_linux_amd64/bin/gh`
-
-Then you can clone a local copy of the new repository:
 ```
-git clone https://github.com/CCBR/<reponame>.git
-```
-
-If you drop the `CCBR/` from the `gh` command above, then the new repo is created under your username. The commands would then look like this:
-```
-gh repo create <reponame> \
---description "<repo description>" \
---public \
---template CCBR/CCBR_SnakemakePipelineCookiecutter \
---confirm
-
-git clone https://github.com/<your_github_handle>/<reponame>.git
-```
-
-You can change `--public` to `--private` in the above `gh` command to make the newly created repository private.
+USAGE:
+  bash ./run_circRNA_AmpliconSeq -m/--runmode=<RUNMODE> -w/--workdir=<WORKDIR>
+Required Arguments:
+1.  RUNMODE: [Type: String] Valid options:
+    *) init : initialize workdir
+    *) run : run with slurm
+    *) reset : DELETE workdir dir and re-init it
+    *) dryrun : dry run snakemake to generate DAG
+    *) unlock : unlock workdir if locked by snakemake
+    *) runlocal : run without submitting to sbatch
+2.  WORKDIR: [Type: String]: Absolute or relative path to the output folder with write permissions.
+```
diff --git a/config/config.yaml b/config/config.yaml
@@ -28,7 +28,7 @@ aggregate_quant_rowsum_filter: 50     # aggregate quant.sf across samples and fi
 #
 #
 ## you most probably dont need to change these
-scriptsdir: "PIPELINE_HOME/scripts"
+scriptsdir: "PIPELINE_HOME/workflow/scripts"
 resourcesdir: "PIPELINE_HOME/resources"
 tools: "PIPELINE_HOME/config/tools.yaml"
 cluster: "PIPELINE_HOME/config/cluster.json"

diff --git a/config/tools.yaml b/config/tools.yaml
@@ -16,6 +16,8 @@ salmon:
  version: "salmon/1.4.0"
 sambamba:
  version: "sambamba/0.8.0"
+R:
+ version: "R/4.0.3"
 # samtools:
 #  version: "samtools/1.11"
 # star:

diff --git a/run_circRNA_AmpliconSeq.bash → run_circRNA_AmpliconSeq b/run_circRNA_AmpliconSeq.bash → run_circRNA_AmpliconSeq
@@ -28,7 +28,7 @@ function get_git_commitid_tag() {
 PIPELINE_HOME=$(readlink -f $(dirname "$0"))
 echo "Pipeline Dir: $PIPELINE_HOME"
 # set snakefile
-SNAKEFILE="${PIPELINE_HOME}/circRNA_ampliconSeq.snakefile"
+SNAKEFILE="${PIPELINE_HOME}/workflow/Snakefile"
 
 # get github commit tag
 GIT_COMMIT_TAG=$(get_git_commitid_tag $PIPELINE_HOME)

diff --git a/circRNA_ampliconSeq.snakefile → workflow/Snakefile b/circRNA_ampliconSeq.snakefile → workflow/Snakefile
@@ -1,3 +1,6 @@
+from snakemake.utils import min_version
+min_version("5.24.1")
+
 from os.path import join
 
 include: join("rules/init.smk")

diff --git a/workflow/envs/dummy_env.yaml b/workflow/envs/dummy_env.yaml
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
@@ -0,0 +1,90 @@
+# if config['bsjfa']:
+#     BSJFA=config['bsjfa']
+# else:
+#     BSJFA=config['bsjfadefault']
+# check_readaccess(BSJFA)
+PRIMERINFO=config['primertsv']
+check_readaccess(PRIMERINFO)
+REFFA=config['reffa']
+check_readaccess(REFFA)
+
+rule create_primerbed:
+    input:
+        primertsv=PRIMERINFO
+    output:
+        primerbed=join(WORKDIR,"bowtie_index","primers.bed")
+    params:
+        script=join(SCRIPTSDIR,"build_primer_bed.py")
+    envmodules: TOOLS["python37"]["version"]
+    shell:"""
+python {params.script} {input.primertsv} > {output.primerbed}
+"""
+
+rule create_bsjfa:
+    input:
+        bed=rules.create_primerbed.output.primerbed,
+        reffa=REFFA
+    output:
+        bsjfa=join(WORKDIR,"bowtie_index","bsj.fa")
+    params:
+        script=join(SCRIPTSDIR,"generate_bsj_fasta_from_primer_bed.py")
+    container: "docker://nciccbr/ccbr_htseq_0.13.5:latest"
+    shell:"""
+python {params.script} \
+--bed {input.bed} \
+--reffa {input.reffa} \
+--outfa {output.bsjfa}
+"""
+
+rule build_index:
+    input:
+        bsjfa=rules.create_bsjfa.output.bsjfa,
+    output:
+        join(WORKDIR,"bowtie_index","bsj.1.bt2")
+    params:
+        outdir=join(WORKDIR,"bowtie_index")
+    envmodules: TOOLS["bowtie2"]["version"]
+    threads: 2
+    shell: """
+# cp {input.bsjfa} {params.outdir}
+cd {params.outdir}
+bsjfa=$(basename {input.bsjfa})
+bowtie2-build $bsjfa bsj
+# gzip -n -f {input.bsjfa}
+"""
+
+rule align:
+    input:
+        r1=rules.cutadapt.output.of1,
+        r2=rules.cutadapt.output.of2,
+        bt2_index=rules.build_index.output,
+    output:
+        bam=join(WORKDIR,"results","{sample}","bam","{sample}.bam")
+    params:
+        sample="{sample}",
+        workdir=WORKDIR,
+        outdir=join(WORKDIR,"results","{sample}","bowtie2"),
+        peorse=get_peorse,
+        bowtie2_alignment_parameters=config["bowtie2_alignment_parameters"],
+        mem=MEMORY
+    threads: 56
+    envmodules: TOOLS["bowtie2"]["version"], TOOLS["sambamba"]["version"]
+    shell:"""
+bt2_index=$(echo "{input.bt2_index}"|awk -F".1.bt2" '{{print $1}}')
+if [ "{params.peorse}" == "PE" ];then
+    bowtie2 \
+    --threads {threads} \
+    {params.bowtie2_alignment_parameters} \
+    -x $bt2_index \
+    -U {input.r1},{input.r2}
+else
+    bowtie2 \
+    --threads {threads} \
+    {params.bowtie2_alignment_parameters} \
+    -x $bt2_index \
+    -U {input.r1}
+fi \
+| awk -F"\\t" '{{if ($6 ~ /60M/ || $1 ~ /^@/){{print}}}}' \
+| sambamba view --nthreads={threads} -S --format=bam -o=/dev/stdout /dev/stdin \
+| sambamba sort --memory-limit={params.mem}G --tmpdir=/dev/shm --nthreads={threads} --out={output.bam} /dev/stdin
+"""
diff --git a/workflow/rules/init.smk b/workflow/rules/init.smk
@@ -0,0 +1,96 @@
+import shutil
+import sys
+import os
+import pandas as pd
+import yaml
+import glob
+
+CONFIGFILE = str(workflow.overwrite_configfiles[0])
+
+def check_existence(filename):
+	"""Checks if file exists on filesystem
+	:param filename <str>: Name of file to check
+	"""
+	filename=filename.strip()
+	if not os.path.exists(filename):
+		sys.exit("File: {} does not exists!".format(filename))
+	return True
+
+
+def check_readaccess(filename):
+	"""Checks permissions to see if user can read a file
+	:param filename <str>: Name of file to check
+	"""
+	filename=filename.strip()
+	check_existence(filename)
+	if not os.access(filename,os.R_OK):
+		sys.exit("File: {} exists, but user cannot read from file due to permissions!".format(filename))
+	return True
+
+
+def check_writeaccess(filename):
+	"""Checks permissions to see if user can write to a file
+	:param filename <str>: Name of file to check
+	"""
+	filename=filename.strip()
+	check_existence(filename)
+	if not os.access(filename,os.W_OK):
+		sys.exit("File: {} exists, but user cannot write to file due to permissions!".format(filename))
+	return True
+
+
+#
+MEMORY="100"
+# get working dir from config
+WORKDIR = config["workdir"]
+
+# get resources folder
+try:
+	RESOURCESDIR = config["resourcesdir"]
+except KeyError:
+	RESOURCESDIR = join(WORKDIR,"resources")
+
+# get scripts folder
+try:
+	SCRIPTSDIR = config["scriptsdir"]
+except KeyError:
+	SCRIPTSDIR = join(WORKDIR,"scripts")
+
+## Load tools from YAML file
+with open(config["tools"]) as f:
+	TOOLS = yaml.safe_load(f)
+
+if not os.path.exists(join(WORKDIR,"fastqs")):
+	os.mkdir(join(WORKDIR,"fastqs"))
+if not os.path.exists(join(WORKDIR,"results")):
+	os.mkdir(join(WORKDIR,"results"))
+for f in ["samples", "tools", "cluster"]:
+	check_readaccess(config[f])
+
+SAMPLESDF = pd.read_csv(config["samples"],sep="\t",header=0,index_col="sampleName")
+SAMPLES = list(SAMPLESDF.index)
+SAMPLESDF["R1"]=join(RESOURCESDIR,"dummy")
+SAMPLESDF["R2"]=join(RESOURCESDIR,"dummy")
+SAMPLESDF["PEorSE"]="PE"
+
+for sample in SAMPLES:
+	R1file=SAMPLESDF["path_to_R1_fastq"][sample]
+	R2file=SAMPLESDF["path_to_R2_fastq"][sample]
+	# print(sample,R1file,R2file)
+	check_readaccess(R1file)
+	R1filenewname=join(WORKDIR,"fastqs",sample+".R1.fastq.gz")
+	if not os.path.exists(R1filenewname):
+		os.symlink(R1file,R1filenewname)
+	SAMPLESDF.loc[[sample],"R1"]=R1filenewname
+	if str(R2file)!='nan':
+		check_readaccess(R2file)
+		R2filenewname=join(WORKDIR,"fastqs",sample+".R2.fastq.gz")
+		if not os.path.exists(R2filenewname):
+			os.symlink(R2file,R2filenewname)
+		SAMPLESDF.loc[[sample],"R2"]=R2filenewname
+	else:
+		SAMPLESDF.loc[[sample],"PEorSE"]="SE"
+# print(SAMPLESDF)
+# sys.exit()
+
+
diff --git a/workflow/rules/quant.smk b/workflow/rules/quant.smk
@@ -0,0 +1,53 @@
+BSJFA=rules.create_bsjfa.output.bsjfa
+
+rule salmon:
+    input:
+        bam=rules.align.output.bam,
+        reffa=BSJFA,
+    output:
+        sf=join(WORKDIR,"results","{sample}","salmon","quant.sf")
+    params:
+        salmon_parameters=config["salmon_parameters"]
+    envmodules: TOOLS["salmon"]["version"]
+    shell:"""
+outdir=$(dirname {output.sf})
+salmon quant \
+--threads {threads} \
+{params.salmon_parameters} \
+-t {input.reffa} \
+-a {input.bam} \
+--output $outdir
+"""
+
+rule aggregate_quant:
+    input:
+        expand(join(WORKDIR,"results","{sample}","salmon","quant.sf"),sample=SAMPLES)
+    output:
+        mergedquant=join(WORKDIR,"results","mergedquant.tsv"),
+        filteredmergedquant=join(WORKDIR,"results","mergedquant.filtered.tsv")
+    params:
+        rowsumfilter=config['aggregate_quant_rowsum_filter'],
+        plotsdir=join(WORKDIR,"results","pieplots"),
+        rscript=join(SCRIPTSDIR,"pie_plots.R")
+    envmodules: TOOLS["salmon"]["version"],TOOLS["R"]["version"]
+    shell:"""
+names=""
+quants=""
+for i in {input};do
+    outdir=$(dirname $i)
+    quants="$quants $outdir"
+    samplename=$(echo $outdir|awk -F"/" '{{print $(NF-1)}}')
+    names="$names $samplename"
+done
+salmon quantmerge --quants $quants --names $names --column numreads -o {output.mergedquant}
+head -n1 {output.mergedquant} > {output.filteredmergedquant}
+tail -n +2 {output.mergedquant} |\
+awk -F"\\t" -v r={params.rowsumfilter} -v OFS="\\t" '{{for(i=2;i<=NF;i++){{sum[NR]=sum[NR]+$i}};if (sum[NR] >= r) {{print sum[NR],$0}}}}' |\
+sort -k1,1gr |\
+cut -f2- >> {output.filteredmergedquant}
+if [ -d {params.plotsdir} ];then rm -rf {params.plotsdir};fi && mkdir {params.plotsdir} && cd {params.plotsdir} && cp {output.filteredmergedquant} . && \
+Rscript {params.rscript} $(basename {output.filteredmergedquant}) && rm -f $(basename {output.filteredmergedquant})
+"""
+
+
+
diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
@@ -0,0 +1,51 @@
+def get_fastqs(wildcards):
+    d=dict()
+    d["R1"]=SAMPLESDF["R1"][wildcards.sample]
+    d["R2"]=SAMPLESDF["R2"][wildcards.sample]
+    return d
+
+def get_peorse(wildcards):
+    return SAMPLESDF["PEorSE"][wildcards.sample]
+
+rule cutadapt:
+    input:
+        unpack(get_fastqs)
+    output:
+        of1=join(WORKDIR,"results","{sample}","trim","{sample}.R1.trim.fastq.gz"),
+        of2=join(WORKDIR,"results","{sample}","trim","{sample}.R2.trim.fastq.gz")
+    params:
+        sample="{sample}",
+        workdir=WORKDIR,
+        outdir=join(WORKDIR,"results","{sample}"),
+        peorse=get_peorse,
+        adapters=join(RESOURCESDIR,"TruSeq_and_nextera_adapters.consolidated.fa")
+    envmodules: TOOLS["cutadapt"]["version"]
+    threads: 56
+    shell:"""
+if [ ! -d {params.outdir} ];then mkdir {params.outdir};fi
+if [ "{params.peorse}" == "PE" ];then
+    ## Paired-end
+    cutadapt --pair-filter=any \
+    --nextseq-trim=2 \
+    --trim-n \
+    -n 5 -O 5 \
+    -q 10,10 -m 35:35 \
+    -b file:{params.adapters} \
+    -B file:{params.adapters} \
+    -j {threads} \
+    -o {output.of1} -p {output.of2} \
+    {input.R1} {input.R2}
+else
+    ## Single-end
+    cutadapt \
+    --nextseq-trim=2 \
+    --trim-n \
+    -n 5 -O 5 \
+    -q 10,10 -m 35 \
+    -b file:{params.adapters} \
+    -j {threads} \
+    -o {output.of1} \
+    {input.R1}
+    touch {output.of2}
+fi
+"""