diff --git a/charlie b/charlie index 24e1fbd..1c91cc1 100755 --- a/charlie +++ b/charlie @@ -52,31 +52,34 @@ GIT_COMMIT_TAG=$(get_git_commitid_tag $PIPELINE_HOME) PYTHONVERSION="3" SNAKEMAKEVERSION="7" -CLUSTER_SBATCH_CMD="sbatch --parsable --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}" -PARTITION='norm' CONDA_ACTIVATE='' PATH_PREPEND='' MODULE_LOAD='' PLATFORM=$(get_platform) +PARTITION='norm' +EXTRA_SINGULARITY_BINDS="" +TEMP_DIR="" +REFS_DIR="" +CLUSTER_PROFILE="config/unknown" if [ "$PLATFORM" == "biowulf" ]; then - EXTRA_SINGULARITY_BINDS="/lscratch" - CLUSTER_SBATCH_CMD="$CLUSTER_SBATCH_CMD --gres {cluster.gres}" + CLUSTER_PROFILE="config/slurm-biowulf" PARTITION="ccr,$PARTITION" + EXTRA_SINGULARITY_BINDS="/lscratch" CONDA_ACTIVATE='. "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" && conda activate py311' MODULE_LOAD="module load python/$PYTHONVERSION snakemake/$SNAKEMAKEVERSION singularity; $CONDA_ACTIVATE" + TEMP_DIR='/lscratch/$SLURM_JOB_ID/' + REFS_DIR="/gpfs/gsfs10/users/CCBR_Pipeliner/db/PipeDB/charlie/fastas_gtfs/" elif [ "$PLATFORM" == "fnlcr" ]; then + CLUSTER_PROFILE="config/slurm-fnlcr" EXTRA_SINGULARITY_BINDS="/scratch/local" - # activate conda env CONDA_ACTIVATE=". '/mnt/projects/CCBR-Pipelines/resources/miniconda3/etc/profile.d/conda.sh' && conda activate py311" # make sure spooker is in the path PATH_PREPEND='export PATH="/mnt/projects/CCBR-Pipelines/bin:$PATH"' MODULE_LOAD="module load singularity; $PATH_PREPEND; $CONDA_ACTIVATE" + TEMP_DIR="/scratch/local/" + REFS_DIR="/mnt/projects/CCBR-Pipelines/db/charlie/fastas_gtfs/" else - EXTRA_SINGULARITY_BINDS="" - echo """WARNING: detected platform is $PLATFORM. Please edit the following files for compatibility with your computing environment: - config.yaml - cluster.json - submit_script.sbatch + echo """WARNING: detected platform is $PLATFORM. Please edit the files in config/unknown/ & config.yaml for compatibility with your computing environment """ fi @@ -213,22 +216,26 @@ function init() { if [ -d $WORKDIR ];then err "Folder $WORKDIR already exists!"; fi mkdir -p $WORKDIR -# copy config and samples files +# copy config resources +cp -r ${PIPELINE_HOME}/config $WORKDIR/ + +# copy config template and samples files if [ ! -f $CONFIGFILE ];then sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" \ -e "s/WORKDIR/${WORKDIR//\//\\/}/g" \ -e "s/HOST/${HOST}/g" \ -e "s/ADDITIVES/${ADDITIVES}/g" \ -e "s/VIRUSES/${VIRUSES}/g" \ - ${PIPELINE_HOME}/config/$PLATFORM/config.yaml |\ - cat - ${PIPELINE_HOME}/config/containers.yaml > $CONFIGFILE + -e "s/TEMP_DIR/${TEMP_DIR//\//\\/}/g" \ + -e "s/REFS_DIR/${REFS_DIR//\//\\/}/g" \ + -e "s|CLUSTER_PROFILE|${CLUSTER_PROFILE}|g" \ + ${PIPELINE_HOME}/config/config.yaml \ + > $CONFIGFILE fi if [ ! -f $WORKDIR/nclscan.config ];then sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" -e "s/WORKDIR/${WORKDIR//\//\\/}/g" ${PIPELINE_HOME}/resources/NCLscan.config.template > $WORKDIR/nclscan.config fi -if [ ! -f $CLUSTERFILE ];then -cp ${PIPELINE_HOME}/config/$PLATFORM/cluster.json $CLUSTERFILE -fi + if [ ! -f $WORKDIR/samples.tsv ];then cp $MANIFEST $WORKDIR/samples.tsv fi @@ -247,7 +254,7 @@ echo "Done Initializing $WORKDIR. You can now edit $WORKDIR/config.yaml and $WOR function check_essential_files() { if [ ! -d $WORKDIR ];then err "Folder $WORKDIR does not exist!"; fi - for f in config.yaml samples.tsv nclscan.config cluster.json; do + for f in config.yaml samples.tsv nclscan.config; do if [ ! -f $WORKDIR/$f ]; then err "Error: '${f}' file not found in workdir ... initialize first!";fi done } @@ -299,8 +306,11 @@ function reconfig(){ -e "s/HOST/${HOST}/g" \ -e "s/ADDITIVES/${ADDITIVES}/g" \ -e "s/VIRUSES/${VIRUSES}/g" \ - ${PIPELINE_HOME}/config/$PLATFORM/config.yaml |\ - cat - ${PIPELINE_HOME}/config/containers.yaml > $CONFIGFILE + -e "s/TEMP_DIR/${TEMP_DIR//\//\\/}/g" \ + -e "s/REFS_DIR/${REFS_DIR//\//\\/}/g" \ + -e "s|CLUSTER_PROFILE|${CLUSTER_PROFILE}|g" \ + ${PIPELINE_HOME}/config/config.yaml \ + > $CONFIGFILE echo "$WORKDIR/config.yaml has been updated!" } @@ -523,9 +533,7 @@ snakemake -s $SNAKEFILE \ --printshellcmds \ --latency-wait 300 \ --configfile $CONFIGFILE \ - --cluster-config $CLUSTERFILE \ - --cluster "$CLUSTER_SBATCH_CMD" \ - --cluster-status $CLUSTERSTATUSCMD \ + --profile $CLUSTER_PROFILE \ -j 500 \ --rerun-incomplete \ --rerun-triggers $trigger \ @@ -554,8 +562,6 @@ EOF else # dry-run and unlock - echo $CLUSTER_SBATCH_CMD - snakemake $1 -s $SNAKEFILE \ --directory $WORKDIR \ --use-envmodules \ @@ -564,8 +570,7 @@ EOF --printshellcmds \ --latency-wait 300 \ --configfile $CONFIGFILE \ - --cluster-config $CLUSTERFILE \ - --cluster "$CLUSTER_SBATCH_CMD" \ + --profile $CLUSTER_PROFILE \ -j 500 \ --rerun-incomplete \ --rerun-triggers $trigger \ @@ -656,8 +661,6 @@ function main(){ # required files CONFIGFILE="${WORKDIR}/config.yaml" - CLUSTERFILE="${WORKDIR}/cluster.json" - CLUSTERSTATUSCMD="${PIPELINE_HOME}/resources/cluster_status.sh" # change group to Ziegelbauer_lab before doing anything if [ "$CHANGEGRP" == "1" ]; then change_grp "$allargs"; fi diff --git a/config/biowulf/config.yaml b/config/config.yaml similarity index 86% rename from config/biowulf/config.yaml rename to config/config.yaml index 87e534d..c94dce0 100644 --- a/config/biowulf/config.yaml +++ b/config/config.yaml @@ -4,7 +4,7 @@ workdir: "WORKDIR" # temporary directory for intermediate files that are not saved -tempdir: "/lscratch/$SLURM_JOB_ID" +tempdir: "TEMP_DIR" # tab delimited samples file ... should have the following 3 columns # sampleName path_to_R1_fastq path_to_R2_fastq @@ -90,7 +90,7 @@ resourcesdir: "PIPELINE_HOME/resources" # default cluster # cluster: "PIPELINE_HOME/resources/cluster.json" -cluster: "WORKDIR/cluster.json" +cluster: "WORKDIR/CLUSTER_PROFILE/cluster.json" adapters: "PIPELINE_HOME/resources/TruSeq_and_nextera_adapters.consolidated.fa" circexplorer_bsj_circRNA_min_reads: 3 # in addition to "known" and "low-conf" circRNAs identified by circexplorer, we also include those found in back_spliced.bed file but not classified as known/low-conf only if the number of reads supporting the BSJ call is greater than this number @@ -107,8 +107,25 @@ high_confidence_core_callers_plus_n: 1 ciri_perl_script: "/opt2/CIRI_v2.0.6/CIRI2.pl" # path in docker container # change this path to a directory containing fasta and GTF files for all host and virus genomes -fastas_gtfs_dir: "/gpfs/gsfs10/users/CCBR_Pipeliner/db/PipeDB/charlie/fastas_gtfs" +fastas_gtfs_dir: "REFS_DIR" annotation_lookups: hg38: "PIPELINE_HOME/resources/hg38_2_hg19_lookup.txt" mm39: "PIPELINE_HOME/resources/mm39_circBase_annotation_lookup.txt" + +containers: + base: "docker://nciccbr/ccbr_ubuntu_base_20.04:v7" + bowtie1: "docker://nciccbr/charlie_bowtie1:v0.1.1" + circexplorer: "docker://nciccbr/ccbr_circexplorer:v1.0" + circRNA_finder: "docker://nciccbr/charlie_circrna_finder:v1.0.1" + ciri: "docker://nciccbr/charlie_ciri2:v1.0.1" + clear: "docker://nciccbr/ccbr_clear:v2.0.1" + cutadapt: "docker://nciccbr/charlie_cutadapt_fqfilter:v1.0.1" + dcc: "docker://nciccbr/charlie_dcc:v0.2.1" + fastqc: "docker://nciccbr/ccrgb_qctools:v4.0" + mapsplice: "docker://cgrlab/mapsplice2:latest" + multiqc: "docker://nciccbr/ccbr_multiqc_1.15:v1" + picard: "docker://nciccbr/ccbr_picard_2.27.5:v1" + R: "docker://nciccbr/ccbr_r_4.3.0:v1" + star: "docker://nciccbr/ccbr_star_2.7.6a:latest" + star_ucsc_cufflinks: "docker://nciccbr/charlie_star_ucsc_cufflinks:v0.4.1" diff --git a/config/containers.yaml b/config/containers.yaml deleted file mode 100644 index 2f5f5ed..0000000 --- a/config/containers.yaml +++ /dev/null @@ -1,16 +0,0 @@ -containers: - base: "docker://nciccbr/ccbr_ubuntu_base_20.04:v7" - bowtie1: "docker://nciccbr/charlie_bowtie1:v0.1.1" - circexplorer: "docker://nciccbr/ccbr_circexplorer:v1.0" - circRNA_finder: "docker://nciccbr/charlie_circrna_finder:v1.0.1" - ciri: "docker://nciccbr/charlie_ciri2:v1.0.1" - clear: "docker://nciccbr/ccbr_clear:v2.0.1" - cutadapt: "docker://nciccbr/charlie_cutadapt_fqfilter:v1.0.1" - dcc: "docker://nciccbr/charlie_dcc:v0.2.1" - fastqc: "docker://nciccbr/ccrgb_qctools:v4.0" - mapsplice: "docker://cgrlab/mapsplice2:latest" - multiqc: "docker://nciccbr/ccbr_multiqc_1.15:v1" - picard: "docker://nciccbr/ccbr_picard_2.27.5:v1" - R: "docker://nciccbr/ccbr_r_4.3.0:v1" - star: "docker://nciccbr/ccbr_star_2.7.6a:latest" - star_ucsc_cufflinks: "docker://nciccbr/charlie_star_ucsc_cufflinks:v0.4.1" diff --git a/config/fnlcr/config.yaml b/config/fnlcr/config.yaml deleted file mode 100644 index 86e0f40..0000000 --- a/config/fnlcr/config.yaml +++ /dev/null @@ -1,114 +0,0 @@ -## you probably need to change or comment/uncomment some of these -# -# The working dir... output will be in the results subfolder of the workdir -workdir: "WORKDIR" - -# temporary directory for intermediate files that are not saved -tempdir: "/scratch/local" - -# tab delimited samples file ... should have the following 3 columns -# sampleName path_to_R1_fastq path_to_R2_fastq -samples: "WORKDIR/samples.tsv" - -# Should the CLEAR pipeline be run? True or False WITHOUT quotes -run_clear: True - -# Should the DCC pipeline be run? True or False WITHOUT quote -run_dcc: True - -# Should the MapSplice pipeline be run? True or False WITHOUT quotes -run_mapsplice: False -mapsplice_min_map_len: 50 -mapsplice_filtering: 2 # 1=less stringent 2=default - -# Should the circRNA_finder be run? True or False WITHOUT quotes -run_circRNAFinder: True -# Should the NCLscan pipeline be run? True or False WITHOUT quotes -# This can only be run for PE data -run_nclscan: False -nclscan_config: "WORKDIR/nclscan.config" - -# Should we also run find_circ? True or False WITHOUT quotes -run_findcirc: False -# findcirc_params: "--noncanonical --allhits" # this gives way too many circRNAs -findcirc_params: "--noncanonical" - -# select references .... host + viruses(comma-separated): -# select host: # options are hg38 or mm39 -# host: "hg38" -# additives: "ERCC" # options are ERCC and BAC16Insert -# viruses: "NC_009333.1" -host: "HOST" -additives: "ADDITIVES" -viruses: "VIRUSES" -# select viruses and other (ERCC/BAC): options are -# ERCC -# BAC16Insert -# -# | RefSeq Sequence | RefSeq assembly accession | Notes | -# | ---------------- | ------------------------- | ----------------------------------------------------- | -# | NC_007605.1 | GCF_002402265.1 | Human gammaherpesvirus 4 (Epstein-Barr virus) | -# | NC_000898.1 | GCF_000846365.1 | Human betaherpesvirus 6B | -# | NC_001664.4 | GCF_000845685.2 | Human betaherpesvirus 6A | -# | NC_001716.2 | GCF_000848125.1 | Human betaherpesvirus 7 | -# | NC_006273.2 | GCF_000845245.1 | Human betaherpesvirus 5 | -# | NC_009333.1 | GCF_000838265.1 | Human gammaherpesvirus 8 | -# | NC_045512.2 | GCF_009858895.2 | Severe acute respiratory syndrome-related coronavirus | -# | MN485971.1 | xx | HIV from Belgium ... GTF is hand curated | -# -# | RefSeq Sequence | RefSeq assembly accession | Notes | -# | ---------------- | ------------------------- | ------------------------------------------------------------ | -# | NC_001806.2 | GCF_000859985.2 | [Human alphaherpesvirus 1 (Herpes simplex virus type 1)](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=10298&lvl=3&lin=f&keep=1&srchmode=1&unlock) (strain 17) | -# -# | RefSeq Sequence | RefSeq assembly accession | Notes | -# | ---------------- | ------------------------- | ------------------------------------------------------------ | -# | KT899744.1 | KT899744.1 | HSV-1 strain KOS | -# | MH636806.1 | MH636806.1 | MHV68 (Murine herpesvirus 68 strain WUMS) | -# -# comma separated list -# STAR 1-pass junction filtering.... -# 1st pass of STAR generates a list of splice junctions which are filtered to be parsed to the second pass of STAR -# Separate filters can be applied to the "host"+"additives" and "viruses" defined above -# Typically, since "host"+"additives" annotations are much more well-established we filter out noncanonical and unannotated -# while keeping everything for the poorly annotated viruses -star_1pass_filter_host_noncanonical: "True" -star_1pass_filter_host_unannotated: "True" -star_1pass_filter_viruses_noncanonical: "False" -star_1pass_filter_viruses_unannotated: "False" - -alignTranscriptsPerReadNmax: "20000" - -# BSJ filters in bp: -minsize_host: 150 -minsize_virus: 150 -maxsize_host: 1000000000 -maxsize_virus: 5000 - -## you most probably dont need to change these -scriptsdir: "PIPELINE_HOME/workflow/scripts" -resourcesdir: "PIPELINE_HOME/resources" - -# default cluster -# cluster: "PIPELINE_HOME/resources/cluster.json" -cluster: "WORKDIR/cluster.json" - -adapters: "PIPELINE_HOME/resources/TruSeq_and_nextera_adapters.consolidated.fa" -circexplorer_bsj_circRNA_min_reads: 3 # in addition to "known" and "low-conf" circRNAs identified by circexplorer, we also include those found in back_spliced.bed file but not classified as known/low-conf only if the number of reads supporting the BSJ call is greater than this number -minreadcount: 3 # this is used to filter circRNAs while creating the per-sample counts table -flanksize: 18 # 18bp flank on either side of the BSJ .. used by multiple BSJ callers -dcc_strandedness: "-ss" # "-ss" for stranded library and "--nonstrand" for unstranded -cutadapt_min_length: 15 -cutadapt_n: 5 -cutadapt_max_n: 0.5 -cutadapt_O: 5 -cutadapt_q: 20 -high_confidence_core_callers: "circExplorer,circExplorer_bwa" -high_confidence_core_callers_plus_n: 1 - -ciri_perl_script: "/opt2/CIRI_v2.0.6/CIRI2.pl" # path in docker container -# change this path to a directory containing fasta and GTF files for all host and virus genomes -fastas_gtfs_dir: "/mnt/projects/CCBR-Pipelines/db/charlie/fastas_gtfs" - -annotation_lookups: - hg38: "PIPELINE_HOME/resources/hg38_2_hg19_lookup.txt" - mm39: "PIPELINE_HOME/resources/mm39_circBase_annotation_lookup.txt" diff --git a/config/biowulf/cluster.json b/config/slurm-biowulf/cluster.json similarity index 100% rename from config/biowulf/cluster.json rename to config/slurm-biowulf/cluster.json diff --git a/resources/cluster_status.sh b/config/slurm-biowulf/cluster_status.sh similarity index 99% rename from resources/cluster_status.sh rename to config/slurm-biowulf/cluster_status.sh index 262d9ed..c65b3d4 100755 --- a/resources/cluster_status.sh +++ b/config/slurm-biowulf/cluster_status.sh @@ -16,4 +16,4 @@ then echo running else echo failed -fi \ No newline at end of file +fi diff --git a/config/slurm-biowulf/config.yaml b/config/slurm-biowulf/config.yaml new file mode 100644 index 0000000..0697a63 --- /dev/null +++ b/config/slurm-biowulf/config.yaml @@ -0,0 +1,23 @@ +cluster: sbatch + --parsable + --cpus-per-task {cluster.threads} + -p {cluster.partition} + -t {cluster.time} + --mem {cluster.mem} + --job-name {cluster.name} + --output {cluster.output} + --error {cluster.error} + --gres {cluster.gres} +cluster-config: "cluster.json" +cluster-status: "cluster_status.sh" +jobs: 499 +immediate-submit: false +verbose: true +notemp: true +latency-wait: 300 +printshellcmds: true +use-singularity: true +rerun-incomplete: true +rerun-triggers: mtime +retries: 2 +keep-going: true diff --git a/config/fnlcr/cluster.json b/config/slurm-fnlcr/cluster.json similarity index 100% rename from config/fnlcr/cluster.json rename to config/slurm-fnlcr/cluster.json diff --git a/config/slurm-fnlcr/cluster_status.sh b/config/slurm-fnlcr/cluster_status.sh new file mode 100755 index 0000000..c65b3d4 --- /dev/null +++ b/config/slurm-fnlcr/cluster_status.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Check status of Slurm job +jobid="$1" +if [[ "$jobid" == Submitted ]] +then + echo smk-simple-slurm: Invalid job ID: "$jobid" >&2 + echo smk-simple-slurm: Did you remember to add the flag --parsable to your sbatch call? >&2 + exit 1 +fi +output=`sacct -j "$jobid" --format State --noheader | head -n 1 | awk '{print $1}'` +if [[ $output =~ ^(COMPLETED).* ]] +then + echo success +elif [[ $output =~ ^(RUNNING|PENDING|COMPLETING|CONFIGURING|SUSPENDED).* ]] +then + echo running +else + echo failed +fi diff --git a/config/slurm-fnlcr/config.yaml b/config/slurm-fnlcr/config.yaml new file mode 100644 index 0000000..8fa374f --- /dev/null +++ b/config/slurm-fnlcr/config.yaml @@ -0,0 +1,22 @@ +cluster: sbatch + --parsable + --cpus-per-task {cluster.threads} + -p {cluster.partition} + -t {cluster.time} + --mem {cluster.mem} + --job-name {cluster.name} + --output {cluster.output} + --error {cluster.error} +cluster-config: "cluster.json" +cluster-status: "cluster_status.sh" +jobs: 499 +immediate-submit: false +verbose: true +notemp: true +latency-wait: 300 +printshellcmds: true +use-singularity: true +rerun-incomplete: true +rerun-triggers: mtime +retries: 2 +keep-going: true