diff --git a/BUGS.md b/BUGS.md new file mode 100644 index 0000000..63df50a --- /dev/null +++ b/BUGS.md @@ -0,0 +1,6 @@ + * For some reason, library(tidyverse) does not work correctly in the + environment seasnap-de. However, dplyr and other core tidyverse packages + load without issue. + + * cluster_profiler doesn't work, but only when called from the pipeline. + Running the script interactively works well. diff --git a/DE_pipeline.snake b/DE_pipeline.snake index 91802d0..42b4980 100644 --- a/DE_pipeline.snake +++ b/DE_pipeline.snake @@ -64,11 +64,14 @@ def get_inputs_all(): #functional annotation inputs.append(pph.expand_path(step = "goseq", extension = "go.rds", if_set = dict(goseq=True) )) inputs.append(pph.expand_path(step = "goseq", extension = "kegg.rds", if_set = dict(goseq=True) )) - inputs.append(pph.file_path(step = "annotation", extension = "rds", contrast="all")) + #inputs.append(pph.file_path(step = "annotation", extension = "rds", contrast="all")) inputs.append(pph.file_path(step = "export_raw_counts", extension = "xlsx", contrast = "all")) inputs.append(pph.expand_path(step = "cluster_profiler", extension = "rds", if_set = dict(cluster_profiler=dict(run=True)) )) - inputs.append(pph.file_path(step = "tmod_dbs", extension = "rds", contrast="all")) - inputs.append(pph.file_path(step = "tmod_pca", extension = "rds", contrast="all", if_set=dict(tmod_pca=True))) + # XXX tmod_pca output is added only when enabled in the config + + if config["contrasts"]["defaults"]["tmod_pca"]: + inputs.append(pph.file_path(step = "tmod_pca", extension = "rds", contrast="all")) + inputs.append(pph.expand_path(step = "tmod", extension = "rds", if_set = dict(tmod=True))) #time series diff --git a/README.md b/README.md index 08833ad..bad234c 100644 --- a/README.md +++ b/README.md @@ -46,21 +46,39 @@ Quick-Start After cloning this git repository: ``` -git clone git@cubi-gitlab.bihealth.org:CUBI/Pipelines/sea-snap.git +git clone git@cubi-gitlab.bihealth.org:CUBI/Pipelines/seasnap-pipeline.git ``` all required tools and packages can be installed via conda. + +Currently there are two separate conda environments, one for the mapping +pipeline and one for the DE pipeline. + +Download and install them into new environments called `seasnap-mapping` +and `seasnap-de`: ``` -conda env create -f conda_env.yaml +conda env create -f conda_env_mapping.yaml +conda env create -f conda_env_DE.yaml ``` -The file `conda_env.yaml` is located in the main directory of the git repository. +The files `conda_env_mapping.yaml` and `conda_env_DE.yaml` are located in the main directory of the git repository. Each time before using SeA-SnaP, activate the environment with: ``` -conda activate sea_snap +conda activate seasnap-mapping +``` + +or +``` +conda activate seasnap-de +``` + +Finally, run the following commands in the seasnap-de environment: + +``` +conda activate seasnap-de +Rscript install_r_packages.R ``` ### Running the pipeline @@ -90,7 +108,6 @@ The next steps depend on whether you want to run: - [**`The mapping pipeline`**](documentation/run_mapping.md) - [**`The DE pipeline`**](documentation/run_DE.md) -- [**`The sc pipeline`**](documentation/run_sc.md) The results of an analysis can also be [`exported`](documentation/export.md) to a new folder structure, e.g. to upload them to SODAR.
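Putting the quick-start steps above together, a complete first-time setup might look like this (a minimal sketch of the commands documented in the README; the checkout directory name is assumed to match the repository name):

```
# clone the pipeline and create the two conda environments
git clone git@cubi-gitlab.bihealth.org:CUBI/Pipelines/seasnap-pipeline.git
cd seasnap-pipeline
conda env create -f conda_env_mapping.yaml   # creates the environment "seasnap-mapping"
conda env create -f conda_env_DE.yaml        # creates the environment "seasnap-de"

# install the remaining R packages into the DE environment
conda activate seasnap-de
Rscript install_r_packages.R                 # installs bihealth/orthomapper via remotes
```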
diff --git a/TODO.md b/TODO.md index 55b1835..fca182e 100644 --- a/TODO.md +++ b/TODO.md @@ -2,5 +2,8 @@ The roadmap ahead: 1) merging Eric's branches and the ATAC_seq branch 2) making sure sea-snap runs smoothly with newer versions of snakemake (it -doesn't currently) -3) making the step from drmaa to cluster profiles. +doesn't currently) --> DONE for the mapping pipeline with snakemake 7.19.1 +3) switching from drmaa to cluster profiles --> mostly DONE for the mapping pipeline +4) optimizing resource requirements in `mapping_pipeline.snake` (check `mem` vs `mem_per_cpu`) and adding them to `DE_pipeline.snake` +5) allowing sample-specific indices for bwa / salmon / kallisto? +6) adapting the `conda_env.yaml` files diff --git a/cluster_config_drmaa.json b/cluster_config_drmaa.json index 03e7a6f..4852c23 100644 --- a/cluster_config_drmaa.json +++ b/cluster_config_drmaa.json @@ -33,7 +33,7 @@ "salmon": { - "h_vmem": "4000", + "h_vmem": "4000", "h_rt": "40:00:00", "pe": "8" }, @@ -108,19 +108,12 @@ "pe": "4" }, - "cellranger_count": - { - "h_vmem": "4000", - "h_rt": "40:00:00", + "cluster_profiler": + { "h_vmem": "20000", + "h_rt": "10:00:00", "pe": "1" - }, - - "velocyto_run": - { - "h_vmem": "20000", - "h_rt": "40:00:00", - "pe": "8" } + } diff --git a/cluster_config_slurm.json b/cluster_config_slurm.json index 80823d7..49526b8 100644 --- a/cluster_config_slurm.json +++ b/cluster_config_slurm.json @@ -3,7 +3,7 @@ { "snake_opt": "-j 100 -k --restart-times 0 --max-jobs-per-second 5 --rerun-incomplete", - "run_command": "--profile \"cubi-v1\"" + "run_command": "--profile \"cubi-dev\"" }, "__default__": @@ -19,15 +19,15 @@ "star": { - "mem": "20000M", - "time": "40:00:00", + "mem": "60000M", + "time": "8:00:00", "cpus-per-task": 8 }, "star_index": { - "mem": "20000M", - "time": "40:00:00", + "mem": "40000M", + "time": "8:00:00", "cpus-per-task": 8 }, @@ -120,6 +120,35 @@ "mem": "20000M", "time": "40:00:00", "cpus-per-task": 8 + }, + "preseq_lc_extrap": + { + "mem": "40000M", + "time": "40:00:00", + "cpus-per-task": 8 + }, + "preseq_c_curve": + { + "mem": "40000M", + "time": "40:00:00", + "cpus-per-task": 8 + }, + "tmod_dbs": + { + "mem": "16000M", + "time": "8:00:00", + "cpus-per-task": 8 + }, + + "cluster_profiler": + { + "mem": "8000M", + "time": "4:00:00", + "cpus-per-task": 8 } + + } + + diff --git a/conda_env.yaml b/conda_env_DE.yaml similarity index 68% rename from conda_env.yaml rename to conda_env_DE.yaml index 498f1ab..7a46865 100644 --- a/conda_env.yaml +++ b/conda_env_DE.yaml @@ -1,35 +1,13 @@ -name: sea_snap +name: seasnap-de channels: - - bioconda - conda-forge + - bioconda - defaults dependencies: - - python - - r - - - bamtools - - bedtools - - fastqc - - macs2 - - multiqc - - picard - - preseq - - qualimap - - rna-seqc - - rsa - - rsem - - rseqc - - salmon - - samtools - - sqlite - - star - - subread - - tpmcalculator - - trimadap - - vim - - rpy2 - - - bioconductor-deseq2 + - python>=3.9 + - snakemake=7.19 + - r-base>=4.1 + - bioconductor-deseq2=1.38 - bioconductor-genomicfeatures - bioconductor-annotationdbi - bioconductor-org.hs.eg.db # annotation for human - - bioconductor-tximport - bioconductor-goseq - bioconductor-apeglm - - bioconductor-dupradar - bioconductor-biomart - bioconductor-enrichplot - bioconductor-clusterprofiler @@ -48,7 +25,6 @@ - bioconductor-rtracklayer - r-blob - r-cairo - - r-cellranger - r-cowplot - r-crayon - r-devtools @@ -57,20 +33,26 @@ - r-ggplotify - r-readr - r-readxl + - r-writexl + - 
r-roxygen2 - r-tidyr - r-tidyselect - r-testthat + - r-remotes - r-ashr - r-pheatmap - r-shiny - - r-msigdbr + - r-msigdbr>=7.4 - r-dplyr - r-biocmanager - r-xml2 - r-xopen - + - r-tmod>=0.50.11 +# Required for the actual report + - r-dt + - r-purrr + - r-plotly + - r-pander + - bioconductor-vsn + - r-tidyverse - drmaa - - snakemake - - diff --git a/conda_env_mapping.yaml b/conda_env_mapping.yaml new file mode 100644 index 0000000..cd9cdb4 --- /dev/null +++ b/conda_env_mapping.yaml @@ -0,0 +1,28 @@ +name: seasnap-mapping +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python>=3.9 + - snakemake=7.19 + - bamtools + - bedtools + - fastqc + - macs2 + - multiqc + - picard + - preseq + - qualimap + - rna-seqc + - rsem + - salmon + - samtools=1.16 + - sqlite + - star=2.7.3a + - subread=2.0 + - trimadap + - drmaa + - bioconductor-dupradar + - rseqc + - bwa diff --git a/defaults/sc_config_defaults.yaml b/defaults/sc_config_defaults.yaml deleted file mode 100644 index 6852f44..0000000 --- a/defaults/sc_config_defaults.yaml +++ /dev/null @@ -1,67 +0,0 @@ -#---------------------------------------- general pipeline parameters -pipeline_param: - # adjust pattern of stored files - out_path_pattern: sc_analysis/{step}/{sample}.{mate}/out/{step}.{sample}.{mate}.{extension} - log_path_pattern: sc_analysis/{step}/{sample}.{mate}/report/{step}.{sample}.{mate}.{extension} - in_path_pattern: ../input/{sample}/{sample}.{mate} - - test_config: true - report_snippets: "" - - feature_ref: "" - cellranger_executable: "cellranger" # default assuming cellranger is in the PATH - - # adjust which results are produced - produce_results: - - cellranger - - velocyto - - input_choice: - velocyto: - - cellranger_count - - cellranger_aggr - -#---------------------------------------- organism annotation -organism_defaults: null - -#---------------------------------------- parameters for rules -rule_options: - cellranger_count: - cmd_opt: "" # "--jobmode sge --maxjobs=100 --jobinterval=1000" - velocyto_run: - cmd_opt: "--samtools-memory 4000 -@ 4" - -#---------------------------------------- parameters for the jupyter notebook -jupyter_notebook: - snippet_parameters: {} - defaults: {} - report_snippets: - - load_matrix.ipynb - - filter_and_normalize.ipynb - - plot_dim_reduction.ipynb - - ranking_and_marker_genes.ipynb - - compute_velocities.ipynb - -#---------------------------------------- configuration for export -export: - blueprint: - file: SODAR_export_blueprint.txt - command: | - imkdir -p $(dirname {dest} ) - irsync -a -K {src} i:{dest} - path_pattern: - - __SODAR__/{sample}/%Y_%m_%d_{files:cellranger} - - __SODAR__/{sample}/%Y_%m_%d_cellranger/{files:cellranger_logs}.tgz - - __SODAR__/{sample}/%Y_%m_%d_{files:velocyto}/velocyto.{sample}.loom - cellranger: - - dir: {step: cellranger_count, mate: all_mates} - suffix: "cellranger_wd/{sample}/outs" - compress: tar - compress_list: [analysis, raw_feature_bc_matrix, filtered_feature_bc_matrix] - cellranger_logs: - - dir: {step: cellranger_count, mate: all_mates} - suffix: "cellranger_wd/{sample}" - compress: tar - exclude: ["{sample}/outs", "{sample}/SC_RNA_COUNTER_CS"] - velocyto: - - files: {step: velocyto_run, extension: loom, mate: all_mates} diff --git a/defaults/sc_config_ranges.yaml b/defaults/sc_config_ranges.yaml deleted file mode 100644 index ee4b67d..0000000 --- a/defaults/sc_config_ranges.yaml +++ /dev/null @@ -1,47 +0,0 @@ -#---------------------------------------- general pipeline parameters -pipeline_param: - out_path_pattern: \S+ - 
log_path_pattern: \S+ - in_path_pattern: \S+ - - cellranger_executable: \S+ - feature_ref: .* - test_config: {__opt__: [true, false]} - -#---------------------------------------- organism annotation -organism_defaults: - __opt__: [null, \S*] - -#---------------------------------------- parameters for rules -rule_options: - cellranger_count: - cmd_opt: .* - velocyto_run: - cmd_opt: .* - -#---------------------------------------- configuration for export -export: - blueprint: - __opt__: - - null - - file: \S* - command: .+ - path_pattern: [\S+] - __any__: - - __opt__: - - files: - __any_other__: - step: \S+ - extension: \S+ - log: {__opt__: [true, false]} - - dir: - __any_other__: - step: \S+ - log: {__opt__: [true, false]} - __any_other__: - compress: - __opt__: [zip, tar] - - - - diff --git a/documentation/prepare_input.md b/documentation/prepare_input.md index 26b58f6..6a1991f 100644 --- a/documentation/prepare_input.md +++ b/documentation/prepare_input.md @@ -98,6 +98,8 @@ Feel free to extend the config file and push it back to gitlab. In this way the For an explanation of how to export to SODAR see [`SODAR export`](export.md). +**Note: to use sample-specific STAR indices, add per-sample entries to `sample_info.yaml`: either `star_index: /path/to/star_index/` (careful: the index must be compatible with the STAR version used by the pipeline!) or `genome: /path/to/genome.fa` plus `gtf: /path/to/gtf` to have the index built automatically (see the example `sample_info.yaml` sketch at the end of this diff).** + --- [Back](../README.md) to main doc. diff --git a/documentation/quickstart.md b/documentation/quickstart.md index 00996d1..37e9212 100644 --- a/documentation/quickstart.md +++ b/documentation/quickstart.md @@ -73,18 +73,19 @@ * edit other options and parameters to your taste, but basically you are ready to go. - 4. Generate the file `samples.info` using the command `sea-snap.py sample_info` + 4. Generate the file `sample_info.yaml` using the command `sea-snap.py sample_info` * If you have a custom extension to your read files, then use the `--add_ext` option. * A common problem is that the `in_path_pattern` defined in the `mapping_config.yaml` file does not match the actual file paths. * After successfully running this step, make sure that the - automatically generated `samples.info` file is correct. + automatically generated `sample_info.yaml` file is correct. 5. Run the pipeline with `./sea-snap mapping l` on a local machine or - `./sea-snap mapping c` on the computing cluster. To run on SLURM nodes, - run `./sea-snap mapping --slurm c`. + `./sea-snap mapping c` on the computing cluster. + + To run using the SGE scheduler, run `./sea-snap mapping --sge c` (deprecated; check the DRMAA config). 6. So, where is the report? Where is the QC? Just take a look at `mapping/multiqc/all_samples.all_mates/out/multiqc.all_samples.all_mates.qc_report.html`, diff --git a/documentation/run_sc.md b/documentation/run_sc.md deleted file mode 100644 index 1e10dc9..0000000 --- a/documentation/run_sc.md +++ /dev/null @@ -1,60 +0,0 @@ -[Back](../README.md) to main doc. - ---- - -Run single cell analysis ------------ - -### 1) download files (from SODAR) - -First, make sure you have the following files accessible (or download them from SODAR): - -- fastq files -- ISA-tab files -- feature reference (if some samples used feature barcoding) - -Note: The ISA-tab assay file should contain the columns **Parameter Value[Library name mRNA]** and **Parameter Value[Library name sample tag]**. -`Library name sample tag` is only filled, when feature barcoding was used for a sample. -Samples with and without feature barcodes can be mixed.
- -### 2) edit the config file - -After creating a [working directory](../README.md#running-the-pipeline) - -``` -path/to/sea-snap working_dir -``` - -edit the config file: - -``` -vim sc_config.yaml -``` - -Set the **in_path_pattern** to the fastq files and the **transcriptome** as well as **gtf** files for [CellRanger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/installation) that are in the reference package. -If the experiment used feature barcoding, set the path to the [**Feature Reference CSV**](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis). - -### 3) create a sample info file - -This extracts information from the fastq file paths about the samples using the [in_path_pattern](prepare_input.md). - -The ISA-tab assay file can also be used to extract meta-information about the samples. -It should contain columns **Parameter Value[Library name mRNA]** and **Parameter Value[Library name sample tag]**. - -``` -./sea-snap sample_info --from sodar --input --config_files sc_config.yaml -``` - -### 4) run the pipeline - -``` -./sea-snap sc --slurm c -``` - -### 5) create a jupyter notebook from snippets - -(this is under development; feel free to edit and add snippets) - -``` -./sea-snap sc l create_ipynb -``` \ No newline at end of file diff --git a/external_scripts/tmod_functions.R b/external_scripts/tmod_functions.R index 3fd6f83..cb2220a 100644 --- a/external_scripts/tmod_functions.R +++ b/external_scripts/tmod_functions.R @@ -4,7 +4,8 @@ library(glue) #' Volcano plots with ggplot ggvolcano <- function(lfc, pvals, symbol=NULL, xlim=NULL, ylim=NULL, pval.thr=.05, lfc.thr=1) { - require(tidyverse) + require(dplyr) + require(purrr) require(ggplot2) p.t <- pval.thr @@ -52,7 +53,8 @@ tmod_filter_format_res <- function(res, pval.thr=.05, AUC.thr=.65) { if(is.null(res)) { return(res) } - require(tidyverse) + require(dplyr) + require(purrr) require(DT) res <- res %>% diff --git a/install_r_packages.R b/install_r_packages.R new file mode 100644 index 0000000..af7270f --- /dev/null +++ b/install_r_packages.R @@ -0,0 +1 @@ +remotes::install_github("bihealth/orthomapper") diff --git a/mapping_pipeline.snake b/mapping_pipeline.snake index 388cf0c..121a109 100644 --- a/mapping_pipeline.snake +++ b/mapping_pipeline.snake @@ -72,9 +72,8 @@ def get_inputs_all(): # mapping results mapping_choices = config["pipeline_param"]["mapping_results"] if "star-gene_counts" in mapping_choices: - inputs += pph.expand_path("star", "bam", fix="!sample") + inputs += pph.expand_path("star", "bam", fix="!sample") inputs += pph.expand_path("feature_counts", "feature_counts", fix="!sample") - inputs += pph.expand_path("tpm_calculator", "tsv", fix="!sample") if "salmon-transcript_counts" in mapping_choices: inputs += pph.expand_path("salmon", "sf", fix="!sample") if "ciri-circRNA" in mapping_choices: @@ -88,9 +87,9 @@ def get_inputs_all(): if "fastqc" in qc_choices: inputs += pph.expand_path("fastqc", "zip_names.txt") if "star-gene_counts" in mapping_choices: - if "dupradar" in qc_choices: + if "dupradar" in qc_choices: inputs += pph.expand_path("dupradar", "done", fix="!sample") - if "rna_seqc" in qc_choices: + if "rna_seqc" in qc_choices: inputs += pph.expand_path("rna_seqc", "done", fix="!sample") if "preseq_lc_extrap" in qc_choices: inputs += pph.expand_path("preseq_lc_extrap", "future_yield.txt", fix="!sample") @@ -102,7 +101,7 @@ def get_inputs_all(): inputs += pph.expand_path("qualimap_bamqc", "done", 
fix="!sample") if "infer_experiment" in qc_choices: inputs += pph.expand_path("infer_experiment", "strand_stat.txt", fix="!sample") - if "qc" in qc_choices: + if "qc" in qc_choices: inputs += pph.expand_path("qc", "done", fix="!sample") if "bw_from_bam" in qc_choices: inputs += pph.expand_path("bw_from_bam", "bw", fix="!sample") @@ -129,9 +128,9 @@ rule all: quality_control = "".join(["- {}\n".format(qc) for qc in config["pipeline_param"]["QC_results"]]) loctime = asctime(localtime(time())) rule_execution = pph.file_path("pipeline_report", "rule_execution.png", fix="all") - summary = pph.file_path("pipeline_report", "summary.csv", fix="all") - version_info = pph.file_path("pipeline_report", "version_info.txt", fix="all") - conda_info = pph.file_path("pipeline_report", "conda_info.txt", fix="all") + summary = pph.file_path("pipeline_report", "summary.csv", fix="all") + version_info = pph.file_path("pipeline_report", "version_info.txt", fix="all") + conda_info = pph.file_path("pipeline_report", "conda_info.txt", fix="all") dag = rule_execution.split("/")[-1] shell("conda list > {}".format(version_info)) shell("conda info > {}".format(conda_info)) @@ -198,9 +197,11 @@ rule fastqc: log: out = pph.file_path(step="fastqc", extension="output.log", log=True) params: - outdir = pph.out_dir_name(step="fastqc") - threads: - 2 + outdir = pph.out_dir_name(step="fastqc") + threads: 2 + resources: + mem_mb=4000, + h_rt="24:00:00" run: reads = " ".join(input) @@ -225,8 +226,8 @@ rule fastqc: rule star_index: input: - genome = config["organism"]["files"]["genome"], - gtf = config["organism"]["files"]["gtf"] + genome = lambda wildcards: config["organism"]["files"]["genome"] if wildcards.sample=="all_samples" else config["sample_info"][wildcards.sample]["genome"], + gtf = lambda wildcards: config["organism"]["files"]["gtf"] if wildcards.sample=="all_samples" else config["sample_info"][wildcards.sample]["gtf"] output: index = pph.out_dir_name(step = "star_index")+"/SA" log: @@ -234,6 +235,9 @@ rule star_index: params: options = config["rule_options"]["star_index"]["cmd_opt"] threads: 8 + resources: + mem_mb=64000, + h_rt="24:00:00" run: script = textwrap.dedent(r""" #----- prepare @@ -249,15 +253,35 @@ rule star_index: script_file = pph.log(log.out, snakemake_format(script), step="star_index", extension="sh", **wildcards) shell("bash '{script_file}' &>> '{log.out}'") + +def get_star_index (wildcards): + """get star index or compute one for all or specific samples""" + # if there's a sample-specific star index (precomputed) + if "star_index" in config["sample_info"][wildcards.sample]: + #print("using pre-computed star index for "+wildcards.sample) + return config["sample_info"][wildcards.sample]["star_index"] + "/SA" + # if there's a sample-specific genome and gtf + elif "genome" in config["sample_info"][wildcards.sample] and "gtf" in config["sample_info"][wildcards.sample]: + #print("using newly computed star index for "+wildcards.sample) + return pph.out_dir_name(step = "star_index", fix="!sample", sample = wildcards.sample) + "/SA" + # otherwise + else: + if "star_index" in config["organism"] and config["organism"]["star_index"] is not None: + #print("using pre-computed global star index for "+wildcards.sample) + return config["organism"]["star_index"] + "/SA" + else: + #print("using newly computed global star index for "+wildcards.sample) + return pph.out_dir_name(step = "star_index", fix="all") + "/SA" + rule star: input: - index = pph.out_dir_name(step = "star_index", fix="all")+"/SA", + index = 
get_star_index, reads1 = lambda wildcards: pph.get_fastq_pairs(wildcards, mate_key="paired_end_extensions", mate=0), reads2 = lambda wildcards: pph.get_fastq_pairs(wildcards, mate_key="paired_end_extensions", mate=1) #identical if unstranded output: - bam = pph.file_path(step="star", extension="bam"), + bam = pph.file_path(step="star", extension="bam"), usrt_bam = temp(pph.file_path(step="star", extension="unsorted.bam")), - gene_counts = pph.file_path(step="star", extension="gene_counts.tab") + gene_counts = pph.file_path(step="star", extension="gene_counts.tab") log: out = pph.file_path(step="star", extension="output.log", log=True) params: @@ -265,6 +289,9 @@ rule star: options = config["rule_options"]["star"]["cmd_opt"], trim = True if config["rule_options"]["star"]["trim"]=="yes" else False threads: 16 + resources: + mem_mb=48000, + h_rt="24:00:00" run: # prepare read input reads = ",".join(input.reads1) @@ -314,7 +341,7 @@ rule star: mv {params.outdir}/Aligned.out.bam {output.usrt_bam} #----- mark duplicates - picard MarkDuplicates I={output.bam} O={params.outdir}/{wildcards.sample}.mdup.bam M={params.outdir}/{wildcards.sample}.metrics.out TMP_DIR={params.outdir} + picard -Xmx48G MarkDuplicates I={output.bam} O={params.outdir}/{wildcards.sample}.mdup.bam M={params.outdir}/{wildcards.sample}.metrics.out TMP_DIR={params.outdir} mv {params.outdir}/{wildcards.sample}.mdup.bam {output.bam} #----- generate BAM index (.bai) @@ -339,6 +366,9 @@ rule bw_from_bam: pph.file_path(step="bw_from_bam", extension="bw") log: out = pph.file_path(step="bw_from_bam", extension="output.log", log=True) + resources: + mem_mb=32000, + h_rt="4:00:00" run: genome_size=config["organism"]["genome_size"] script = textwrap.dedent(r""" @@ -367,6 +397,9 @@ rule macs2: pn=pph.file_path(step="macs2", extension="peaks.narrowPeak") log: out = pph.file_path(step="macs2", extension="output.log", log=True) + resources: + mem_mb=32000, + h_rt="24:00:00" run: genome_abbr=config["organism"]["abbreviation"] script = textwrap.dedent(r""" @@ -389,35 +422,6 @@ rule macs2: - - - - -rule tpm_calculator: - input: - bam = pph.file_path(step="star", extension="bam"), - gtf = config["organism"]["files"]["gtf"] - output: - pph.file_path(step="tpm_calculator", extension="tsv") - log: - out = pph.file_path(step="tpm_calculator", extension="output.log", log=True) - params: - nsrt_bam = pph.file_path(step="tpm_calculator", extension="name_sorted.bam") - run: - script = textwrap.dedent(r""" - #----- prepare - set -eux - TPMCalculator -version || true - - #----- run TPMCalculator - TPMCalculator -g {input.gtf} -b {input.bam} - cut -f1,7 $(basename {input.bam} .bam)_genes.out > {output} - mv $(basename {input.bam} .bam)_genes.* $(dirname {output}) - """) - - script_file = pph.log(log.out, snakemake_format(script), step="tpm_calculator", extension="sh", **wildcards) - shell("bash '{script_file}' &>> '{log.out}'") - ##-------------------- bwa --------------------------------------------------------------------------------- GENOME_FILE_NAME = os.path.basename(config["organism"]["files"]["genome"]) @@ -433,6 +437,9 @@ rule bwa_index: out = pph.file_path(step="bwa_index", extension="output.log", log=True) params: options = config["rule_options"]["bwa_index"]["cmd_opt"] + resources: + mem_mb=32000, + h_rt="24:00:00" run: db_prefix = os.path.join(output.index, GENOME_FILE_NAME) @@ -468,6 +475,9 @@ rule bwa: trim = True if config["rule_options"]["bwa"]["trim"]=="yes" else False, run_stats = "true" if config["rule_options"]["bwa"]["run_stats"] else 
"false" threads: 8 + resources: + mem_mb=32000, + h_rt="24:00:00" run: db_prefix = os.path.join(params.index_dir, GENOME_FILE_NAME) @@ -530,6 +540,9 @@ rule ciri: sam = pph.file_path(step="ciri", extension="sam"), options = config["rule_options"]["ciri"]["cmd_opt"] threads: 8 + resources: + mem_mb=32000, + h_rt="24:00:00" run: samtools_threads = threads//2 @@ -560,6 +573,9 @@ rule generate_transcriptome: log: out = pph.file_path(step = "generate_transcriptome", extension="output.log", log=True) threads: 8 + resources: + mem_mb=32000, + h_rt="24:00:00" run: script = textwrap.dedent(r""" #----- prepare @@ -577,13 +593,16 @@ rule generate_transcriptome: rule salmon_index: input: transcriptome = pph.out_dir_name(step = "generate_transcriptome")+"/transcriptome.fa", - genome = config["organism"]["files"]["genome"] + genome = config["organism"]["files"]["genome"] output: index = pph.out_dir_name(step = "salmon_index")+"/pos.bin" log: out = pph.file_path(step = "salmon_index", extension="output.log", log=True) params: options = config["rule_options"]["salmon_index"]["cmd_opt"] + resources: + mem_mb=32000, + h_rt="24:00:00" run: script = textwrap.dedent(r""" #----- prepare @@ -622,6 +641,9 @@ rule salmon: options = config["rule_options"]["salmon"]["cmd_opt"], trim = True if config["rule_options"]["salmon"]["trim"]=="yes" else False threads: 8 + resources: + mem_mb=32000, + h_rt="24:00:00" run: # prepare read input and trimming unpaired = input.reads1 == input.reads2 @@ -664,11 +686,14 @@ rule feature_counts: out = pph.file_path(step="feature_counts", extension="output.log", log=True) params: nsrt_bam = pph.file_path(step="feature_counts", extension="name_sorted.bam"), - options = config["rule_options"]["feature_counts"]["cmd_opt"] + options = config["rule_options"]["feature_counts"]["cmd_opt"] threads: 2 + resources: + mem_mb=32000, + h_rt="24:00:00" run: protocol = {"unstranded":0, "forward":1, "reverse":2}[config["sample_info"][wildcards.sample]["stranded"]] - paired = "-p" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "" + paired = "-p" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "" script = textwrap.dedent(r""" #----- prepare @@ -699,9 +724,12 @@ rule dupradar: log: out = pph.file_path(step="dupradar", extension="output.log", log=True) threads: 8 + resources: + mem_mb=32000, + h_rt="24:00:00" run: protocol = {"unstranded":0, "forward":1, "reverse":2}[config["sample_info"][wildcards.sample]["stranded"]] - paired = "TRUE" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "FALSE" + paired = "TRUE" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "FALSE" script = textwrap.dedent(r""" #----- prepare @@ -733,6 +761,9 @@ rule infer_experiment: pph.file_path(step="infer_experiment", extension="strand_stat.txt") log: out = pph.file_path(step="infer_experiment", extension="output.log", log=True) + resources: + mem_mb=32000, + h_rt="24:00:00" run: script = textwrap.dedent(r""" #----- prepare @@ -754,8 +785,11 @@ rule preseq_lc_extrap: pph.file_path(step="preseq_lc_extrap", extension="future_yield.txt") log: out = pph.file_path(step="preseq_lc_extrap", extension="output.log", log=True) + resources: + mem_mb=32000, + h_rt="24:00:00" run: - paired = "-P -l 10000000" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "" + paired = "-P" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "" script = textwrap.dedent(r""" 
#----- prepare @@ -776,8 +810,11 @@ rule preseq_c_curve: pph.file_path(step="preseq_c_curve", extension="complexity.txt") log: out = pph.file_path(step="preseq_c_curve", extension="output.log", log=True) + resources: + mem_mb=32000, + h_rt="24:00:00" run: - paired = "-P -l 10000000" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "" + paired = "-P" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "" script = textwrap.dedent(r""" #----- prepare @@ -803,10 +840,11 @@ rule qualimap_rnaseq: log: out = pph.file_path(step="qualimap_rnaseq", extension="output.log", log=True) resources: - mem_mb = 24000 + mem_mb=32000, + h_rt="24:00:00" run: protocol = {"unstranded":"non-strand-specific", "forward":"strand-specific-forward", "reverse":"strand-specific-reverse"}[config["sample_info"][wildcards.sample]["stranded"]] - paired = "-pe" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "" + paired = "-pe" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "" script = textwrap.dedent(r""" #----- prepare @@ -834,7 +872,8 @@ rule qualimap_bamqc: log: out = pph.file_path(step="qualimap_bamqc", extension="output.log", log=True) resources: - mem_mb = 24000 + mem_mb=32000, + h_rt="24:00:00" run: protocol = {"unstranded":"non-strand-specific", "forward":"strand-specific-forward", "reverse":"strand-specific-reverse"}[config["sample_info"][wildcards.sample]["stranded"]] @@ -865,6 +904,9 @@ rule collapse_annotation: pph.file_path(step="collapse_annotation", extension="genes.gtf") log: out = pph.file_path(step="collapse_annotation", extension="output.log", log=True) + resources: + mem_mb=32000, + h_rt="24:00:00" run: script = textwrap.dedent(r""" #----- prepare @@ -885,11 +927,14 @@ rule rna_seqc: touch(pph.file_path(step="rna_seqc", extension="done")) log: out = pph.file_path(step="rna_seqc", extension="output.log", log=True) + resources: + mem_mb=32000, + h_rt="24:00:00" params: options = config["rule_options"]["rna_seqc"]["cmd_opt"] run: protocol = {"unstranded":"", "forward":"--stranded=FR", "reverse":"--stranded=RF"}[config["sample_info"][wildcards.sample]["stranded"]] - paired = "" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "--unpaired" + paired = "" if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else "--unpaired" script = textwrap.dedent(r""" #----- prepare @@ -918,6 +963,9 @@ rule qc: touch(pph.file_path(step="qc", extension="done")) log: out = pph.file_path(step="qc", extension="output.log", log=True) + resources: + mem_mb=4000, + h_rt="4:00:00" run: paired = '' if len(config["sample_info"][wildcards.sample]["paired_end_extensions"])>1 else '-e' @@ -942,8 +990,11 @@ rule multiqc: output: pph.file_path(step="multiqc", extension="qc_report.html") log: - out = pph.file_path(step="multiqc", extension="output.log", log=True), + out = pph.file_path(step="multiqc", extension="output.log", log=True), multiqc_config = pph.file_path(step="multiqc", extension="multiqc_conf.yaml", log=True) + resources: + mem_mb=4000, + h_rt="4:00:00" run: # compile multiqc config multiqc_dict = config["report"]["multiqc"] @@ -978,6 +1029,9 @@ rule circRNA_report: get_inputs_all() output: pph.file_path("circRNA_report", "Rmd", sample="all_samples", flowcell="all_flowcells", lane="all_lanes", mate="all_mates", library="all_libraries") + resources: + mem_mb=4000, + h_rt="4:00:00" run: rt = ReportTool(pph, profile="circRNA") report_text = 
rt.generate_report() @@ -989,7 +1043,7 @@ rule circRNA_report: pph.log_generated_files(save_to=file_table, path_pattern=path) id_suffix, _ = rt.get_id_suffix(tag, num) text_sub["file_tab"] += (f'file_tab{id_suffix} <- data.frame(read.table("{file_table}", sep="\\t", header=TRUE))\n' - f'file_tab{id_suffix}$filename <- as.character(file_tab{id_suffix}$filename)\n') + f'file_tab{id_suffix}$filename <- as.character(file_tab{id_suffix}$filename)\n') text_sub["config"] += f'config{id_suffix} <- yaml.load_file("{config_file}")\n' report_text = report_text.replace("{{WORKING_DIRECTORY}}", os.getcwd() + os.sep) diff --git a/sc_config.yaml b/sc_config.yaml deleted file mode 100644 index e73c3c3..0000000 --- a/sc_config.yaml +++ /dev/null @@ -1,31 +0,0 @@ -#--- organism annotation - -organism_defaults: ### FILL HERE ### (e.g. Homo_sapiens.yaml) - -organism: - __options__: [name, genus, taxon, files, star_index, salmon_index, R] - ### OVERWRITE ### organism defaults (e.g. gtf, genome and indices) - files: - cellranger_transcriptome: "" ### FILL HERE ### (the transcriptome to use with CellRanger) - cellranger_gtf: "" ### FILL HERE ### (the gtf to use with CellRanger) - - -#--- general pipeline parameters - -pipeline_param: - __options__: [out_path_pattern, log_path_pattern, in_path_pattern, feature_ref] - - in_path_pattern: ### FILL HERE ### (path pattern where fastq files are stored) - - feature_ref: "" ### FILL HERE ### (if feature barcoding was used) - - -#--- parameters for rules - -rule_options: - __options__: [cellranger_count, velocyto_run] - - -#--- parameters for the jupyter notebook -jupyter_notebook: - __options__: [snippet_parameters, defaults, report_snippets] \ No newline at end of file diff --git a/sc_pipeline.snake b/sc_pipeline.snake deleted file mode 100644 index e08fac7..0000000 --- a/sc_pipeline.snake +++ /dev/null @@ -1,319 +0,0 @@ -## SeA-SnaP single cell pipeline for RNA-seq analysis -## version: 0.1 -## author: J.P.Pett (patrick.pett@bihealth.de) - -import os, sys, yaml, re, textwrap, pandas as pd -import tools.pipeline_tools -from collections import OrderedDict -from time import asctime, localtime, time -from pathlib import Path -from snakemake.utils import report, format as snakemake_format, min_version -from snakemake.logging import logger -from tools.pipeline_tools import MappingPipelinePathHandler, ReportTool - -min_version("3.7") -shell.prefix("set -e pipefail;") -tools.pipeline_tools.warnings.simplefilter("always") -yaml.add_representer(OrderedDict, lambda dumper, data: dumper.represent_dict(dict(data))) - -# source files -SNAKEDIR = Path(workflow.current_basedir) -SNAKEFILE = workflow.snakefile -SCRIPTDIR = str(SNAKEDIR / "external_scripts") - -# assemble config -config_file_name = config["file_name"] if "file_name" in config else "sc_config.yaml" -configfile: str(SNAKEDIR / "defaults" / "sc_config_defaults.yaml") -configfile: config_file_name -configfile: "sample_info.yaml" -if config["organism_defaults"]: - configfile: str(SNAKEDIR / "defaults" / config["organism_defaults"]) - configfile: config_file_name - -# create path handler -conf_ranges = str(SNAKEDIR / "defaults" / "sc_config_ranges.yaml") -test_config = conf_ranges if config["pipeline_param"]["test_config"] else None -pph = MappingPipelinePathHandler(workflow, test_config) - -# link indices -pph.link_index(step="cellranger_count", fix="all", subdir="outdir", add_done=True) - -# exclude symbols '.' 
and '/' from wildcards -wildcard_constraints: - sample="[^./]+", - mate ="[^./]+" - -onstart: - # draw a dag - dag_file = pph.file_path(step="pipeline_report", extension="rule_execution.png", fix="all") - os.makedirs(os.path.dirname(dag_file), exist_ok=True) - shell("snakemake --quiet --snakefile {} --rulegraph | dot -Tpng > {}".format(SNAKEFILE, dag_file)) - # info about the pipeline run - info_file = pph.file_path(step="pipeline_report", extension="summary.csv", fix="all") - os.makedirs(os.path.dirname(info_file), exist_ok=True) - shell("snakemake --quiet --snakefile {} --summary | sed 's/\t/, /g' > {}".format(SNAKEFILE, info_file)) - # save merged config - config_file = pph.file_path(step="pipeline_report", extension="yaml", fix="all") - with open(config_file, "w") as f: yaml.dump(config, f, default_flow_style=False) - -##-------------------- starting point ---------------------------------------------------------------------- - -def get_inputs_all(): - inputs = [] - - results = config["pipeline_param"]["produce_results"] - if "velocyto" in results: - if config["pipeline_param"]["input_choice"]["velocyto"][0] == "cellranger_aggr": - inputs += [pph.file_path("velocyto_run", "loom", fix="all")] - else: - inputs += pph.expand_path(step="velocyto_run", extension="loom", fix="!sample") - elif "cellranger-aggr" in results: - inputs += [pph.file_path(step="cellranger_aggr", extension="done", fix="all")] - elif "cellranger" in results: - inputs += pph.expand_path(step="cellranger_count", extension="done", fix="!sample") - - return inputs - - -shell("rm -f {}".format(pph.file_path(step="pipeline_report", extension="report.html", fix="all"))) - -rule all: - input: - get_inputs_all() - output: - html = pph.file_path(step="pipeline_report", extension="report.html", fix="all") - run: - loctime = asctime(localtime(time())) - rule_execution = pph.file_path("pipeline_report", "rule_execution.png", fix="all") - summary = pph.file_path("pipeline_report", "summary.csv", fix="all") - version_info = pph.file_path("pipeline_report", "version_info.txt", fix="all") - conda_info = pph.file_path("pipeline_report", "conda_info.txt", fix="all") - dag = rule_execution.split("/")[-1] - shell("conda list > {}".format(version_info)) - shell("conda info > {}".format(conda_info)) - report(""" - =========================== - RNAseq single cell pipeline - =========================== - - **Finished: {loctime}** - - .. image:: {dag} - - File status at pipeline start: - ============================== - - .. csv-table:: - :file: {summary} - - Version info: - ============= - - .. include:: {version_info} - :literal: - - Conda info: - =========== - - .. 
include:: {conda_info} - :literal: - - """, output.html, graph = rule_execution, table = summary) - -rule export: - input: - get_inputs_all() - run: - pph.export() - -##-------------------- CellRanger ------------------------------------------------------------------------------- - -rule cellranger_count: - """ run cellranger count """ - input: - reads = lambda wildcards: pph.get_fastq_pairs(wildcards, mate="*"), - transcriptome = config["organism"]["files"]["cellranger_transcriptome"] - output: - outdir = directory(pph.out_dir_name(step="cellranger_count")+"/cellranger_wd"), - links = directory(pph.out_dir_name(step="cellranger_count")+"/input_links"), - done = touch(pph.file_path(step="cellranger_count", extension="done")) - log: - out = pph.file_path(step = "cellranger_count", extension="output.log", log=True) - params: - options = config["rule_options"]["cellranger_count"]["cmd_opt"], - feature_ref = config["pipeline_param"]["feature_ref"], - cellranger_exec = config["pipeline_param"]["cellranger_executable"] - run: - # using feature barcodes ? - feature_ref = str(Path(params.feature_ref).resolve()) if params.feature_ref else "" - - # create libraries file - lib_file_content = ["fastqs, sample, library_type"] - prefixes = [p for p, t in config["sample_info"][wildcards.sample]["lib_types"]] - pref_dup = len(prefixes) != len(set(prefixes)) - for lpref, ltype in config["sample_info"][wildcards.sample]["lib_types"]: - for lp in lpref.split(","): - # in some cases the same prefix is annotated in ISA tab for feature and GEX - # in this case use the trick of duplicating links to fastqs and - # adding tag_ to the prefix - if pref_dup and ltype != "Gene Expression": - lib_file_content.append(f"{Path(output.links).resolve()}, tag_{lp}, {ltype}") - else: - lib_file_content.append(f"{Path(output.links).resolve()}, {lp}, {ltype}") - pref_dup = "true" if pref_dup else "false" - lib_file_content = "\n".join(lib_file_content) - - script = textwrap.dedent(r""" - #----- prepare - set -eux - {params.cellranger_exec} --version || true - - #----- collect fastq input links - mkdir {output.links} - ln -sf $(readlink -f {input.reads}) {output.links} - # if names are the same for feature and GEX fastqs, - # duplicate links with different name: - if {pref_dup}; then - for f in {output.links}/*; do - cp -av "$f" "$(dirname $f)/tag_$(basename $f)" - done - fi - - #----- Cellranger counting - mkdir -p {output.outdir}; cd {output.outdir}; ls -lh - #-- write libraries file - cat << 'EOF' > "libraries.csv" - {lib_file_content} - EOF - #-- link feature barcodes if present - feature_ref="{feature_ref}" - if [ "$feature_ref" ]; then - ln -sf $feature_ref "feature_ref.csv" - feature_ref="--feature-ref=feature_ref.csv" - fi - #-- run cellranger count - {params.cellranger_exec} count --id="{wildcards.sample}" --libraries="libraries.csv" $feature_ref --transcriptome={input.transcriptome} {params.options} - cd - - """) - - script_file = pph.log(log.out, snakemake_format(script), step="cellranger_count", extension="sh", **wildcards) - shell("bash '{script_file}' &>> '{log.out}'") - - -rule cellranger_aggr: - input: - pph.expand_path(step="cellranger_count", extension="done", fix="mate") - output: - outdir = directory(pph.out_dir_name(step="cellranger_aggr")+"/cellranger_wd"), - done = touch(pph.file_path(step="cellranger_aggr", extension="done")) - log: - out = pph.file_path(step = "cellranger_aggr", extension="output.log", log=True) - params: - cellranger_exec = config["pipeline_param"]["cellranger_executable"] - run: - 
aggr_csv_content = ["library_id, molecule_h5"] - for count_results in input: - sample_name = pph.wildcard_values_from(count_results, False)["sample"][0] - mol_info = Path(count_results).parent / "cellranger_wd" / sample_name / "outs" / "molecule_info.h5" - aggr_csv_content.append(f"{sample_name}, {str(mol_info.resolve())}") - aggr_csv_content = "\n".join(aggr_csv_content) - - script = textwrap.dedent(r""" - #----- prepare - set -eux - {params.cellranger_exec} --version || true - - #----- Cellranger counting - mkdir -p {output.outdir}; cd {output.outdir} - #-- write libraries file - cat << 'EOF' > "output_aggr.csv" - {aggr_csv_content} - EOF - #-- run cellranger aggr - {params.cellranger_exec} aggr --id="cellranger_dir" --csv="output_aggr.csv" - cd - - """) - - script_file = pph.log(log.out, snakemake_format(script), step="cellranger_aggr", extension="sh", **wildcards) - shell("bash '{script_file}' &>> '{log.out}'") - -##-------------------- Velocyto ------------------------------------------------------------------------------- - -rule velocyto_run: - """ run velocyto on 10X Chromium samples """ - input: - gtf = config["organism"]["files"]["cellranger_gtf"], - cellranger_done = pph.choose_input( - choice_name = "velocyto", - options = [ - dict(step = "cellranger_aggr", extension = "done"), - dict(step = "cellranger_count", extension = "done") - ] - ) - output: - pph.file_path(step="velocyto_run", extension="loom") - log: - out = pph.file_path(step="velocyto_run", extension="output.log", log=True) - params: - cellranger_outdir = pph.choose_input( - choice_name = "velocyto", - func = lambda **kw: os.path.join(pph.out_dir_name(**kw),"cellranger_wd",kw["sample"],"outs"), - options = [ - dict(step = "cellranger_aggr"), - dict(step = "cellranger_count") - ] - ), - mask = config["organism"]["files"]["velocyto_mask_gtf"], - options = config["rule_options"]["velocyto_run"]["cmd_opt"] - run: - mask = f"-m {params.mask}" if params.mask else "" - - script = textwrap.dedent(r""" - #----- prepare - set -eux - velocyto --version - - #----- run velocyto - #velocyto run10x \ - #{params.cellranger_outdir} \ - #{input.gtf} \ - #{params.options} - velocyto run -b {params.cellranger_outdir}/filtered_feature_bc_matrix/barcodes.tsv.gz {mask} -o $(dirname {output}) {params.options} {params.cellranger_outdir}/possorted_genome_bam.bam {input.gtf} - - #----- move results - mv $(dirname {output})/*.loom {output} - """) - - script_file = pph.log(log.out, snakemake_format(script), step="velocyto_run", extension="sh", **wildcards) - shell("bash '{script_file}' &>> '{log.out}'") - -##-------------------- jupyter nb ------------------------------------------------------------------------------- - -rule create_ipynb: - """ compile a jupyter notebook for second line analysis """ - input: - get_inputs_all() - output: - pph.file_path("create_ipynb", "ipynb", fix="all") - run: - rt = ReportTool(pph, profile="sc_analysis") - report_text = rt.generate_report() - - text_sub = dict(file_tab="", config="") - for tag, num, path in ((tag, num, path) for tag, paths in rt.use_results.items() for num, path in enumerate(paths)): - file_table = pph.file_path("create_ipynb", "tsv", fix="all", path_pattern=path) - config_file = pph.file_path("pipeline_report", "yaml", fix="all", path_pattern=path) - pph.log_generated_files(save_to=file_table, path_pattern=path) - - id_suffix, _ = rt.get_id_suffix(tag, num) - text_sub["file_tab"] += f'file_tab{id_suffix} = pd.read_csv("{file_table}", sep="\\t")\n' - text_sub["config"] += ( - f'with 
open("{config_file}", "r") as stream:\n' - f'\tconfig{id_suffix} = yaml.safe_load(stream)\n' - ) - - report_text = report_text.replace("{{WORKING_DIRECTORY}}", os.getcwd() + os.sep) - report_text = report_text.replace("{{LOAD_FILE_TABLE}}", text_sub["file_tab"]) - report_text = report_text.replace("{{LOAD_CONFIG_FILE}}", text_sub["config"]) - - with open(output[0], "w") as f: f.write(report_text) \ No newline at end of file diff --git a/sea-snap.py b/sea-snap.py index 3114e07..8c96cff 100755 --- a/sea-snap.py +++ b/sea-snap.py @@ -14,7 +14,6 @@ CONFIGS = dict( DE="DE_config.yaml", mapping="mapping_config.yaml", - sc="sc_config.yaml", ) CLUSTER_CONFIG = dict( @@ -171,7 +170,7 @@ def rmdir(dir_name): item.unlink() dir_path.rmdir() - rmdir("cluster_log") + rmdir("logs") for p in Path(".").glob("temp_snakemake*.sh"): p.unlink() for p in Path(".").glob("run_pipeline.sh"): p.unlink() for p in Path(".").glob("core.*"): p.unlink() @@ -196,7 +195,7 @@ def run_pipeline(snakefile, args): method = args.submit with open(CLUSTER_CONFIG[method], "r") as json_file: data = json.load(json_file) - Path("cluster_log").mkdir(exist_ok=True) + #Path("cluster_log").mkdir(exist_ok=True) run_script = Path("run_pipeline.sh") s_command = "#!/bin/bash\nsnakemake --snakefile {sfile}".format(sfile = str(SCRIPT_DIR / snakefile)) if args.snake_options: s_command += " " + " ".join(args.snake_options) @@ -223,13 +222,6 @@ def run_DE_pipeline(args): run_pipeline("DE_pipeline.snake", args) -def run_sc_pipeline(args): - """ - run single cell pipeline - """ - run_pipeline("sc_pipeline.snake", args) - - ############################## DEFINE PARSER parser = argparse.ArgumentParser(description="run SeA-SnaP pipelines and helpers") @@ -240,7 +232,7 @@ def run_sc_pipeline(args): #--- parser for setup_working_directory parser_working_dir = subparsers.add_parser('working_dir', help="setup a working directory for running the pipeline") parser_working_dir.add_argument('--dirname', '-d', default="results_%Y_%m_%d/", help="name of directory") -parser_working_dir.add_argument('--configs', '-c', nargs='+', default=["mapping", "DE", "sc"], choices=["mapping", "DE", "sc"], help="configs to be imported") +parser_working_dir.add_argument('--configs', '-c', nargs='+', default=["mapping", "DE"], choices=["mapping", "DE"], help="configs to be imported") parser_working_dir.set_defaults(func=setup_working_directory) #--- parser for generate_sample_info @@ -308,7 +300,6 @@ def run_sc_pipeline(args): parser_cleanup_log = subparsers.add_parser('cleanup_log', help="delete log files from cluster execution") parser_cleanup_log.set_defaults(func=cleanup_cluster_log) - ############################## PARSE ARGUMENTS if len(sys.argv)==1: diff --git a/tools/ISAtab_parse_conf.yaml b/tools/ISAtab_parse_conf.yaml index a9d3575..f13819a 100644 --- a/tools/ISAtab_parse_conf.yaml +++ b/tools/ISAtab_parse_conf.yaml @@ -29,10 +29,3 @@ instrument: value: ".*illumina.*": "\\g<0>" -lib_types: - columns: - - "Parameter Value\\[Library name mRNA\\]" - - "Parameter Value\\[Library name sample tag\\]" - value: - ".*": {"\\g<0>": "\\col_mode: Parameter Value[Library name mRNA]:Gene Expression,Parameter Value[Library name sample tag]:Antibody Capture"} - add: true \ No newline at end of file diff --git a/tools/pipeline_tools.py b/tools/pipeline_tools.py index c03274a..b96e805 100644 --- a/tools/pipeline_tools.py +++ b/tools/pipeline_tools.py @@ -1445,7 +1445,6 @@ def update_sample_info(self, library_default="unstranded", add=False): if comb.sample not in sample_info: 
sample_info[comb.sample] = {"stranded": library_default, "read_extension": comb.read_extension} sample_info[comb.sample]["paired_end_extensions"] = [getattr(comb, "mate", "")] - sample_info[comb.sample]["lib_types"] = {comb.lib_type: None} if hasattr(comb, "lib_type") else {} else: if hasattr(comb, "mate"): paired_end_ext = getattr(comb, "mate", "") @@ -1457,14 +1456,6 @@ def update_sample_info(self, library_default="unstranded", add=False): if paired_end_ext not in paired_end_ext_lst: paired_end_ext_lst.append(paired_end_ext) paired_end_ext_lst.sort() - if hasattr(comb, "lib_type"): - lib_types = sample_info[comb.sample]["lib_types"] - if lib_types == {}: - raise ValueError( - "Error compiling sample information: sample {} has names with and without library type information".format( - comb.lib_type)) - if comb.lib_type not in lib_types: - lib_types[comb.lib_type] = None if add: # add missing fields self._add_info_fields(sample_info)
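As an illustration of the sample-specific STAR index handling introduced above (the `get_star_index()` helper in `mapping_pipeline.snake` and the note in `documentation/prepare_input.md`), per-sample entries in `sample_info.yaml` could look like the following sketch. The sample names, paths and the `stranded`/`paired_end_extensions` values are hypothetical placeholders; only the `star_index`, `genome` and `gtf` keys are what the pipeline actually checks:

```
sample_info:
  sample_A:
    stranded: reverse
    paired_end_extensions: ["_R1", "_R2"]
    # precomputed index; must be compatible with the STAR version in the pipeline
    star_index: /path/to/star_index/
  sample_B:
    stranded: reverse
    paired_end_extensions: ["_R1", "_R2"]
    # no star_index given: the star_index rule builds one from these two files
    genome: /path/to/genome.fa
    gtf: /path/to/annotation.gtf
```

With such entries, `get_star_index()` resolves `sample_A` to the precomputed index, triggers index creation for `sample_B`, and falls back to the global organism index for samples that define neither.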