From d8977fe8e396a9b091ec7947fe7c311628da623f Mon Sep 17 00:00:00 2001
From: Thomas Weber
Date: Mon, 4 Dec 2023 10:44:35 +0000
Subject: [PATCH] Fix merge-blacklist output and dump_config; add an assertion
 cross-checking labels.tsv, selected/ and the scNOVA input lists; read labels
 at a later stage to avoid working on a modified list of cells; other minor
 fixes

---
 .gitignore                                         |   4 +-
 .tests/config/simple_config.yaml                   |   7 +-
 afac/update_timestamps.py                          |  25 +
 config/config.yaml                                 |   3 +
 config/config_metadata.yaml                        |   6 +
 .../Dockerfile-2.2.2.dockerfile                    | 227 +++++++++
 .../Dockerfile-2.2.3.dockerfile                    | 299 ++++++++++++
 .../add_T2T_part_to_Dockerfile.sh                  |  35 ++
 watchdog_pipeline/watchdog_pipeline.py             |  13 +-
 workflow/Snakefile                                 |  26 +-
 workflow/envs/scNOVA/scNOVA_DL.yaml                |   1 +
 workflow/rules/aggregate_fct.smk                   |   2 +-
 workflow/rules/common.smk                          | 450 +++++-------------
 workflow/rules/count.smk                           |   3 +-
 workflow/rules/plots.smk                           |   7 +-
 workflow/rules/regenotyping.smk                    |   1 +
 workflow/rules/scNOVA.smk                          |  19 +
 workflow/rules/utils.smk                           |  15 +
 .../scripts/normalization/merge-blacklist.py       |   3 +-
 .../scNOVA_scripts/assert_list_of_cells.py         |  57 +++
 workflow/scripts/utils/dump_config.py              |  40 +-
 21 files changed, 857 insertions(+), 386 deletions(-)
 create mode 100644 afac/update_timestamps.py
 create mode 100644 github-actions-runner/Dockerfile-2.2.2.dockerfile
 create mode 100644 github-actions-runner/Dockerfile-2.2.3.dockerfile
 create mode 100644 github-actions-runner/add_T2T_part_to_Dockerfile.sh
 create mode 100644 workflow/scripts/scNOVA_scripts/assert_list_of_cells.py

diff --git a/.gitignore b/.gitignore
index 406d336a..a140b637 100644
--- a/.gitignore
+++ b/.gitignore
@@ -218,4 +218,6 @@ LOGS_DEV/
 
 # scTRIP multiplot
 workflow/scripts/plotting/scTRIP_multiplot/scTRIPmultiplot
-workflow/config/scTRIP_multiplot.ok
\ No newline at end of file
+workflow/config/scTRIP_multiplot.ok
+args.output
+scNOVA_env_costea.yaml

diff --git a/.tests/config/simple_config.yaml b/.tests/config/simple_config.yaml
index 48e84da5..884e952f 100644
--- a/.tests/config/simple_config.yaml
+++ b/.tests/config/simple_config.yaml
@@ -3,10 +3,10 @@
 # --------------------------------------------------------
 
 # MosaiCatcher version
-version: 2.2.2
+version: 2.2.3
 
 # Ashleys-QC pipeline version
-ashleys_pipeline_version: 2.2.2
+ashleys_pipeline_version: 2.2.3
 
 # Email for notifications about the pipeline's status
 email: ""
 
@@ -14,6 +14,9 @@ email: ""
 # List of samples to process if multiple are specified
 samples_to_process: []
 
+# Plate size
+plate_size: 96
+
 # --------------------------------------------------------
 # Data location & I/O
 # --------------------------------------------------------

diff --git a/afac/update_timestamps.py b/afac/update_timestamps.py
new file mode 100644
index 00000000..84cb551a
--- /dev/null
+++ b/afac/update_timestamps.py
@@ -0,0 +1,25 @@
+import os, sys
+import time
+from pathlib import Path
+
+
+def update_timestamps(directory):
+    """
+    Update the access and modification times of all files in the given directory and its subdirectories.
+ + :param directory: Path to the directory + """ + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith(".fastq.gz"): + continue + file_path = Path(root) / file + current_time = time.time() + print(file_path) + os.utime(file_path, (current_time, current_time)) + print(f"Updated timestamp for: {file_path}") + + +# Example usage +directory_path = sys.argv[1] +update_timestamps(directory_path) diff --git a/config/config.yaml b/config/config.yaml index 5a3b5098..3809643d 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -14,6 +14,9 @@ email: "" # List of samples to process if multiple are specified samples_to_process: [] +# Plate size +plate_size: 96 + # -------------------------------------------------------- # Data location & I/O # -------------------------------------------------------- diff --git a/config/config_metadata.yaml b/config/config_metadata.yaml index 97c1af02..bff78ba9 100644 --- a/config/config_metadata.yaml +++ b/config/config_metadata.yaml @@ -135,3 +135,9 @@ use_strandscape_labels:: required: False default: False lint_check: False +plate_size:: + desc: "Plate size used for the sequencing (96/384)" + type: int + required: True + default: 96 + lint_check: False diff --git a/github-actions-runner/Dockerfile-2.2.2.dockerfile b/github-actions-runner/Dockerfile-2.2.2.dockerfile new file mode 100644 index 00000000..06f3ea66 --- /dev/null +++ b/github-actions-runner/Dockerfile-2.2.2.dockerfile @@ -0,0 +1,227 @@ +FROM condaforge/mambaforge:latest +LABEL io.github.snakemake.containerized="true" +LABEL io.github.snakemake.conda_env_hash="77eaa388d65d5205b87324fb0adb89561bc0e532a328995990a1d580aeb894ae" + +# Step 1: Retrieve conda environments + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml +# prefix: /conda-envs/5681728a49bd83ceed09ba194330c858 +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - bwa ==0.7.17 +RUN mkdir -p /conda-envs/5681728a49bd83ceed09ba194330c858 +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml +# prefix: /conda-envs/08d4368302a4bdf7eda6b536495efe7d +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - fastqc ==0.11.9 +RUN mkdir -p /conda-envs/08d4368302a4bdf7eda6b536495efe7d +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml + +# Conda environment: +# source: https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_base.yaml +# prefix: /conda-envs/87c04f5d115eff742eca84455513deba +# name: ashleys_base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - samtools +# - tabix +# - bwa +# - sambamba +# - mosaicatcher +# # - alfred +# - ashleys-qc +# - pandas +# # PUBLISHDIR +# - rsync +# # MULTIQC +# - multiqc +# # Fix sklearn update +# - scikit-learn=1.2.2 +RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba +ADD https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml + +# Conda environment: +# source: 
https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_rtools.yaml +# prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +# name: rtools +# channels: +# - conda-forge +# - bioconda +# - r +# - anaconda +# dependencies: +# # - bioconductor-biocparallel +# # - bioconductor-bsgenome +# # - bioconductor-bsgenome.hsapiens.ucsc.hg19 +# # - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# # - bioconductor-fastseg +# # - bioconductor-genomicalignments +# - bioconductor-genomicranges +# # - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - r-assertthat +# - r-base +# # - r-biocmanager +# - r-cowplot +# - r-data.table +# # - r-devtools +# # - r-doparallel +# # - r-foreach +# - r-ggplot2 +# # - r-gtools +# - r-reshape2 +# # - r-zoo +# # - r-dplyr +# # - r-mc2d +# # - r-pheatmap +# # - bioconductor-complexheatmap +# # - r-gplots +# - r-scales +# - r-rcolorbrewer +# # - r-stringr +# - r-cairo +# - fonts-anaconda +# # NEW +# - bioconductor-edger +# - r-r.utils +# # PLATE PLOT +# - r-dplyr +# - r-platetools +# - r-viridis +# # GC_correction +# - r-tidyr +# - r-ggpubr +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +ADD https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_base.yaml +# prefix: /conda-envs/c80307395eddf442c2fb6870f40d822b +# name: mc-base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - pandas +# - intervaltree +# - scipy +# - pysam +# - tqdm +# - perl +# - pypdf2 +# - parmap +# # NEW +# - pyyaml +# - seaborn +# - matplotlib +# # SOLVE se-pe detection +# - samtools +# # ArbiGent Hufsah deps +# - pytables +# - xopen +RUN mkdir -p /conda-envs/c80307395eddf442c2fb6870f40d822b +COPY workflow/envs/mc_base.yaml /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_bioinfo_tools.yaml +# prefix: /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +# name: mc-bioinfo-tools +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bcftools +# - freebayes +# - mosaicatcher +# - samtools +# - tabix +# - whatshap +RUN mkdir -p /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +COPY workflow/envs/mc_bioinfo_tools.yaml /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml + +# Conda environment: +# source: workflow/envs/rtools.yaml +# prefix: /conda-envs/598c87b6c764d05e0c66953cc67f2931 +# name: rtools +# channels: +# - bioconda +# - conda-forge +# - r +# - anaconda +# dependencies: +# # # NEW +# - strandphaser +# # ############### +# - bioconductor-biocparallel +# - bioconductor-bsgenome +# - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# - bioconductor-complexheatmap +# # - bioconductor-fastseg +# - bioconductor-genomicalignments +# - bioconductor-genomicranges +# - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - fonts-anaconda +# - r-assertthat +# - r-base +# - r-biocmanager +# - r-cairo +# - r-cowplot +# - r-data.table +# - r-devtools +# - r-doparallel +# - r-dplyr +# - r-foreach +# - r-ggplot2 +# - r-gplots +# - r-gtools +# - r-mc2d +# - r-rcolorbrewer +# - r-reshape2 +# - r-scales +# - r-stringr +# # SV_CALLS_DEV +# # - r-zoo +# - r-r.utils +# - r-ggnewscale +# # HEATMAP +# - r-tidyr +# # ARBIGENT +# - r-reshape +# - r-optparse +# - r-tidyr +# - r-ggbeeswarm +# - r-pheatmap +# # GC_correction +# - r-ggpubr +# - bioconductor-edger +# # 
SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/598c87b6c764d05e0c66953cc67f2931 +COPY workflow/envs/rtools.yaml /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml + +# Step 2: Generate conda environments + +RUN mamba env create --prefix /conda-envs/5681728a49bd83ceed09ba194330c858 --file /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml && \ + mamba env create --prefix /conda-envs/08d4368302a4bdf7eda6b536495efe7d --file /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml && \ + mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --file /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml && \ + mamba env create --prefix /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 --file /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml && \ + mamba env create --prefix /conda-envs/c80307395eddf442c2fb6870f40d822b --file /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml && \ + mamba env create --prefix /conda-envs/f251d84cdc9f25d0e14b48e780261d66 --file /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml && \ + mamba env create --prefix /conda-envs/598c87b6c764d05e0c66953cc67f2931 --file /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml && \ + mamba clean --all -y diff --git a/github-actions-runner/Dockerfile-2.2.3.dockerfile b/github-actions-runner/Dockerfile-2.2.3.dockerfile new file mode 100644 index 00000000..aa4d1c42 --- /dev/null +++ b/github-actions-runner/Dockerfile-2.2.3.dockerfile @@ -0,0 +1,299 @@ +FROM condaforge/mambaforge:latest +LABEL io.github.snakemake.containerized="true" +LABEL io.github.snakemake.conda_env_hash="8c338e2bbe95ae23ac438e1ac650a859ed4dbb9a77747c17f62707ea2f67a667" + +# Step 1: Retrieve conda environments + +# Conda environment: +# source: ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml +# prefix: /conda-envs/87c04f5d115eff742eca84455513deba +# name: ashleys_base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - samtools +# - tabix +# - bwa +# - sambamba +# - mosaicatcher +# # - alfred +# - ashleys-qc +# - pandas +# # PUBLISHDIR +# - rsync +# # MULTIQC +# - multiqc +# # Fix sklearn update +# - scikit-learn=1.2.2 +RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba +COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml + +# Conda environment: +# source: ../ashleys-qc-pipeline/workflow/envs/ashleys_rtools.yaml +# prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +# name: rtools +# channels: +# - conda-forge +# - bioconda +# - r +# - anaconda +# dependencies: +# # - bioconductor-biocparallel +# # - bioconductor-bsgenome +# # - bioconductor-bsgenome.hsapiens.ucsc.hg19 +# # - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# # - bioconductor-fastseg +# # - bioconductor-genomicalignments +# - bioconductor-genomicranges +# # - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - r-assertthat +# - r-base +# # - r-biocmanager +# - r-cowplot +# - r-data.table +# # - r-devtools +# # - r-doparallel +# # - r-foreach +# - r-ggplot2 +# # - r-gtools +# - r-reshape2 +# # - r-zoo +# # - r-dplyr +# # - r-mc2d +# # - r-pheatmap +# # - bioconductor-complexheatmap +# # - r-gplots +# - r-scales +# - r-rcolorbrewer +# # - r-stringr +# - r-cairo +# - fonts-anaconda +# # NEW +# - bioconductor-edger +# - r-r.utils +# # PLATE PLOT +# - r-dplyr +# - r-platetools +# - r-viridis +# # GC_correction +# - r-tidyr +# - r-ggpubr +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p 
/conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml +# prefix: /conda-envs/5681728a49bd83ceed09ba194330c858 +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - bwa ==0.7.17 +RUN mkdir -p /conda-envs/5681728a49bd83ceed09ba194330c858 +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml +# prefix: /conda-envs/08d4368302a4bdf7eda6b536495efe7d +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - fastqc ==0.11.9 +RUN mkdir -p /conda-envs/08d4368302a4bdf7eda6b536495efe7d +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_base.yaml +# prefix: /conda-envs/c80307395eddf442c2fb6870f40d822b +# name: mc-base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - pandas +# - intervaltree +# - scipy +# - pysam +# - tqdm +# - perl +# - pypdf2 +# - parmap +# # NEW +# - pyyaml +# - seaborn +# - matplotlib +# # SOLVE se-pe detection +# - samtools +# # ArbiGent Hufsah deps +# - pytables +# - xopen +RUN mkdir -p /conda-envs/c80307395eddf442c2fb6870f40d822b +COPY workflow/envs/mc_base.yaml /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_bioinfo_tools.yaml +# prefix: /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +# name: mc-bioinfo-tools +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bcftools +# - freebayes +# - mosaicatcher +# - samtools +# - tabix +# - whatshap +RUN mkdir -p /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +COPY workflow/envs/mc_bioinfo_tools.yaml /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml + +# Conda environment: +# source: workflow/envs/rtools.yaml +# prefix: /conda-envs/598c87b6c764d05e0c66953cc67f2931 +# name: rtools +# channels: +# - bioconda +# - conda-forge +# - r +# - anaconda +# dependencies: +# # # NEW +# - strandphaser +# # ############### +# - bioconductor-biocparallel +# - bioconductor-bsgenome +# - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# - bioconductor-complexheatmap +# # - bioconductor-fastseg +# - bioconductor-genomicalignments +# - bioconductor-genomicranges +# - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - fonts-anaconda +# - r-assertthat +# - r-base +# - r-biocmanager +# - r-cairo +# - r-cowplot +# - r-data.table +# - r-devtools +# - r-doparallel +# - r-dplyr +# - r-foreach +# - r-ggplot2 +# - r-gplots +# - r-gtools +# - r-mc2d +# - r-rcolorbrewer +# - r-reshape2 +# - r-scales +# - r-stringr +# # SV_CALLS_DEV +# # - r-zoo +# - r-r.utils +# - r-ggnewscale +# # HEATMAP +# - r-tidyr +# # ARBIGENT +# - r-reshape +# - r-optparse +# - r-tidyr +# - r-ggbeeswarm +# - r-pheatmap +# # GC_correction +# - r-ggpubr +# - bioconductor-edger +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/598c87b6c764d05e0c66953cc67f2931 +COPY workflow/envs/rtools.yaml /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml + +# Conda environment: +# source: 
workflow/envs/scNOVA/scNOVA_DL.yaml +# prefix: /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 +# name: scNOVA_DL +# channels: +# - conda-forge +# - anaconda +# dependencies: +# - tensorflow=1.15.0 +# - scikit-learn=0.21.3 +# - python=3.7.4 +# - matplotlib=3.1.1 +# - pandas=0.25.3 +# - h5py=2.10.0 +# - numpy +# # scNOVA archive +# - unzip +# # Fix +RUN mkdir -p /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 +COPY workflow/envs/scNOVA/scNOVA_DL.yaml /conda-envs/1ede379ce8d378df7dca25b2bf4111f3/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_R.yaml +# prefix: /conda-envs/193f60d48796dd17eb847ea689b863a9 +# name: scNOVA +# channels: +# - bioconda +# - conda-forge +# - r +# dependencies: +# - bioconductor-deseq2=1.30.0 +# - r-matrixstats=0.58.0 +# - r-pheatmap=1.0.12 +# - r-gplots=3.1.1 +# - r-umap=0.2.7.0 +# - r-rtsne=0.15 +# - r-factoextra=1.0.7 +# - r-pracma=2.3.3 +# - bioconductor-chromvar=1.12.0 +# - r-nabor=0.5.0 +# - bioconductor-motifmatchr=1.12.0 +# - bioconductor-bsgenome.hsapiens.ucsc.hg38=1.4.3 +# - bioconductor-jaspar2016=1.18.0 +# - r-codetools=0.2_18 +# - r-fitdistrplus +# - r-doparallel +# - r-foreach +RUN mkdir -p /conda-envs/193f60d48796dd17eb847ea689b863a9 +COPY workflow/envs/scNOVA/scNOVA_R.yaml /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_bioinfo_tools.yaml +# prefix: /conda-envs/ca9641251a8cb0057003875ad776c49f +# name: scNOVA_bioinfo_tools +# channels: +# - conda-forge +# - bioconda +# - anaconda +# dependencies: +# - samtools +# - biobambam +# - bedtools +RUN mkdir -p /conda-envs/ca9641251a8cb0057003875ad776c49f +COPY workflow/envs/scNOVA/scNOVA_bioinfo_tools.yaml /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml + +# Step 2: Generate conda environments + +RUN mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --file /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml && \ + mamba env create --prefix /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 --file /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml && \ + mamba env create --prefix /conda-envs/5681728a49bd83ceed09ba194330c858 --file /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml && \ + mamba env create --prefix /conda-envs/08d4368302a4bdf7eda6b536495efe7d --file /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml && \ + mamba env create --prefix /conda-envs/c80307395eddf442c2fb6870f40d822b --file /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml && \ + mamba env create --prefix /conda-envs/f251d84cdc9f25d0e14b48e780261d66 --file /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml && \ + mamba env create --prefix /conda-envs/598c87b6c764d05e0c66953cc67f2931 --file /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml && \ + mamba env create --prefix /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 --file /conda-envs/1ede379ce8d378df7dca25b2bf4111f3/environment.yaml && \ + mamba env create --prefix /conda-envs/193f60d48796dd17eb847ea689b863a9 --file /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml && \ + mamba env create --prefix /conda-envs/ca9641251a8cb0057003875ad776c49f --file /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml && \ + mamba clean --all -y +# CUSTOM PART +RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/ +COPY /workflow/scripts/utils/install_R_package.R /conda-envs/ +RUN chmod -R 0777 
/conda-envs/598c87b6c764d05e0c66953cc67f2931/lib/R/library && /conda-envs/598c87b6c764d05e0c66953cc67f2931/bin/Rscript /conda-envs/install_R_package.R /workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz

diff --git a/github-actions-runner/add_T2T_part_to_Dockerfile.sh b/github-actions-runner/add_T2T_part_to_Dockerfile.sh
new file mode 100644
index 00000000..7c631edd
--- /dev/null
+++ b/github-actions-runner/add_T2T_part_to_Dockerfile.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Check if a Dockerfile path is provided
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <Dockerfile>"
+    exit 1
+fi
+
+DOCKERFILE=$1
+
+# Check if the Dockerfile exists
+if [ ! -f "$DOCKERFILE" ]; then
+    echo "Dockerfile not found: $DOCKERFILE"
+    exit 1
+fi
+
+# Extract the R environment variable
+Renv=$(grep -P "\/rtools.*environment\.yaml" "$DOCKERFILE" | sed "s/\//\t/g" | cut -f 5)
+
+# Check if Renv is extracted
+if [ -z "$Renv" ]; then
+    echo "R environment variable not found in the Dockerfile."
+    exit 1
+fi
+
+# Append custom steps to the Dockerfile
+{
+    echo ""
+    echo "# CUSTOM PART"
+    echo "RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/"
+    echo "COPY /workflow/scripts/utils/install_R_package.R /conda-envs/"
+    echo "RUN chmod -R 0777 /conda-envs/$Renv/lib/R/library && /conda-envs/$Renv/bin/Rscript /conda-envs/install_R_package.R /workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz"
+} >>"$DOCKERFILE"
+
+echo "Custom steps added to $DOCKERFILE"

diff --git a/watchdog_pipeline/watchdog_pipeline.py b/watchdog_pipeline/watchdog_pipeline.py
index c0f259f9..4c1a6614 100644
--- a/watchdog_pipeline/watchdog_pipeline.py
+++ b/watchdog_pipeline/watchdog_pipeline.py
@@ -46,7 +46,8 @@
 ]
 profile_dry_run = [
     "--profile",
-    "workflow/snakemake_profiles/local/conda_singularity/",
+    "workflow/snakemake_profiles/local/conda/",
+    # "workflow/snakemake_profiles/local/conda_singularity/",
     "-c",
     "1",
 ]
@@ -272,7 +273,7 @@ def check_unprocessed_folder(self):
         # last_message_timestamp = last_message_timestamp
 
         main_df = list()
-        if workflows_data:
+        if len(workflows_data) > 0:
             for plate in total_list_runs:
                 # print(plate)
                 if plate.split("-")[0][:2] == "20":
@@ -383,6 +384,7 @@ def check_unprocessed_folder(self):
         pd.options.display.max_rows = 999
         pd.options.display.max_colwidth = 30
         # pd.options.display.max_columns = 50
+        main_df = pd.DataFrame(main_df)
 
         # main_df.loc[(main_df["labels"] == True) & (main_df["report"] == True), "real_status"] = "Completed"
         main_df.loc[
@@ -418,7 +420,7 @@ def check_unprocessed_folder(self):
         main_df["real_status"] = main_df["real_status"].fillna(
             "Error (to investigate))"
         )
-
+        print(workflows_data["workflows"])
         print(main_df)
 
         dry_run_db = False
@@ -454,6 +456,9 @@ def check_unprocessed_folder(self):
                 e for e in workflows_data["workflows"] if e["id"] == workflow_id
             ]
 
+            print(panoptes_entry)
+            print(panoptes_data)
+
             if panoptes_data:
                 panoptes_data = panoptes_data[0]
                 if "completed_at" not in panoptes_data:
@@ -530,7 +535,7 @@ def check_unprocessed_folder(self):
             for row in main_df.loc[
                 # (main_df["multiqc_scratch"] == False)
                 (main_df["multiqc_scratch"] == False)
-                & (main_df["report"] == False)
+                # & (main_df["report"] == False)
             ].to_dict("records"):
                 logging.info(row)
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 5acf31fe..4262bd44 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -19,19 +19,29 @@ if config["ashleys_pipeline"] is True:
 
     module ashleys_qc:
         snakefile:
-        github(
-            "friendsofstrandseq/ashleys-qc-pipeline",
-            
path="workflow/Snakefile", - tag=str(config["ashleys_pipeline_version"]), - ) + "../../ashleys-qc-pipeline/workflow/Snakefile" + # github( + # "friendsofstrandseq/ashleys-qc-pipeline", + # path="workflow/Snakefile", + # tag=str(config["ashleys_pipeline_version"]), + # ) config: config use rule * from ashleys_qc as ashleys_* - localrules: - ashleys_genecore_symlink, - symlink_selected_bam, + if config["ashleys_pipeline_only"] is True: + + localrules: + ashleys_genecore_symlink, + ashleys_symlink_selected_bam, + + else: + + localrules: + ashleys_genecore_symlink, + ashleys_symlink_selected_bam, + symlink_selected_bam, else: diff --git a/workflow/envs/scNOVA/scNOVA_DL.yaml b/workflow/envs/scNOVA/scNOVA_DL.yaml index 8530fdf8..775c36d8 100644 --- a/workflow/envs/scNOVA/scNOVA_DL.yaml +++ b/workflow/envs/scNOVA/scNOVA_DL.yaml @@ -12,3 +12,4 @@ dependencies: - numpy # scNOVA archive - unzip + # Fix diff --git a/workflow/rules/aggregate_fct.smk b/workflow/rules/aggregate_fct.smk index 278d45b9..5de9c6e1 100644 --- a/workflow/rules/aggregate_fct.smk +++ b/workflow/rules/aggregate_fct.smk @@ -169,7 +169,7 @@ def aggregate_cells_scTRIP_multiplot(wildcards): cell_list = df.cell.tolist() return expand( - "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.png", + "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.pdf", folder=config["data_location"], sample=wildcards.sample, cell=cell_list, diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 4c42e1b8..7af4b6b4 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -11,6 +11,12 @@ import os, sys os.environ["LC_CTYPE"] = "C" +# print(config["data_location"]) + +if config["ashleys_pipeline"] is True and config["genecore"] is True: + config["data_location"] = "/".join(config["data_location"].split("/")[:-1]) + + envvars: "LC_CTYPE", @@ -131,6 +137,9 @@ class HandleInput: genecore=False, genecore_path=str, ): + # print(input_path) + # print(genecore_path) + # print("\n") if genecore is False: df_config_files = self.handle_input_data(thisdir=input_path, bam=bam) elif genecore is True: @@ -154,56 +163,69 @@ class HandleInput: Returns: _type_: _description_ """ - complete_df_list = list() + from pprint import pprint + from collections import Counter - # List of folders/files to not consider (restrict to samples only) - l = sorted( - [ - e - for e in os.listdir( - "{genecore_prefix}/{date_folder}".format( - genecore_prefix=config["genecore_prefix"], - date_folder=config["genecore_date_folder"], - ) - ) - if e.endswith(".txt.gz") - ] + directory_path = f"{config['genecore_prefix']}/{config['genecore_date_folder']}" + + l = sorted([e for e in os.listdir(directory_path) if e.endswith(".txt.gz")]) + + complete_df_list = list() + # print(thisdir) + genecore_prefix = config["genecore_prefix"] + date_folder = config["genecore_date_folder"] + # print(f"{genecore_prefix}/{date_folder}") + + # Pattern to extract sample name and index + pattern = re.compile(r"(.*_lane1)(.*?)(iTRU|PE20)(.*?)(\d{2})(?:_1_|_2_)") + + samples = list() + prefixes = list() + indexes = list() + plate_types = list() + d_master = collections.defaultdict( + lambda: { + "indexes": set(), + "file_prefix": "", + "plate_type": "", + "index_pattern": "", + } ) - # print(l) - # Create a list of files to process for each sample - d_master = collections.defaultdict(dict) - sub_l = list() - for j, e in enumerate(l): - sub_l.append(e) - if (j + 1) % 192 == 0: - common_element = findstem(sub_l) - l_elems = common_element.split("lane1") - # print(sub_l) - # 
print(common_element) - # print(l_elems) - # print(l_elems[1].split("{regex_element}".format(regex_element=config["genecore_regex_element"])) - prefix = l_elems[0] - # technician_name = l_elems[0].split("_")[-2] - sample = l_elems[1].split( - "{regex_element}".format( - regex_element=config["genecore_regex_element"] - ) - )[0] - index = l_elems[1].split( - "{regex_element}".format( - regex_element=config["genecore_regex_element"] + + # First pass: Count occurrences of each sample_name + file_counts_per_sample = Counter() + for file_path in l: + match = pattern.search(file_path) + if match: + sample_name = match.group(2) + file_counts_per_sample[sample_name] += 1 + + # Second pass: Process files and determine plate type per sample + for j, file_path in enumerate(sorted(l)): + match = pattern.search(file_path) + if match: + sample_name = match.group(2) + index = match.group(4) + indexes.append(index) + d_master[sample_name]["indexes"].add(index) + file_count = file_counts_per_sample[sample_name] + + # Determine plate type using modulo 96 operation + if file_count % 96 != 0: + raise ValueError( + f"Invalid file count for sample {sample_name} with file count {file_count}. Must be a multiple of 96." ) - )[1] - # pe_index = common_element[-1] - sub_l = list() - - d_master[sample]["prefix"] = prefix - # d_master[sample]["technician_name"] = technician_name - d_master[sample]["index"] = index - d_master[sample]["common_element"] = common_element - # from pprint import pprint - # pprint(d_master) - # exit() + plate_type = int(file_count / 2) + + if (j + 1) % file_count == 0: + prefixes.append(match.group(3)) + d_master[sample_name]["file_prefix"] = match.group(1) + d_master[sample_name]["index_pattern"] = match.group(3) + plate = directory_path.split("/")[-1] + samples.append(sample_name) + plate_types.append(plate_type) + d_master[sample_name]["plate_type"] = plate_type + samples_to_process = ( config["samples_to_process"] if len(config["samples_to_process"]) > 0 @@ -220,8 +242,8 @@ class HandleInput: "{data_location}/{sample}/fastq/{sample}{regex_element}{index}{cell_nb}.{pair}.fastq.gz", data_location=config["data_location"], sample=sample, - regex_element=config["genecore_regex_element"], - index=d_master[sample]["index"], + regex_element=d_master[sample]["index_pattern"], + index=d_master[sample]["indexes"], cell_nb=[str(e).zfill(2) for e in list(range(1, 97))], pair=["1", "2"], ) @@ -229,7 +251,8 @@ class HandleInput: if sample in samples_to_process ] genecore_list = [sub_e for e in genecore_list for sub_e in e] - # pprint(genecore_list) + # pprint(d_master) + complete_df_list = list() for sample in d_master: @@ -248,11 +271,12 @@ class HandleInput: df["Full_path"] = df[["Folder", "File"]].apply( lambda r: f"{r['Folder']}/{r['File']}.fastq.gz", axis=1 ) + df["Genecore_path"] = df["File"].apply( - lambda r: f"{config['genecore_prefix']}/{config['genecore_date_folder']}/{d_master[sample]['prefix']}lane1{r.replace('.', '_')}_sequence.txt.gz" + lambda r: f"{config['genecore_prefix']}/{config['genecore_date_folder']}/{d_master[sample]['file_prefix']}{r.replace('.', '_')}_sequence.txt.gz" ) df["Genecore_file"] = df["File"].apply( - lambda r: f"{d_master[sample]['prefix']}lane1{r.replace('.', '_')}" + lambda r: f"{d_master[sample]['file_prefix']}{r.replace('.', '_')}" ) df["Genecore_file"] = df["Genecore_file"].apply( lambda r: "_".join(r.split("_")[:-1]) @@ -375,12 +399,18 @@ def findstem(arr): # Create configuration file with samples +# print("config['data_location']") +# 
print(config["data_location"]) + c = HandleInput( input_path=config["data_location"], - genecore_path="{genecore_prefix}/{genecore_date_folder}".format( + genecore_path="{genecore_prefix}".format( genecore_prefix=config["genecore_prefix"], - genecore_date_folder=config["genecore_date_folder"], ), + # genecore_path="{genecore_prefix}/{genecore_date_folder}".format( + # genecore_prefix=config["genecore_prefix"], + # genecore_date_folder=config["genecore_date_folder"], + # ), output_path="{data_location}/config/config_df.tsv".format( data_location=config["data_location"] ), @@ -532,8 +562,12 @@ def onsuccess_fct(log): log, "SUCCESS", config, config_metadata ) shell( - 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - SUCCESS" {} < {}'.format( - config["version"], config["data_location"], config["email"], log_path_new + 'mail -s "[smk-wf-catalog/mosaicatcher-pipeline] v{} - [{}--{}] - SUCCESS" {} < {}'.format( + config["version"], + config["data_location"].split("/")[-1], + ";".join(samples), + config["email"], + log_path_new, ) ) @@ -546,8 +580,12 @@ def onerror_fct(log): log, "ERROR", config, config_metadata ) shell( - 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - ERRROR" {} < {}'.format( - config["version"], config["data_location"], config["email"], log_path_new + 'mail -s "[smk-wf-catalog/mosaicatcher-pipeline] v{} - [{}--{}] - ERROR" {} < {}'.format( + config["version"], + config["data_location"].split("/")[-1], + ";".join(samples), + config["email"], + log_path_new, ) ) @@ -564,308 +602,26 @@ def get_scnova_final_output(wildcards): # abbreviate_names = False l = [ - # expand( - # "{folder}/{sample}/scNOVA_input_user/{clone}_sv_calls_all_print.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # clone=clones[wildcards.sample], - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{clone}_orientation_CN_correct0.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_input_user/sv_calls_all_print_CREs.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_sort_geneid.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_chr_length_{sample}.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_chr_length_{sample}_sc.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # 
"{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort_lab.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort_lab_final.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{sample}_{clone}_orientation_norm_qc.pdf", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{clone}_orientation_norm.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort_lab.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort_lab_final.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{sample}_{clone}_Resid_orientation_qc.pdf", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{clone}_Resid_orientation.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_all_orientation_norm_var_GC_CpG_RT_T_comb3_{clone}.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Expression_all_{clone}.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_all_TSS_matrix_woM_all_RT_{clone}.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train80_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train40_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train20_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train5_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train80_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # 
"{folder}/{sample}/scNOVA_result_CNN/DNN_train40_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train20_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train5_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_plots/Result_scNOVA_plots_{sample}.pdf", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/result_PLSDA_{sample}.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result_plots/Result_scNOVA_plots_{sample}_alternative_PLSDA.pdf", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort_num.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort_num_sort_for_chromVAR.txt", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W1.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W2.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C1.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C2.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W.bam.bai", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C.bam.bai", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleA/result.H1.bam", - # 
folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleB/result.H2.bam", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_input_user/strandphaser_output_copy.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_haplo/Deeptool_DHS_2kb_H1H2.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result_haplo/Deeptool_DHS_2kb_H1H2_sort.txt", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_result_haplo/Deeptool_Genebody_H1H2.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result_haplo/Deeptool_Genebody_H1H2_sort.txt", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), ] l = [sub_e for e in l for sub_e in e] return l @@ -1096,16 +852,26 @@ def get_all_plots(wildcards): ), ) - # Run summary section + # Config section l_outputs.extend( expand( - "{folder}/{sample}/config/run_summary.txt", + "{folder}/{sample}/config/config.yaml", folder=config["data_location"], sample=wildcards.sample, ), ) + # Run summary section + + # l_outputs.extend( + # expand( + # "{folder}/{sample}/config/run_summary.txt", + # folder=config["data_location"], + # sample=wildcards.sample, + # ), + # ) + # from pprint import pprint # pprint(l_outputs) return l_outputs diff --git a/workflow/rules/count.smk b/workflow/rules/count.smk index 080d64b7..f1a0e74e 100755 --- a/workflow/rules/count.smk +++ b/workflow/rules/count.smk @@ -136,6 +136,7 @@ rule symlink_selected_bam: rule remove_unselected_bam: input: + labels="{folder}/{sample}/cell_selection/labels.tsv", bam=unselected_input_bam, bai=unselected_input_bai, output: @@ -196,7 +197,7 @@ if ( "../envs/mc_base.yaml" shell: """ - workflow/scripts/normalization/merge-blacklist.py --merge_distance 500000 {input.norm} --whitelist {input.whitelist} --min_whitelist_interval_size {params.window} > {output.merged} 2>> {log} + workflow/scripts/normalization/merge-blacklist.py --merge_distance 500000 {input.norm} --whitelist {input.whitelist} --min_whitelist_interval_size {params.window} --output {output.merged} """ else: diff --git a/workflow/rules/plots.smk b/workflow/rules/plots.smk index 1acc1e55..221c8610 100644 --- a/workflow/rules/plots.smk +++ b/workflow/rules/plots.smk @@ -17,7 +17,7 @@ if config["ashleys_pipeline"] is False: # "{folder}/{sample}/plots/counts/CountComplete.raw.pdf", report( "{folder}/{sample}/plots/counts/CountComplete.raw.pdf", - category="Mosaic Counts", + category="Mosaic counts", subcategory="{sample}", labels={"Cell": "ALL", "Type": "raw"}, ), @@ -40,7 +40,7 @@ rule divide_pdf: report( "{folder}/{sample}/plots/counts_raw/{cell}.{i, \d+}.pdf", caption="../report/mosaic_counts.rst", - category="Mosaic counts", + category="Mosaic counts cellwise", subcategory="{sample}", labels={"Cell": "{cell}", "Nb": "{i}", "Type": "raw"}, ), @@ -306,7 +306,7 @@ rule scTRIP_multiplot: sv_counts="{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv", output: figure=report( - "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.png", + 
"{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.pdf", category="scTRIP multiplot", subcategory="{sample}", labels={"Cell": "{cell}", "Chrom": "{chrom}"}, @@ -315,6 +315,7 @@ rule scTRIP_multiplot: "{folder}/log/scTRIP_multiplot/{sample}/{cell}/{chrom}.log", conda: "../envs/rtools.yaml" + container: None resources: mem_mb=get_mem_mb, shell: diff --git a/workflow/rules/regenotyping.smk b/workflow/rules/regenotyping.smk index ebb451df..2bfae7b0 100644 --- a/workflow/rules/regenotyping.smk +++ b/workflow/rules/regenotyping.smk @@ -6,6 +6,7 @@ rule mergeBams: check=remove_unselected_fct, bam=selected_input_bam, bai=selected_input_bai, + labels="{folder}/{sample}/cell_selection/labels.tsv", output: temp("{folder}/{sample}/merged_bam/merged.raw.bam"), log: diff --git a/workflow/rules/scNOVA.smk b/workflow/rules/scNOVA.smk index 04c108d2..9f6c7c5b 100755 --- a/workflow/rules/scNOVA.smk +++ b/workflow/rules/scNOVA.smk @@ -1,8 +1,24 @@ +rule assert_list_of_cells: + input: + labels="{folder}/{sample}/cell_selection/labels.tsv", + subclone_list="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", + selected_cells="{folder}/{sample}/selected/", + output: + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", + log: + "{folder}/{sample}/log/assert_list_of_cells.log", + conda: + "../envs/mc_base.yaml" + script: + "../scripts/scNOVA_scripts/assert_list_of_cells.py" + + rule filter_sv_calls: log: "{folder}/{sample}/log/filter_sv_calls/{sample}.log", input: "{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv", + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: "{folder}/{sample}/scNOVA_input_user/sv_calls.tsv", conda: @@ -147,6 +163,7 @@ rule remove_dup: None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark.bam", + assert_list_of_cells="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: bam_uniq="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", bam_metrix="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono.metrix_dup.txt", @@ -272,6 +289,7 @@ rule filter_input_subclonality: None input: "{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: "{folder}/{sample}/scNOVA_input_user/input_subclonality_{clone}.txt", conda: @@ -973,6 +991,7 @@ rule split_bam_WC: None input: "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: bam_header="{folder}/{sample}/scNOVA_bam_modified/{cell}.header_WC.sam", bam_C1="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C1.bam", diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index 39353b67..4eaf8464 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -139,3 +139,18 @@ rule samtools_faindex: mem_mb=get_mem_mb_heavy, shell: "samtools faidx {input}" + + +rule save_config: + input: + "config/config.yaml", + output: + "{folder}/{sample}/config/config.yaml", + log: + "{folder}/log/save_config/{sample}.log", + conda: + "../envs/mc_base.yaml" + resources: + mem_mb=get_mem_mb, + script: + "../scripts/utils/dump_config.py" diff --git a/workflow/scripts/normalization/merge-blacklist.py b/workflow/scripts/normalization/merge-blacklist.py index 9a484eec..750d3966 100755 --- a/workflow/scripts/normalization/merge-blacklist.py +++ b/workflow/scripts/normalization/merge-blacklist.py @@ 
-16,6 +16,7 @@ def main(): type=int, help="If the distance between two blacklisted intervals is below this threshold, they are merged.", ) + parser.add_argument("--output", default=None, help="Output file name") parser.add_argument( "--whitelist", default=None, help="TSV file with intervals to be removed from the blacklist (columns: chrom, start, end)." ) @@ -71,7 +72,7 @@ def main(): print("White listing: Removed", additional_whitelist, "bp of sequence for blacklist", file=sys.stderr) - norm_table.to_csv(sys.stdout, index=False, sep="\t") + norm_table.to_csv(args.output, index=False, sep="\t") ## Identify "complex" intervals # segments = calls.groupby(by=['chrom','start','end']).sv_call_name.agg({'is_complex':partial(is_complex, ignore_haplotypes=args.ignore_haplotypes, min_cell_count=args.min_cell_count)}).reset_index().sort_values(['chrom','start','end']) diff --git a/workflow/scripts/scNOVA_scripts/assert_list_of_cells.py b/workflow/scripts/scNOVA_scripts/assert_list_of_cells.py new file mode 100644 index 00000000..651fb7c6 --- /dev/null +++ b/workflow/scripts/scNOVA_scripts/assert_list_of_cells.py @@ -0,0 +1,57 @@ +import pandas as pd +import os + + +def main(labels_file, subclone_file, selected_folder, output_file): + # Read labels.tsv + labels_df = pd.read_csv(labels_file, sep="\t") + labels_cells = set( + labels_df["cell"].str.replace(".sort.mdup.bam", "").values.tolist() + ) + + # Read input_subclonality.txt + input_subclonality = pd.read_csv(subclone_file, sep="\t") + subclone_cells = set(input_subclonality["Filename"].values.tolist()) + + # List files in selected/ folder and process filenames + selected_cells = set( + file.replace(".sort.mdup.bam", "") + for file in os.listdir(selected_folder) + if file.endswith(".sort.mdup.bam") + ) + + # Compare sets + if labels_cells == subclone_cells == selected_cells: + result = "PASS: All cell lists match." + else: + result = "FAIL: Cell lists do not match." 
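+        # The set differences written to the output below pinpoint which cells
+        # diverge between labels.tsv, input_subclonality.txt and selected/.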
+ + # Logging details of the mismatch + with open(output_file, "w") as output: + output.write("Labels cells: {}\n".format(labels_cells)) + output.write("Subclone cells: {}\n".format(subclone_cells)) + output.write("Selected cells: {}\n".format(selected_cells)) + output.write("Discrepancy details:\n") + output.write( + "In labels but not in subclone: {}\n".format(labels_cells - subclone_cells) + ) + output.write( + "In subclone but not in labels: {}\n".format(subclone_cells - labels_cells) + ) + output.write( + "In labels but not in selected: {}\n".format(labels_cells - selected_cells) + ) + output.write( + "In selected but not in labels: {}\n".format(selected_cells - labels_cells) + ) + output.write(result) + + +if __name__ == "__main__": + # Extracting Snakemake input variables + labels_file = snakemake.input.labels + subclone_file = snakemake.input.subclone_list + selected_folder = snakemake.input.selected_cells + output_file = snakemake.output[0] + + main(labels_file, subclone_file, selected_folder, output_file) diff --git a/workflow/scripts/utils/dump_config.py b/workflow/scripts/utils/dump_config.py index 4701706a..6b299ee6 100644 --- a/workflow/scripts/utils/dump_config.py +++ b/workflow/scripts/utils/dump_config.py @@ -1,28 +1,22 @@ -import json -import time +import yaml -timestamp = time.strftime("%Y%m%d-%H%M%S") -configured_samples = [] -for key in config.keys(): - if not key.startswith("sample_description"): - continue - sample = key.split("_", 2)[-1] - configured_samples.append(sample) +def update_config(input_file, output_file): + # Load the existing config file + with open(input_file, "r") as file: + flat_file_config = yaml.safe_load(file) -if configured_samples: - second_dump = "config_{}_{}.json".format(timestamp, "_".join(sorted(configured_samples))) -else: - second_dump = "config_{}.json".format(timestamp) + # Update the config with Snakemake parameters + for key, value in snakemake.config.items(): + flat_file_config[key] = value -with open(output[0], "w") as fake: - _ = fake.write(second_dump + "\n(Full configuration dump)") + # Save the updated config to the output file + with open(output_file, "w") as file: + yaml.dump(flat_file_config, file) -with open(second_dump, "w") as dump: - json.dump( - config, - dump, - ensure_ascii=True, - indent=2, - sort_keys=True, - ) + +if __name__ == "__main__": + input_config = snakemake.input[0] + output_config = snakemake.output[0] + + update_config(input_config, output_config)
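
Note on the new cross-check: assert_list_of_cells.py boils down to set algebra
over three cell lists. A minimal standalone sketch of that comparison, using
hypothetical cell names and run outside Snakemake:

# Standalone sketch of the assert_list_of_cells comparison.
# The three sets stand in for labels.tsv, input_subclonality.txt and the
# selected/ folder; the cell names are hypothetical.
labels_cells = {"BM510_PE20401", "BM510_PE20402", "BM510_PE20403"}
subclone_cells = {"BM510_PE20401", "BM510_PE20402", "BM510_PE20403"}
selected_cells = {"BM510_PE20401", "BM510_PE20402"}

if labels_cells == subclone_cells == selected_cells:
    print("PASS: All cell lists match.")
else:
    print("FAIL: Cell lists do not match.")
    # Set differences identify the diverging cells, mirroring the script's log
    print("In labels but not in selected:", labels_cells - selected_cells)
    print("In selected but not in labels:", selected_cells - labels_cells)

With these inputs the sketch prints FAIL and reports BM510_PE20403 as present
in labels.tsv but missing from selected/, which is exactly the discrepancy the
new rule is meant to surface before scNOVA starts.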