diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index 174a05265..e9cce74e1 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -25,6 +25,8 @@ Changed - Remove ``rnaseqc-qc`` from RNA-seq workflows - Remove ``cut_and_run.yml`` - Rename ``workflow-cutnrun-beta`` to ``workflow-cutnrun`` +- Remove ``upload-sc-10x``, ``cellranger-count`` and ``cellranger-mkref`` + processes Fixed ----- diff --git a/resolwe_bio/processes/import_data/seq_reads_10x.py b/resolwe_bio/processes/import_data/seq_reads_10x.py deleted file mode 100644 index 52ec3a525..000000000 --- a/resolwe_bio/processes/import_data/seq_reads_10x.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Import ScRNA-Seq reads.""" - -from plumbum import TEE - -from resolwe.process import ( - Cmd, - FileField, - FileHtmlField, - ListField, - Process, - SchedulingClass, -) - - -class ImportScRNA10x(Process): - """Import 10x scRNA reads in FASTQ format.""" - - slug = "upload-sc-10x" - name = "Reads (scRNA 10x)" - process_type = "data:screads:10x:" - version = "1.4.1" - category = "Import" - scheduling_class = SchedulingClass.BATCH - entity = { - "type": "sample", - "descriptor_schema": "sample", - } - requirements = { - "expression-engine": "jinja", - "executor": { - "docker": {"image": "public.ecr.aws/genialis/resolwebio/common:4.1.1"} - }, - } - data_name = '{{ reads.0.file|default("?") }}' - - class Input: - """Input fields to process ImportScRNA10x.""" - - barcodes = ListField( - FileField( - description="Barcodes file(s) in FASTQ format. Usually the forward FASTQ files (R1).", - ), - label="Barcodes (.fastq.gz)", - ) - reads = ListField( - FileField( - description="Reads file(s) in FASTQ format. Usually the reverse FASTQ files (R2).", - ), - label="Reads (.fastq.gz)", - ) - - class Output: - """Output fields to process ImportScRNA10x.""" - - barcodes = ListField(FileField(), label="Barcodes") - reads = ListField(FileField(), label="Reads") - fastqc_url_barcodes = ListField( - FileHtmlField(), - label="Quality control with FastQC (Barcodes)", - ) - fastqc_url_reads = ListField( - FileHtmlField(), - label="Quality control with FastQC (Reads)", - ) - - def run(self, inputs, outputs): - """Run the analysis.""" - barcodes_files = [ - barcode.import_file(imported_format="compressed") - for barcode in inputs.barcodes - ] - reads_files = [ - reads_file.import_file(imported_format="compressed") - for reads_file in inputs.reads - ] - - # Check if the number of input fastqs is the same - if len(barcodes_files) != len(reads_files): - self.error("The number of reads and barcodes fastqs must be the same.") - - cmd = Cmd["fastqc"] - for fastq in barcodes_files + reads_files: - cmd = cmd["{}".format(fastq)] - cmd = cmd["--extract"] - cmd = cmd["--outdir=./"] - _, _, stderr = cmd & TEE - # FastQC writes both progress and errors to stderr and exits with code 0. - # Catch if file is empty, wrong format... (Failed to process) or - # if file path does not exist, file cannot be read (Skipping). - if "Failed to process" in stderr or "Skipping" in stderr: - self.error("Failed while processing with FastQC.") - - barcodes_fastqcs = [] - reads_fastqcs = [] - for barcodes, reads in zip(barcodes_files, reads_files): - assert barcodes.endswith(".fastq.gz") - assert reads.endswith(".fastq.gz") - barcodes_name = barcodes[:-9] - reads_name = reads[:-9] - barcodes_fastqcs.append("{}_fastqc.html".format(barcodes_name)) - reads_fastqcs.append("{}_fastqc.html".format(reads_name)) - - outputs.barcodes = barcodes_files - outputs.reads = reads_files - outputs.fastqc_url_barcodes = barcodes_fastqcs - outputs.fastqc_url_reads = reads_fastqcs diff --git a/resolwe_bio/processes/scseq/cellranger.py b/resolwe_bio/processes/scseq/cellranger.py deleted file mode 100644 index 76e98072f..000000000 --- a/resolwe_bio/processes/scseq/cellranger.py +++ /dev/null @@ -1,302 +0,0 @@ -"""Cell ranger scRNA-Seq analysis.""" - -import os -from pathlib import Path -from shutil import copy, copytree - -from plumbum import TEE - -from resolwe.process import ( - Cmd, - DataField, - DirField, - FileField, - FileHtmlField, - IntegerField, - Process, - SchedulingClass, - StringField, -) - - -class CellRangerMkref(Process): - """Reference preparation tool for 10x Genomics Cell Ranger. - - Build a Cell Ranger-compatible reference from genome FASTA and gene GTF files. - https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/references - - """ - - slug = "cellranger-mkref" - name = "Cell Ranger Mkref" - process_type = "data:genomeindex:10x" - version = "2.1.3" - category = "scRNA-seq" - scheduling_class = SchedulingClass.BATCH - requirements = { - "expression-engine": "jinja", - "executor": { - "docker": {"image": "public.ecr.aws/s4q6j6e8/resolwebio/scseq:2.1.0"} - }, - "resources": { - "memory": 32768, - "cores": 4, - }, - } - data_name = '{{ genome.output.fasta.file|default("?") }}' - - class Input: - """Input fields to process CellRangerMkref.""" - - genome = DataField( - data_type="seq:nucleotide:", - label="Reference genome", - ) - annotation = DataField( - data_type="annotation:gtf:", - label="Annotation", - ) - - class Output: - """Output fields to process CellRangerMkref.""" - - genome_index = DirField(label="Indexed genome") - build = StringField(label="Build") - species = StringField(label="Species") - source = StringField(label="Gene ID source") - - def run(self, inputs, outputs): - """Run the analysis.""" - genome_build = inputs.genome.output.build - annotation_build = inputs.annotation.output.build - if genome_build != annotation_build: - self.error( - "Builds of the genome {} and annotation {} do not match. Please provide genome " - "and annotation with the same build.".format( - genome_build, annotation_build - ) - ) - - genome_species = inputs.genome.output.species - annotation_species = inputs.annotation.output.species - if genome_species != annotation_species: - self.error( - "Species of genome {} and annotation {} do not match. Please provide genome " - "and annotation with the same species.".format( - genome_species, annotation_species - ) - ) - - cmd = Cmd["cellranger"]["mkref"] - cmd = cmd["--genome={}".format(genome_build)] - cmd = cmd["--genes={}".format(inputs.annotation.output.annot_sorted.path)] - cmd = cmd["--fasta={}".format(inputs.genome.output.fasta.path)] - cmd = cmd["--nthreads={}".format(self.requirements.resources.cores)] - cmd = cmd[ - "--memgb={}".format(int(self.requirements.resources.memory * 0.9 / 1024)) - ] - return_code, _, _ = cmd & TEE(retcode=None) - if return_code: - self.error("Error while running cellranger mkref.") - - os.rename(genome_build, "cellranger_index") - - outputs.genome_index = "cellranger_index" - outputs.source = inputs.annotation.output.source - outputs.species = genome_species - outputs.build = genome_build - - -class CellRangerCount(Process): - """Perform gene expression analysis. - - Generate single cell feature counts for a single library. - https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/count - - """ - - slug = "cellranger-count" - name = "Cell Ranger Count" - process_type = "data:scexpression:10x" - version = "1.2.2" - category = "scRNA-seq" - scheduling_class = SchedulingClass.BATCH - entity = {"type": "sample"} - requirements = { - "expression-engine": "jinja", - "executor": { - "docker": {"image": "public.ecr.aws/s4q6j6e8/resolwebio/scseq:2.1.0"} - }, - "resources": { - "memory": 32768, - "cores": 4, - }, - } - data_name = "{{ reads|name|default('?') }}" - - class Input: - """Input fields to process ImportScRNA10x.""" - - reads = DataField( - data_type="screads:10x:", - label="10x reads data object", - ) - genome_index = DataField( - data_type="genomeindex:10x:", - label="10x genome index data object", - ) - chemistry = StringField( - label="Chemistry", - required=False, - default="auto", - description=( - "Assay configuration. By default the assay configuration is detected " - "automatically, which is the recommended mode. You should only specify " - "chemistry if there is an error in automatic detection." - ), - choices=[ - ("auto", "auto"), - ("Single Cell 3'", "threeprime"), - ("Single Cell 5'", "fiveprime"), - ("Single Cell 3' v1", "SC3Pv1"), - ("Single Cell 3' v2", "SC3Pv2"), - ("Single Cell 3' v3", "SC3Pv3"), - ("Single Cell 5' paired-end", "C5P-PE"), - ("Single Cell 5' R2-only", "SC5P-R2"), - ], - ) - trim_r1 = IntegerField( - label="Trim R1", - required=False, - description=( - "Hard-trim the input R1 sequence to this length. Note that the length " - "includes the Barcode and UMI sequences so do not set this below 26 for " - "Single Cell 3' v2 or Single Cell 5'. This and \"Trim R2\" are useful for " - "determining the optimal read length for sequencing." - ), - ) - trim_r2 = IntegerField( - label="Trim R2", - required=False, - description="Hard-trim the input R2 sequence to this length.", - ) - expected_cells = IntegerField( - label="Expected number of recovered cells", - default=3000, - ) - force_cells = IntegerField( - label="Force cell number", - required=False, - description=( - "Force pipeline to use this number of cells, bypassing the cell " - "detection algorithm. Use this if the number of cells estimated by Cell " - "Ranger is not consistent with the barcode rank plot." - ), - ) - - class Output: - """Output fields to process ImportScRNA10x.""" - - matrix_filtered = FileField(label="Matrix (filtered)") - genes_filtered = FileField(label="Genes (filtered)") - barcodes_filtered = FileField(label="Barcodes (filtered)") - matrix_raw = FileField(label="Matrix (raw)") - genes_raw = FileField(label="Genes (raw)") - barcodes_raw = FileField(label="Barcodes (raw)") - report = FileHtmlField(label="Report") - build = StringField(label="Build") - species = StringField(label="Species") - source = StringField(label="Gene ID source") - - def run(self, inputs, outputs): - """Run the analysis.""" - sample_name = inputs.reads.entity_name - if sample_name.endswith(".fastq.gz"): - sample_name = sample_name[:-9] - - dir_fastqs = "./fastqs" - os.mkdir(dir_fastqs) - - # Format cellranger count fastq input so it follows the correct naming convention and - # folder structure - for i, fastqs in enumerate( - zip(inputs.reads.output.barcodes, inputs.reads.output.reads) - ): - os.symlink( - fastqs[0].path, - os.path.join( - dir_fastqs, - "{}_S1_L{}_R1_001.fastq.gz".format( - sample_name, - str(i + 1).zfill(3), - ), - ), - ) - os.symlink( - fastqs[1].path, - os.path.join( - dir_fastqs, - "{}_S1_L{}_R2_001.fastq.gz".format( - sample_name, - str(i + 1).zfill(3), - ), - ), - ) - - cmd = Cmd["cellranger"]["count"] - cmd = cmd["--id={}".format(sample_name)] - cmd = cmd["--fastqs={}".format(dir_fastqs)] - cmd = cmd[ - "--transcriptome={}".format(inputs.genome_index.output.genome_index.path) - ] - cmd = cmd["--localcores={}".format(self.requirements.resources.cores)] - cmd = cmd[ - "--localmem={}".format(int(self.requirements.resources.memory * 0.9 / 1024)) - ] - cmd = cmd["--chemistry={}".format(inputs.chemistry)] - cmd = cmd["--expect-cells={}".format(inputs.expected_cells)] - if inputs.trim_r1: - cmd = cmd["--r1-length={}".format(inputs.trim_r1)] - if inputs.trim_r2: - cmd = cmd["--r2-length={}".format(inputs.trim_r2)] - if inputs.force_cells: - cmd = cmd["--force-cells={}".format(inputs.force_cells)] - return_code, _, _ = cmd & TEE(retcode=None) - if return_code: - self.error("Error while running cellranger count.") - - output_dir = Path(f"{sample_name}/outs") - - report_file = "report_summary.html" - copy(output_dir / "web_summary.html", report_file) - - filtered_dir = Path("filtered_feature_bc_matrix") - raw_dir = Path("raw_feature_bc_matrix") - copytree(output_dir / filtered_dir, filtered_dir) - copytree(output_dir / raw_dir, raw_dir) - - outputs.matrix_filtered = str(filtered_dir / "matrix.mtx.gz") - outputs.genes_filtered = str(filtered_dir / "features.tsv.gz") - outputs.barcodes_filtered = str(filtered_dir / "barcodes.tsv.gz") - outputs.matrix_raw = str(raw_dir / "matrix.mtx.gz") - outputs.genes_raw = str(raw_dir / "features.tsv.gz") - outputs.barcodes_raw = str(raw_dir / "barcodes.tsv.gz") - outputs.report = report_file - outputs.build = inputs.genome_index.output.build - outputs.species = inputs.genome_index.output.species - outputs.source = inputs.genome_index.output.source - - # Spawn upload-bam process - bam_path = output_dir / "possorted_genome_bam.bam" - bai_path = output_dir / "possorted_genome_bam.bam.bai" - copy(bam_path, f"{sample_name}.bam") - copy(bai_path, f"{sample_name}.bam.bai") - - process_inputs = { - "src": f"{sample_name}.bam", - "src2": f"{sample_name}.bam.bai", - "reads": inputs.reads.id, - "species": inputs.genome_index.output.species, - "build": inputs.genome_index.output.build, - } - self.run_process("upload-bam-scseq-indexed", process_inputs) diff --git a/resolwe_bio/tests/files/10x_S1_L001_R1_001.fastq.gz b/resolwe_bio/tests/files/10x_S1_L001_R1_001.fastq.gz deleted file mode 100644 index 90d4a656b..000000000 Binary files a/resolwe_bio/tests/files/10x_S1_L001_R1_001.fastq.gz and /dev/null differ diff --git a/resolwe_bio/tests/files/10x_S1_L001_R2_001.fastq.gz b/resolwe_bio/tests/files/10x_S1_L001_R2_001.fastq.gz deleted file mode 100644 index 944793321..000000000 Binary files a/resolwe_bio/tests/files/10x_S1_L001_R2_001.fastq.gz and /dev/null differ diff --git a/resolwe_bio/tests/files/10x_S1_L002_R1_001.fastq.gz b/resolwe_bio/tests/files/10x_S1_L002_R1_001.fastq.gz deleted file mode 100644 index 01bc3971a..000000000 Binary files a/resolwe_bio/tests/files/10x_S1_L002_R1_001.fastq.gz and /dev/null differ diff --git a/resolwe_bio/tests/files/10x_S1_L002_R2_001.fastq.gz b/resolwe_bio/tests/files/10x_S1_L002_R2_001.fastq.gz deleted file mode 100644 index b15afc398..000000000 Binary files a/resolwe_bio/tests/files/10x_S1_L002_R2_001.fastq.gz and /dev/null differ diff --git a/resolwe_bio/tests/files/10x_scseq_matrix.mtx.gz b/resolwe_bio/tests/files/10x_scseq_matrix.mtx.gz deleted file mode 100644 index 79e12da23..000000000 Binary files a/resolwe_bio/tests/files/10x_scseq_matrix.mtx.gz and /dev/null differ diff --git a/resolwe_bio/tests/files/10x_scseq_stats.txt b/resolwe_bio/tests/files/10x_scseq_stats.txt deleted file mode 100644 index 70e5c0392..000000000 --- a/resolwe_bio/tests/files/10x_scseq_stats.txt +++ /dev/null @@ -1,16 +0,0 @@ -10578 + 0 in total (QC-passed reads + QC-failed reads) -4687 + 0 primary -5891 + 0 secondary -0 + 0 supplementary -0 + 0 duplicates -0 + 0 primary duplicates -10578 + 0 mapped (100.00% : N/A) -4687 + 0 primary mapped (100.00% : N/A) -0 + 0 paired in sequencing -0 + 0 read1 -0 + 0 read2 -0 + 0 properly paired (N/A : N/A) -0 + 0 with itself and mate mapped -0 + 0 singletons (N/A : N/A) -0 + 0 with mate mapped to a different chr -0 + 0 with mate mapped to a different chr (mapQ>=5) diff --git a/resolwe_bio/tests/files/HS chr21_short_ensembl.fasta.gz b/resolwe_bio/tests/files/HS chr21_short_ensembl.fasta.gz deleted file mode 100644 index 606cf806a..000000000 Binary files a/resolwe_bio/tests/files/HS chr21_short_ensembl.fasta.gz and /dev/null differ diff --git a/resolwe_bio/tests/files/HS chr21_short_ensembl.gtf.gz b/resolwe_bio/tests/files/HS chr21_short_ensembl.gtf.gz deleted file mode 100644 index 588576aa1..000000000 Binary files a/resolwe_bio/tests/files/HS chr21_short_ensembl.gtf.gz and /dev/null differ diff --git a/resolwe_bio/tests/processes/test_scseq.py b/resolwe_bio/tests/processes/test_scseq.py deleted file mode 100644 index 368149b2b..000000000 --- a/resolwe_bio/tests/processes/test_scseq.py +++ /dev/null @@ -1,78 +0,0 @@ -from resolwe.flow.models import Data -from resolwe.test import tag_process - -from resolwe_bio.utils.test import BioProcessTestCase - - -class ScSeqProcessorTestCase(BioProcessTestCase): - @tag_process("cellranger-mkref") - def test_cellranger_mkref(self): - with self.preparation_stage(): - annotation = self.prepare_annotation( - fn="HS chr21_short_ensembl.gtf.gz", - source="ENSEMBL", - species="Homo sapiens", - build="GRCh38.93", - ) - inputs = { - "src": "HS chr21_short_ensembl.fasta.gz", - "species": "Homo sapiens", - "build": "GRCh38.93", - } - genome = self.run_process("upload-fasta-nucl", inputs) - - inputs = {"annotation": annotation.id, "genome": genome.id} - mkref = self.run_process("cellranger-mkref", inputs) - - self.assertAlmostEqual(mkref.output["genome_index"]["size"], 1429968, delta=50) - self.assertFields(mkref, "build", "GRCh38.93") - self.assertFields(mkref, "species", "Homo sapiens") - self.assertFields(mkref, "source", "ENSEMBL") - - @tag_process("cellranger-count") - def test_cellranger_count(self): - with self.preparation_stage(): - annotation = self.prepare_annotation( - fn="HS chr21_short_ensembl.gtf.gz", - source="ENSEMBL", - species="Homo sapiens", - build="GRCh38.93", - ) - inputs = { - "src": "HS chr21_short_ensembl.fasta.gz", - "species": "Homo sapiens", - "build": "GRCh38.93", - } - genome = self.run_process("upload-fasta-nucl", inputs) - - inputs = {"annotation": annotation.id, "genome": genome.id} - genome_index = self.run_process("cellranger-mkref", inputs) - - inputs = { - "barcodes": [ - "10x_S1_L001_R1_001.fastq.gz", - "10x_S1_L002_R1_001.fastq.gz", - ], - "reads": ["10x_S1_L001_R2_001.fastq.gz", "10x_S1_L002_R2_001.fastq.gz"], - } - reads = self.run_process("upload-sc-10x", inputs) - - inputs = { - "reads": reads.id, - "genome_index": genome_index.id, - } - count = self.run_process("cellranger-count", inputs) - self.assertFile( - count, "matrix_filtered", "10x_scseq_matrix.mtx.gz", compression="gzip" - ) - self.assertFields(count, "build", "GRCh38.93") - self.assertFields(count, "species", "Homo sapiens") - self.assertFields(count, "source", "ENSEMBL") - - # Test 'upload-bam-scseq-indexed' process - bam = Data.objects.last() - self.assertFileExists(bam, "bam") - self.assertFileExists(bam, "bai") - self.assertFile(bam, "stats", "10x_scseq_stats.txt") - self.assertFields(bam, "species", "Homo sapiens") - self.assertFields(bam, "build", "GRCh38.93") diff --git a/resolwe_bio/tests/processes/test_upload.py b/resolwe_bio/tests/processes/test_upload.py index 1ddd4e2cd..701504523 100644 --- a/resolwe_bio/tests/processes/test_upload.py +++ b/resolwe_bio/tests/processes/test_upload.py @@ -939,35 +939,6 @@ def test_upload_gtf(self): self.assertFields(upload_gtf, "species", "Homo Sapiens") self.assertFields(upload_gtf, "build", "hg19") - @tag_process("upload-sc-10x") - def test_upload_sc_reads(self): - inputs = { - "barcodes": ["10x_S1_L001_R1_001.fastq.gz", "10x_S1_L002_R1_001.fastq.gz"], - "reads": ["10x_S1_L001_R2_001.fastq.gz"], - } - wrong_mates = self.run_process("upload-sc-10x", inputs, Data.STATUS_ERROR) - error_msg = ["The number of reads and barcodes fastqs must be the same."] - self.assertEqual(wrong_mates.process_error, error_msg) - - inputs = { - "barcodes": ["10x_S1_L001_R1_001.fastq.gz", "10x_S1_L002_R1_001.fastq.gz"], - "reads": ["10x_S1_L001_R2_001.fastq.gz", "10x_S1_L002_R2_001.fastq.gz"], - } - reads = self.run_process("upload-sc-10x", inputs) - - self.assertFiles( - reads, - "barcodes", - ["10x_S1_L001_R1_001.fastq.gz", "10x_S1_L002_R1_001.fastq.gz"], - compression="gzip", - ) - self.assertFiles( - reads, - "reads", - ["10x_S1_L001_R2_001.fastq.gz", "10x_S1_L002_R2_001.fastq.gz"], - compression="gzip", - ) - @tag_process("upload-bedpe") def test_upload_bedpe(self): species = "Homo sapiens"