From d1097d8c3edf76f752cdf90bb71c1d1255ce54c6 Mon Sep 17 00:00:00 2001 From: Janez Kokosar Date: Fri, 20 Sep 2024 12:48:44 +0200 Subject: [PATCH] Optimize resource usage in BBDuk and read import processes --- docs/CHANGELOG.rst | 3 ++ .../processes/import_data/seq_reads.py | 49 ++++++++++++++----- .../processes/reads_processing/bbduk.py | 31 +++++++++--- 3 files changed, 64 insertions(+), 19 deletions(-) diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index b51b1acfd..03fc74975 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -25,6 +25,9 @@ Changed to skip downstream/upstream and intergenic variants - Change clinical diagnosis and annotation fields type to text - When filtering variants do not return duplicated objects +- Optimize resource usage in processes ``bbduk-single``, ``bbduk-paired``, + ``upload-fastq-single``, ``upload-fastq-paired``, + ``files-to-fastq-single`` and ``files-to-fastq-paired`` Fixed ----- diff --git a/resolwe_bio/processes/import_data/seq_reads.py b/resolwe_bio/processes/import_data/seq_reads.py index 3fdd1dd02..9cdf961e2 100644 --- a/resolwe_bio/processes/import_data/seq_reads.py +++ b/resolwe_bio/processes/import_data/seq_reads.py @@ -129,7 +129,7 @@ def validate_fastq(fq, fq2=None): return message -def run_fastqc(fastqs, output_dir): +def run_fastqc(fastqs, output_dir, cores): """Run fastQC on given FASTQs. :param list fastqs: List of fastqs @@ -144,6 +144,7 @@ def run_fastqc(fastqs, output_dir): cmd = cmd[fastq] cmd = cmd["--extract"] cmd = cmd[f"--outdir={str(output_path)}"] + cmd = cmd["-t", cores] _, _, stderr = cmd & TEE return stderr @@ -182,7 +183,7 @@ class UploadFastqSingle(Process): slug = "upload-fastq-single" name = "FASTQ file (single-end)" process_type = "data:reads:fastq:single" - version = "2.6.0" + version = "2.7.0" category = "Import" data_name = '{{ src.0.file|default("?") }}' scheduling_class = SchedulingClass.BATCH @@ -196,7 +197,8 @@ class UploadFastqSingle(Process): "docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"} }, "resources": { - "cores": 1, + "cores": 2, + "memory": 2048, "network": True, }, } @@ -251,7 +253,11 @@ def run(self, inputs, outputs): shutil.copyfileobj(infile, outfile) fastqgz = [fastqz] - stderr = run_fastqc([fastqgz], "./fastqc") + stderr = run_fastqc( + fastqs=[fastqgz], + output_dir="./fastqc", + cores=self.requirements.resources.cores, + ) if "Failed to process" in stderr or "Skipping" in stderr: self.error("Failed while processing with FastQC.") @@ -316,7 +322,7 @@ class UploadFastqPaired(Process): slug = "upload-fastq-paired" name = "FASTQ file (paired-end)" process_type = "data:reads:fastq:paired" - version = "2.6.0" + version = "2.7.0" category = "Import" data_name = '{{ src1.0.file|default("?") }}' scheduling_class = SchedulingClass.BATCH @@ -330,7 +336,8 @@ class UploadFastqPaired(Process): "docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"} }, "resources": { - "cores": 1, + "cores": 2, + "memory": 2048, "network": True, }, } @@ -468,7 +475,11 @@ def run(self, inputs, outputs): } ] - stderr = run_fastqc(mate1_fastqgz + mate2_fastqgz, "./fastqc") + stderr = run_fastqc( + fastqs=mate1_fastqgz + mate2_fastqgz, + output_dir="./fastqc", + cores=self.requirements.resources.cores, + ) if "Failed to process" in stderr or "Skipping" in stderr: self.error("Failed while processing with FastQC.") @@ -517,7 +528,7 @@ class FilesToFastqSingle(Process): slug = "files-to-fastq-single" name = "Convert files to reads (single-end)" process_type = "data:reads:fastq:single" - version = "1.6.0" + version = "1.7.0" category = "Import" data_name = "Files to FASTQ single-end ({{ (src|first).file.file }})" scheduling_class = SchedulingClass.BATCH @@ -530,6 +541,10 @@ class FilesToFastqSingle(Process): "executor": { "docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"} }, + "resources": { + "cores": 2, + "memory": 2048, + }, } class Input: @@ -588,7 +603,11 @@ def run(self, inputs, outputs): shutil.copyfileobj(infile, outfile) fastqgz = [fastqz] - stderr = run_fastqc([fastqgz], "./fastqc") + stderr = run_fastqc( + fastqs=[fastqgz], + output_dir="./fastqc", + cores=self.requirements.resources.cores, + ) if "Failed to process" in stderr or "Skipping" in stderr: self.error("Failed while processing with FastQC.") @@ -649,7 +668,7 @@ class FilesToFastqPaired(Process): slug = "files-to-fastq-paired" name = "Convert files to reads (paired-end)" process_type = "data:reads:fastq:paired" - version = "1.6.0" + version = "1.7.0" category = "Import" data_name = "Files to FASTQ paired-end ({{ (src1|first).file.file }}, {{(src2|first).file.file}})" scheduling_class = SchedulingClass.BATCH @@ -662,6 +681,10 @@ class FilesToFastqPaired(Process): "executor": { "docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"} }, + "resources": { + "cores": 2, + "memory": 2048, + }, } class Input: @@ -807,7 +830,11 @@ def run(self, inputs, outputs): } ] - stderr = run_fastqc(mate1_fastqgz + mate2_fastqgz, "./fastqc") + stderr = run_fastqc( + fastqs=mate1_fastqgz + mate2_fastqgz, + output_dir="./fastqc", + cores=self.requirements.resources.cores, + ) if "Failed to process" in stderr or "Skipping" in stderr: self.error("Failed while processing with FastQC.") diff --git a/resolwe_bio/processes/reads_processing/bbduk.py b/resolwe_bio/processes/reads_processing/bbduk.py index 6176cad5a..30de96144 100644 --- a/resolwe_bio/processes/reads_processing/bbduk.py +++ b/resolwe_bio/processes/reads_processing/bbduk.py @@ -150,7 +150,7 @@ class BBDukSingle(Process): slug = "bbduk-single" name = "BBDuk (single-end)" process_type = "data:reads:fastq:single:bbduk" - version = "3.1.2" + version = "3.2.0" category = "FASTQ processing" data_name = "{{ reads|name|default('?') }}" scheduling_class = SchedulingClass.BATCH @@ -164,8 +164,8 @@ class BBDukSingle(Process): "docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"} }, "resources": { - "cores": 4, - "memory": 8192, + "cores": 8, + "memory": 4096, }, } @@ -695,7 +695,12 @@ def run(self, inputs, outputs): rename_preprocessed_files(input_files=input_reads) fastqgz = [fastq["original_name"] for fastq in input_reads] - fastqc_inputs = fastqgz + ["--extract", f"--outdir={str(output_path)}"] + fastqc_inputs = fastqgz + [ + "--extract", + f"--outdir={str(output_path)}", + "-t", + min(self.requirements.resources.cores, num_of_lanes), + ] if inputs.fastqc.nogroup: fastqc_inputs.append("--no-group") @@ -758,8 +763,8 @@ class BBDukPaired(Process): "docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"} }, "resources": { - "cores": 4, - "memory": 8192, + "cores": 8, + "memory": 4096, }, } @@ -1329,8 +1334,18 @@ def run(self, inputs, outputs): fastqgz = [fastq["original_name"] for fastq in input_reads] fastqgz2 = [fastq["original_name2"] for fastq in input_reads] - fastqc_inputs = fastqgz + ["--extract", f"--outdir={str(output_path)}"] - fastqc2_inputs = fastqgz2 + ["--extract", f"--outdir={str(output_path)}"] + fastqc_inputs = fastqgz + [ + "--extract", + f"--outdir={str(output_path)}", + "-t", + min(self.requirements.resources.cores, num_of_lanes), + ] + fastqc2_inputs = fastqgz2 + [ + "--extract", + f"--outdir={str(output_path)}", + "-t", + min(self.requirements.resources.cores, num_of_lanes), + ] if inputs.fastqc.nogroup: fastqc_inputs.append("--no-group")