Skip to content

Commit

Permalink
Optimize resource usage in BBDuk and read
Browse files Browse the repository at this point in the history
import processes
  • Loading branch information
jkokosar committed Sep 30, 2024
1 parent 943428d commit d1097d8
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 19 deletions.
3 changes: 3 additions & 0 deletions docs/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ Changed
to skip downstream/upstream and intergenic variants
- Change clinical diagnosis and annotation fields type to text
- When filtering variants do not return duplicated objects
- Optimize resource usage in processes ``bbduk-single``, ``bbduk-paired``,
``upload-fastq-single``, ``upload-fastq-paired``,
``files-to-fastq-single`` and ``files-to-fastq-paired``

Fixed
-----
Expand Down
49 changes: 38 additions & 11 deletions resolwe_bio/processes/import_data/seq_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def validate_fastq(fq, fq2=None):
return message


def run_fastqc(fastqs, output_dir):
def run_fastqc(fastqs, output_dir, cores):
"""Run fastQC on given FASTQs.
:param list fastqs: List of fastqs
Expand All @@ -144,6 +144,7 @@ def run_fastqc(fastqs, output_dir):
cmd = cmd[fastq]
cmd = cmd["--extract"]
cmd = cmd[f"--outdir={str(output_path)}"]
cmd = cmd["-t", cores]
_, _, stderr = cmd & TEE

return stderr
Expand Down Expand Up @@ -182,7 +183,7 @@ class UploadFastqSingle(Process):
slug = "upload-fastq-single"
name = "FASTQ file (single-end)"
process_type = "data:reads:fastq:single"
version = "2.6.0"
version = "2.7.0"
category = "Import"
data_name = '{{ src.0.file|default("?") }}'
scheduling_class = SchedulingClass.BATCH
Expand All @@ -196,7 +197,8 @@ class UploadFastqSingle(Process):
"docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"}
},
"resources": {
"cores": 1,
"cores": 2,
"memory": 2048,
"network": True,
},
}
Expand Down Expand Up @@ -251,7 +253,11 @@ def run(self, inputs, outputs):
shutil.copyfileobj(infile, outfile)
fastqgz = [fastqz]

stderr = run_fastqc([fastqgz], "./fastqc")
stderr = run_fastqc(
fastqs=[fastqgz],
output_dir="./fastqc",
cores=self.requirements.resources.cores,
)
if "Failed to process" in stderr or "Skipping" in stderr:
self.error("Failed while processing with FastQC.")

Expand Down Expand Up @@ -316,7 +322,7 @@ class UploadFastqPaired(Process):
slug = "upload-fastq-paired"
name = "FASTQ file (paired-end)"
process_type = "data:reads:fastq:paired"
version = "2.6.0"
version = "2.7.0"
category = "Import"
data_name = '{{ src1.0.file|default("?") }}'
scheduling_class = SchedulingClass.BATCH
Expand All @@ -330,7 +336,8 @@ class UploadFastqPaired(Process):
"docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"}
},
"resources": {
"cores": 1,
"cores": 2,
"memory": 2048,
"network": True,
},
}
Expand Down Expand Up @@ -468,7 +475,11 @@ def run(self, inputs, outputs):
}
]

stderr = run_fastqc(mate1_fastqgz + mate2_fastqgz, "./fastqc")
stderr = run_fastqc(
fastqs=mate1_fastqgz + mate2_fastqgz,
output_dir="./fastqc",
cores=self.requirements.resources.cores,
)
if "Failed to process" in stderr or "Skipping" in stderr:
self.error("Failed while processing with FastQC.")

Expand Down Expand Up @@ -517,7 +528,7 @@ class FilesToFastqSingle(Process):
slug = "files-to-fastq-single"
name = "Convert files to reads (single-end)"
process_type = "data:reads:fastq:single"
version = "1.6.0"
version = "1.7.0"
category = "Import"
data_name = "Files to FASTQ single-end ({{ (src|first).file.file }})"
scheduling_class = SchedulingClass.BATCH
Expand All @@ -530,6 +541,10 @@ class FilesToFastqSingle(Process):
"executor": {
"docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"}
},
"resources": {
"cores": 2,
"memory": 2048,
},
}

class Input:
Expand Down Expand Up @@ -588,7 +603,11 @@ def run(self, inputs, outputs):
shutil.copyfileobj(infile, outfile)
fastqgz = [fastqz]

stderr = run_fastqc([fastqgz], "./fastqc")
stderr = run_fastqc(
fastqs=[fastqgz],
output_dir="./fastqc",
cores=self.requirements.resources.cores,
)
if "Failed to process" in stderr or "Skipping" in stderr:
self.error("Failed while processing with FastQC.")

Expand Down Expand Up @@ -649,7 +668,7 @@ class FilesToFastqPaired(Process):
slug = "files-to-fastq-paired"
name = "Convert files to reads (paired-end)"
process_type = "data:reads:fastq:paired"
version = "1.6.0"
version = "1.7.0"
category = "Import"
data_name = "Files to FASTQ paired-end ({{ (src1|first).file.file }}, {{(src2|first).file.file}})"
scheduling_class = SchedulingClass.BATCH
Expand All @@ -662,6 +681,10 @@ class FilesToFastqPaired(Process):
"executor": {
"docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"}
},
"resources": {
"cores": 2,
"memory": 2048,
},
}

class Input:
Expand Down Expand Up @@ -807,7 +830,11 @@ def run(self, inputs, outputs):
}
]

stderr = run_fastqc(mate1_fastqgz + mate2_fastqgz, "./fastqc")
stderr = run_fastqc(
fastqs=mate1_fastqgz + mate2_fastqgz,
output_dir="./fastqc",
cores=self.requirements.resources.cores,
)
if "Failed to process" in stderr or "Skipping" in stderr:
self.error("Failed while processing with FastQC.")

Expand Down
31 changes: 23 additions & 8 deletions resolwe_bio/processes/reads_processing/bbduk.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ class BBDukSingle(Process):
slug = "bbduk-single"
name = "BBDuk (single-end)"
process_type = "data:reads:fastq:single:bbduk"
version = "3.1.2"
version = "3.2.0"
category = "FASTQ processing"
data_name = "{{ reads|name|default('?') }}"
scheduling_class = SchedulingClass.BATCH
Expand All @@ -164,8 +164,8 @@ class BBDukSingle(Process):
"docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"}
},
"resources": {
"cores": 4,
"memory": 8192,
"cores": 8,
"memory": 4096,
},
}

Expand Down Expand Up @@ -695,7 +695,12 @@ def run(self, inputs, outputs):
rename_preprocessed_files(input_files=input_reads)

fastqgz = [fastq["original_name"] for fastq in input_reads]
fastqc_inputs = fastqgz + ["--extract", f"--outdir={str(output_path)}"]
fastqc_inputs = fastqgz + [
"--extract",
f"--outdir={str(output_path)}",
"-t",
min(self.requirements.resources.cores, num_of_lanes),
]

if inputs.fastqc.nogroup:
fastqc_inputs.append("--no-group")
Expand Down Expand Up @@ -758,8 +763,8 @@ class BBDukPaired(Process):
"docker": {"image": "public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0"}
},
"resources": {
"cores": 4,
"memory": 8192,
"cores": 8,
"memory": 4096,
},
}

Expand Down Expand Up @@ -1329,8 +1334,18 @@ def run(self, inputs, outputs):

fastqgz = [fastq["original_name"] for fastq in input_reads]
fastqgz2 = [fastq["original_name2"] for fastq in input_reads]
fastqc_inputs = fastqgz + ["--extract", f"--outdir={str(output_path)}"]
fastqc2_inputs = fastqgz2 + ["--extract", f"--outdir={str(output_path)}"]
fastqc_inputs = fastqgz + [
"--extract",
f"--outdir={str(output_path)}",
"-t",
min(self.requirements.resources.cores, num_of_lanes),
]
fastqc2_inputs = fastqgz2 + [
"--extract",
f"--outdir={str(output_path)}",
"-t",
min(self.requirements.resources.cores, num_of_lanes),
]

if inputs.fastqc.nogroup:
fastqc_inputs.append("--no-group")
Expand Down

0 comments on commit d1097d8

Please sign in to comment.