diff --git a/cutadapt/Dockerfile b/cutadapt/Dockerfile index 03a5cd9..5be3c0a 100644 --- a/cutadapt/Dockerfile +++ b/cutadapt/Dockerfile @@ -7,7 +7,7 @@ # Set the base image to Ubuntu FROM ubuntu:20.04 -ARG PACKAGE_VERSION=2.10 +ARG PACKAGE_VERSION=4.0 ARG BUILD_PACKAGES="build-essential" ARG DEBIAN_FRONTEND=noninteractive diff --git a/cutadapt/README.md b/cutadapt/README.md index 857bea7..cc8994d 100644 --- a/cutadapt/README.md +++ b/cutadapt/README.md @@ -31,11 +31,10 @@ nextflow run main.nf -params-file params.json **Required**: * `input_R1`: Read1 fastq file. This is assumed to be in a gzipped form. -* `input_R2`: Read2 fastq file. This is assumed to be in a gzipped form. * `publish_dir`: the final location for the results. **Optional**: - +* `input_R2`: Read2 fastq file. This is assumed to be in a gzipped form. Default: `No_File`, which makes cutadapt run in single-end mode. * `read1_adapter` : Override the adapter for read 1. Default: "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" * `read2_adapter` : Override the adapter for read 2. Default: "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT" * `min_length` : set the minimum length permitted for reads. Default: 1 @@ -61,10 +60,7 @@ Examples of these files are located in [tests/expected](tests/expected). ### Metrics Current metrics captured in the qc_metrics file include: -* `adapter_read1_percent `: the percent of read 1's that contained adapter sequence. -* `adapter_read2_percent`: the percent of read 2's that contained adapter sequence. -* `quality_trimmed_percent`: the percent of reads that were trimmed for quality. - By default, this should be 0, because by default, quality trimming is turned off. +* `adapter_percent`: the percent of reads that contained adapter sequence (for paired-end input, this is the read 1 value). ## Local Testing @@ -79,7 +75,7 @@ wfpm workon cutadapt 3. Build the Docker container locally ``` cd cutadapt -docker build -t ghcr.io/icgc-argo-qc-wg/argo-qc-tools.cutadapt:3.4.0 . +docker build -t ghcr.io/icgc-argo-workflows/argo-qc-tools.cutadapt:0.2.0 . 
``` 4. Run the tests ``` @@ -92,6 +88,7 @@ If everything works correctly, you should see something like the following: Validating package: /Users/mtaschuk/git/argo-qc-tools/cutadapt Pakcage valid. Testing package: /Users/mtaschuk/git/argo-qc-tools/cutadapt -[1/1] Testing: /Users/mtaschuk/git/argo-qc-tools/cutadapt/tests/test-job-1.json. PASSED -Tested package: cutadapt, PASSED: 1, FAILED: 0 +[1/2] Testing: /Users/mtaschuk/git/argo-qc-tools/cutadapt/tests/test-job-1.json. PASSED +[2/2] Testing: /Users/mtaschuk/git/argo-qc-tools/cutadapt/tests/test-job-2.json. PASSED +Tested package: cutadapt, PASSED: 2, FAILED: 0 ``` diff --git a/cutadapt/main.nf b/cutadapt/main.nf index d235bf1..7bd62ba 100755 --- a/cutadapt/main.nf +++ b/cutadapt/main.nf @@ -29,7 +29,7 @@ /* this block is auto-generated based on info from pkg.json where */ /* changes can be made if needed, do NOT modify this block manually */ nextflow.enable.dsl = 2 -version = '0.1.1' // package version, changed from 3.4.0 so it doesnt match cutadapt +version = '0.2.0' // package version, changed from 3.4.0 so it doesnt match cutadapt container = [ 'ghcr.io': 'ghcr.io/icgc-argo-workflows/argo-qc-tools.cutadapt' @@ -49,9 +49,9 @@ params.publish_dir = "" // set to empty string will disable publishDir // tool specific params go here, add / change as needed +params.read_group_id = "" params.input_R1="" -params.input_R2="" -params.output_pattern = "*.cutadapt.log.qc.tgz" // output file name pattern +params.input_R2="No_File" params.read1_adapter="AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" params.read2_adapter="AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT" params.min_length=1 @@ -60,31 +60,35 @@ params.extra_options="" process cutadapt { container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}" - publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir + publishDir 
"${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false cpus params.cpus memory "${params.mem} GB" input: - path input_R1 - path input_R2 + tuple val(read_group_id), path(input_R1), path(input_R2) output: - path "output_dir/${params.output_pattern}", emit: output_tgz + path "output_dir/*.cutadapt.tgz", emit: cutadapt_tar + path "output_dir/*.cutadapt.log", emit: cutadapt_log + path "output_dir/*{fq,fastq,fq.gz,fastq.gz}", emit: cutadapt_results script: // add and initialize variables here as needed + arg_input_R2 = input_R2.name != 'No_File' ? "-2 ${input_R2}" : "" + """ mkdir -p output_dir main.py \ - -1 ${input_R1} -2 ${input_R2} \ + -1 ${input_R1} \ + -r ${read_group_id} \ -o output_dir \ -a ${params.read1_adapter} \ -A ${params.read2_adapter} \ -m ${params.min_length} \ - -q ${params.qual_cutoff} ${params.extra_options} + -q ${params.qual_cutoff} ${arg_input_R2} ${params.extra_options} """ } @@ -93,7 +97,6 @@ process cutadapt { // using this command: nextflow run icgc-argo-workflows/argo-qc-tools/cutadapt/main.nf -r cutadapt.v3.4.0 --params-file workflow { cutadapt( - file(params.input_R1), - file(params.input_R2) + tuple(params.read_group_id, file(params.input_R1), file(params.input_R2)) ) } diff --git a/cutadapt/main.py b/cutadapt/main.py index c39b8af..d2d23d5 100755 --- a/cutadapt/main.py +++ b/cutadapt/main.py @@ -33,6 +33,7 @@ import re import json import tarfile +import hashlib def run_cmd(cmd): proc = subprocess.Popen( @@ -61,7 +62,7 @@ def prepare_tarball(qc_metrics, logfile): files_to_tar = ['tar_content.json', qc_metrics, logfile] - tarfile_name = f"{os.path.dirname(logfile)}/{os.path.basename(logfile)}.qc.tgz" + tarfile_name = re.sub(r'.log$', r'.tgz', logfile) with tarfile.open(tarfile_name, "w:gz") as tar: for f in files_to_tar: tar.add(f, arcname=os.path.basename(f)) @@ -82,15 +83,13 @@ def prep_qc_metrics(cutadapt_log, tool_ver): }, 'metrics': {} } + with open(cutadapt_log,'r') as l: 
log=l.read() - r1_adapt=re.search("Read 1 with adapter:\s+\d+.+(\d+\.\d+)%",log) - r2_adapt=re.search("Read 2 with adapter:\s+\d+.+(\d+\.\d+)%",log) - q_trim=re.search("Quality-trimmed:\s+\d+.+(\d+\.\d+)%",log) - - qc_metrics['metrics']['adapter_read1_percent']=float(r1_adapt.group(1)) - qc_metrics['metrics']['adapter_read2_percent']=float(r2_adapt.group(1)) - qc_metrics['metrics']['quality_trimmed_percent']=float(q_trim.group(1)) + adapt=re.search("Read 1 with adapter:\s+\d+.+(\d+\.\d+)%",log) + if adapt is None: + adapt=re.search("Reads with adapters:\s+\d+.+(\d+\.\d+)%",log) + qc_metrics['metrics']['adapter_percent']=float(adapt.group(1)) qc_metrics_file = f"{os.path.dirname(cutadapt_log)}/{os.path.basename(cutadapt_log)}.qc_metrics.json" with open(qc_metrics_file, "w") as j: @@ -110,7 +109,9 @@ def main(): parser.add_argument('-1', '--input-R1', dest='input_R1', type=str, help='Input file read 1', required=True) parser.add_argument('-2', '--input-R2', dest='input_R2', type=str, - help='Input file read 2', required=True) + help='Input file read 2') + parser.add_argument('-r', '--rg_id', dest='rg_id', type=str, + help='Read group ID', required=True) parser.add_argument('-o', '--output-dir', dest='output_dir', type=str, help='Output directory', required=True) parser.add_argument('-a', '--read1-adapter', dest='adapter_R1', type=str, @@ -130,18 +131,27 @@ def main(): if not os.path.isfile(args.input_R1): sys.exit('Error: specified input file %s does not exist or is not accessible!' % args.input_R1) - if not os.path.isfile(args.input_R2): + if args.input_R2 and not os.path.isfile(args.input_R2): sys.exit('Error: specified input file %s does not exist or is not accessible!' % args.input_R2) if not os.path.isdir(args.output_dir): sys.exit('Error: specified output dir %s does not exist or is not accessible!' 
% args.output_dir) - basename=os.path.basename(args.input_R1) - index_of_dot=basename.index('.') - base=basename[:index_of_dot] + basename_R1=os.path.basename(args.input_R1) + + if args.input_R2: + basename_R2=os.path.basename(args.input_R2) + cmd = f"cutadapt -q {args.min_trim_qual} -m {args.min_trim_len} -a {args.adapter_R1} -A {args.adapter_R2} -o {args.output_dir}/trim_{basename_R1} -p {args.output_dir}/trim_{basename_R2} {args.input_R1} {args.input_R2}" + else: + cmd = f"cutadapt -q {args.min_trim_qual} -m {args.min_trim_len} -a {args.adapter_R1} -o {args.output_dir}/trim_{basename_R1} {args.input_R1}" + - stdout, stderr, returncode = run_cmd(f"cutadapt -q {args.min_trim_qual} -m {args.min_trim_len} -a {args.adapter_R1} -A {args.adapter_R2} -o {args.output_dir}/out.fastq.gz -p {args.output_dir}/out2.fastq.gz {args.input_R1} {args.input_R2}") + stdout, stderr, returncode = run_cmd(cmd) + # in case the rg_id contains characters that are not filename-friendly + friendly_rgid = "".join([ c if re.match(r"[a-zA-Z0-9\.\-_]", c) else "_" for c in args.rg_id ]) + # calculate md5 of the rg_id and add it to the logfile name to avoid name collisions + md5sum = hashlib.md5((args.rg_id).encode('utf-8')).hexdigest() - logfile=f"{args.output_dir}/{base}.cutadapt.log" + logfile=f"{args.output_dir}/{friendly_rgid}.{md5sum}.cutadapt.log" with open(logfile,"w") as log: log.write(stdout) diff --git a/cutadapt/pkg.json b/cutadapt/pkg.json index d84486e..44d154d 100644 --- a/cutadapt/pkg.json +++ b/cutadapt/pkg.json @@ -1,6 +1,6 @@ { "name": "cutadapt", - "version": "0.1.1", + "version": "0.2.0", "description": "CutAdapt tool", "main": "main.nf", "deprecated": false, @@ -37,4 +37,4 @@ "license": "MIT", "bugReport": "https://github.com/icgc-argo-workflows/argo-qc-tools/issues", "homepage": "https://github.com/icgc-argo-workflows/argo-qc-tools#readme" -} +} \ No newline at end of file diff --git a/cutadapt/tests/checker.nf b/cutadapt/tests/checker.nf index dd64844..c80eb48 100755 --- 
a/cutadapt/tests/checker.nf +++ b/cutadapt/tests/checker.nf @@ -27,7 +27,7 @@ /* this block is auto-generated based on info from pkg.json where */ /* changes can be made if needed, do NOT modify this block manually */ nextflow.enable.dsl = 2 -version = '0.1.1' // package version +version = '0.2.0' // package version container = [ 'ghcr.io': 'ghcr.io/icgc-argo-workflows/argo-qc-tools.cutadapt' ] @@ -38,8 +38,11 @@ params.container_registry = "" params.container_version = "" params.container = "" // tool specific parmas go here, add / change as needed -params.input_file = "" +params.read_group_id = "" +params.input_R1="" +params.input_R2="No_File" params.expected_output = "" + include { cutadapt } from '../main' process file_smart_diff { @@ -56,7 +59,7 @@ process file_smart_diff { tar xvf !{output_tgz} -C actual tar xvf !{expected_tgz} -C expected - export NAME=`basename !{output_tgz} .cutadapt.log.qc.tgz` + export NAME=`basename !{output_tgz} .cutadapt.tgz` diff actual/tar_content.json expected/tar_content.json \ && ( echo "TAR_CONTENT: Test PASSED" && exit 0 ) || ( echo "TAR_CONTENT: Test FAILED. 
tar_content.json files do not match" && exit 1 ) @@ -74,20 +77,22 @@ process file_smart_diff { } workflow checker { take: + read_group_id input_R1 input_R2 expected_tgz main: cutadapt( - file(params.input_R1), file(params.input_R2) + tuple(read_group_id, input_R1, input_R2) ) file_smart_diff( - cutadapt.out.output_tgz, + cutadapt.out.cutadapt_tar, expected_tgz ) } workflow { checker( + params.read_group_id, file(params.input_R1), file(params.input_R2), file(params.expected_tgz) diff --git a/cutadapt/tests/expected/expected-TCRBOA7-T-RNA.cutadapt.log.qc.tgz b/cutadapt/tests/expected/expected-TCRBOA7-T-RNA.cutadapt.log.qc.tgz deleted file mode 100644 index 2fb7735..0000000 Binary files a/cutadapt/tests/expected/expected-TCRBOA7-T-RNA.cutadapt.log.qc.tgz and /dev/null differ diff --git a/cutadapt/tests/expected/expected.test-job-1.cutadapt.tgz b/cutadapt/tests/expected/expected.test-job-1.cutadapt.tgz new file mode 100644 index 0000000..c0dcfe8 Binary files /dev/null and b/cutadapt/tests/expected/expected.test-job-1.cutadapt.tgz differ diff --git a/cutadapt/tests/expected/expected.test-job-2.cutadapt.tgz b/cutadapt/tests/expected/expected.test-job-2.cutadapt.tgz new file mode 100644 index 0000000..c1bc90c Binary files /dev/null and b/cutadapt/tests/expected/expected.test-job-2.cutadapt.tgz differ diff --git a/cutadapt/tests/test-job-1.json b/cutadapt/tests/test-job-1.json index 76e96d0..21654a5 100644 --- a/cutadapt/tests/test-job-1.json +++ b/cutadapt/tests/test-job-1.json @@ -1,7 +1,8 @@ { + "read_group_id": "TEST-RNA", "input_R1": "input/TCRBOA7-T-RNA.250reads.read1.fastq.gz", "input_R2": "input/TCRBOA7-T-RNA.250reads.read2.fastq.gz", - "expected_tgz": "expected/expected-TCRBOA7-T-RNA.cutadapt.log.qc.tgz", + "expected_tgz": "expected/expected.test-job-1.cutadapt.tgz", "publish_dir": "outdir", "cpus": 1, "mem": 0.5 diff --git a/cutadapt/tests/test-job-2.json b/cutadapt/tests/test-job-2.json new file mode 100644 index 0000000..aad75e8 --- /dev/null +++ 
b/cutadapt/tests/test-job-2.json @@ -0,0 +1,8 @@ +{ + "read_group_id": "TEST-RNA", + "input_R1": "input/TCRBOA7-T-RNA.250reads.read1.fastq.gz", + "expected_tgz": "expected/expected.test-job-2.cutadapt.tgz", + "publish_dir": "outdir", + "cpus": 1, + "mem": 0.5 +}