From e1867c20a9a1efea85adb1d86c9d9a69b424bcf6 Mon Sep 17 00:00:00 2001 From: "Pavel V. Dimens" <19176506+pdimens@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:41:32 -0500 Subject: [PATCH] Improved singleton counting (#170) * fix name * keep stderr open for common convention * add useful script * update how singletons are counted * fix the last missing bc * alphanumeric check --- .github/workflows/tests.yml | 2 +- harpy/bin/bx_stats.py | 7 ++- harpy/bin/separate_singletons | 80 +++++++++++++++++++++++++++++++++ harpy/bin/separate_validbx | 6 +-- harpy/downsample.py | 4 +- harpy/reports/align_bxstats.Rmd | 7 +-- harpy/reports/align_stats.Rmd | 4 +- 7 files changed, 98 insertions(+), 12 deletions(-) create mode 100755 harpy/bin/separate_singletons diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a9d38045c..00953809b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -831,7 +831,7 @@ jobs: - name: harpy downsample bam shell: micromamba-shell {0} run: harpy downsample -d 1 --random-seed 699 --quiet test/bam/sample1.bam - - name: harpy downsample bam + - name: harpy downsample fastq shell: micromamba-shell {0} run: harpy downsample -d 1 --quiet test/fastq/sample1.* - name: harpy hpc diff --git a/harpy/bin/bx_stats.py b/harpy/bin/bx_stats.py index b46176a5d..fc889a026 100755 --- a/harpy/bin/bx_stats.py +++ b/harpy/bin/bx_stats.py @@ -141,8 +141,13 @@ def writestats(x, writechrom, destination): } else: # update the basic alignment info of the molecule + if read.is_forward: + # +1 for a forward read, whether it is paired or not + d[mi]["n"] += 1 + elif read.is_reverse and not read.is_paired: + # +1 for reverse only if it's unpaired, so the paired read doesn't count twice + d[mi]["n"] += 1 d[mi]["bp"] += bp - d[mi]["n"] += 1 d[mi]["insert_len"] += isize d[mi]["start"] = min(pos_start, d[mi]["start"]) d[mi]["end"] = max(pos_end, d[mi]["end"]) diff --git a/harpy/bin/separate_singletons b/harpy/bin/separate_singletons new file mode 100755 index 000000000..16e3e2d17 --- /dev/null +++ b/harpy/bin/separate_singletons @@ -0,0 +1,80 @@ +#! /usr/bin/env python + +import os +import re +import sys +import argparse +import subprocess +import pysam + +parser = argparse.ArgumentParser( + prog='separate_singletons', + description='Isolate singleton and non-singleton linked-read BAM records into separate files.', + usage = "separate_singletons -t threads -b barcode_tag -s singletons.bam input.bam > output.bam", + ) +parser.add_argument("-b", dest = "bx_tag", metavar = "barcode_tag", type=str, default = "BX", help="The header tag with the barcode (default: %(default)s)") +parser.add_argument("-s", dest = "singletons", metavar = "singletons_file", type=str, default = "singletons.bam", help="Name of output singleton file (default: %(default)s)") +parser.add_argument("-t", dest = "threads", metavar="threads", type=int, default = 4, help="Number of threads to use (default: %(default)s)") +parser.add_argument('input', type = str, help = "Input bam file") +if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) + +args = parser.parse_args() +if args.threads <1: + parser.error("Threads supplied to -t ({args.threads}) must be positive (e.g. >1)") +if not os.path.exists(args.input): + parser.error(f"{args.input} was not found") +if len(args.bx_tag) != 2 or args.bx_tag.isalnum(): + parser.error(f"The header tag supplied to -b ({args.bx_tag}) must be alphanumeric and exactly two characters long") + +invalid_pattern = re.compile(r'[AaBbCcDd]00') +sorted_bam = f"{args.input[:-4]}.bxsort.bam" +subprocess.run(f"samtools sort -@ {args.threads} -o {sorted_bam} -t {args.bx_tag} {args.input}".split(), stderr=sys.stderr) +with ( + pysam.AlignmentFile(sorted_bam, "rb", check_sq=False) as infile, + pysam.AlignmentFile(sys.stdout, "wb", template=infile) as nonsingleton, + pysam.AlignmentFile(args.singletons, "wb", template=infile) as singleton, +): + record_store = [] + read_count = 0 + last_barcode = None + for record in infile: + try: + barcode = record.get_tag(args.bx_tag) + if isinstance(barcode, int): + pass # an int from an MI-tharype tag + elif invalid_pattern.search(barcode): + continue + except KeyError: + continue + # write the stored records when the barcode changes + if last_barcode and barcode != last_barcode: + if read_count > 1: + [nonsingleton.write(i) for i in record_store] + else: + [singleton.write(i) for i in record_store] + # reset the record store and read count + record_store = [] + read_count = 0 + + record_store.append(record) + if record.is_forward: + # +1 for a forward read, whether it is paired or not + read_count += 1 + elif record.is_reverse and not record.is_paired: + # +1 for reverse only if it's unpaired, so the paired read doesn't count twice + read_count += 1 + # update the last barcode with the current one + last_barcode = barcode + # After the for loop ends + if record_store: + if read_count > 1: + for i in record_store: + nonsingleton.write(i) + else: + for i in record_store: + singleton.write(i) + +# final housekeeping to remove intermediate +os.remove(sorted_bam) \ No newline at end of file diff --git a/harpy/bin/separate_validbx b/harpy/bin/separate_validbx index 795e7ff7c..b1b431c96 100755 --- a/harpy/bin/separate_validbx +++ b/harpy/bin/separate_validbx @@ -1,9 +1,9 @@ #! /usr/bin/env bash if [[ -z "$1" ]]; then - echo -e "\n Split a BAM file with BX:Z tags into 2 files, one with valid ACBD barcodes (stdout), one with invalid ACBD barcodes (stderr)." - echo -e "\n [usage] separate_validbx input.bam > valid.bam 2> invalid.bam" + echo -e "\n Split a BAM file with BX:Z tags into 2 files, one with valid ACBD barcodes (stdout), one with invalid ACBD barcodes." + echo -e "\n [usage] separate_validbx invalid.bam input.bam > valid.bam" exit fi -samtools view -e '[BX]!~"[ABCD]0{2,4}"' --unoutput /dev/stderr $1 \ No newline at end of file +samtools view -e '[BX]!~"[ABCD]0{2,4}"' --unoutput $1 $2 \ No newline at end of file diff --git a/harpy/downsample.py b/harpy/downsample.py index e0b2c8a1c..62beccb40 100644 --- a/harpy/downsample.py +++ b/harpy/downsample.py @@ -47,8 +47,8 @@ def downsample(input, prefix, downsample, invalid, bx_tag, random_seed, threads, - `drop`: don't output any invalid/missing barcodes """ # validate input files as either 1 bam or 2 fastq - if len(bx_tag) != 2: - raise click.BadParameter(f'\'{bx_tag}\' is not a valid SAM tag. Tags for --bx-tag must be exactly 2 characters, e.g. "BX"') + if len(bx_tag) != 2 or not bx_tag.isalnum(): + raise click.BadParameter(f'\'{bx_tag}\' is not a valid SAM tag. Tags for --bx-tag must be alphanumeric and exactly 2 characters, e.g. "BX"') if len(input) > 2: raise click.BadParameter('inputs must be 1 BAM file or 2 FASTQ files.') if len(input) == 1: diff --git a/harpy/reports/align_bxstats.Rmd b/harpy/reports/align_bxstats.Rmd index df37ea4b5..4c21ce673 100644 --- a/harpy/reports/align_bxstats.Rmd +++ b/harpy/reports/align_bxstats.Rmd @@ -55,8 +55,8 @@ process_input <- function(infile){ tb[tb$valid != "invalidBX", "valid"] <- "validBX" tb$valid <- gsub("BX", " BX", tb$valid) # isolate non-singletons b/c molecules with 1 read pair aren't linked reads - multiplex_df <- filter(tb, valid == "valid BX", reads > 2) - singletons <- sum(tb$reads <= 2 & tb$valid == "valid BX") + multiplex_df <- filter(tb, valid == "valid BX", reads >= 2) + singletons <- sum(tb$reads < 2 & tb$valid == "valid BX") tot_uniq_bx <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+") tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx$V1[1]) |> as.integer() tot_mol <- sum(tb$valid == "valid BX") @@ -116,7 +116,8 @@ if(nrow(aggregate_df) == 0){ This report aggregates the barcode-specific information from the alignments that were created using `harpy align`. Detailed information for any one sample can be found in that sample's individual report. The table below is an aggregation -of data for each sample based on their `*.bxstats.gz` file. +of data for each sample based on their `*.bxstats.gz` file. Every column after `% valid bx` +ignores singletons in its calculations. - `avg` refers to the average (arithmetic mean) - `SEM` refers to the Standard Error of the mean diff --git a/harpy/reports/align_stats.Rmd b/harpy/reports/align_stats.Rmd index 78b6a4653..d956fcd86 100644 --- a/harpy/reports/align_stats.Rmd +++ b/harpy/reports/align_stats.Rmd @@ -77,7 +77,7 @@ totuniqBX <- gsub("#total unique barcodes: ", "", totuniqBX) |> as.integer() tot_valid <- sum(valids$reads) tot_invalid <- sum(invalids$reads) -non_singletons <- valids[valids$reads >2, ] +non_singletons <- valids[valids$reads >= 2, ] n_non_singleton_mol <- nrow(non_singletons) ``` @@ -131,7 +131,7 @@ valueBox(scales::comma(tot_invalid), caption = "Invalid BX Records", color = "wa ### singletons ```{r valuebox_singletons} if (VALID_PRESENT){ - valueBox(round(sum(valids$reads <= 2)/nrow(valids), 2), caption = "% Singletons") + valueBox(round(sum(valids$reads < 2)/nrow(valids), 2), caption = "% Singletons") } else { valueBox("NA", caption = "% Singletons") }