From e1867c20a9a1efea85adb1d86c9d9a69b424bcf6 Mon Sep 17 00:00:00 2001
From: "Pavel V. Dimens" <19176506+pdimens@users.noreply.github.com>
Date: Fri, 6 Dec 2024 16:41:32 -0500
Subject: [PATCH] Improved singleton counting (#170)

* fix name

* keep stderr open for common convention

* add useful script

* update how singletons are counted

* fix the last missing bc

* alphanumeric check
---
 .github/workflows/tests.yml     |  2 +-
 harpy/bin/bx_stats.py           |  7 ++-
 harpy/bin/separate_singletons   | 80 +++++++++++++++++++++++++++++++++
 harpy/bin/separate_validbx      |  6 +--
 harpy/downsample.py             |  4 +-
 harpy/reports/align_bxstats.Rmd |  7 +--
 harpy/reports/align_stats.Rmd   |  4 +-
 7 files changed, 98 insertions(+), 12 deletions(-)
 create mode 100755 harpy/bin/separate_singletons

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index a9d38045c..00953809b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -831,7 +831,7 @@ jobs:
       - name: harpy downsample bam
         shell: micromamba-shell {0}
         run: harpy downsample -d 1 --random-seed 699 --quiet test/bam/sample1.bam
-      - name: harpy downsample bam
+      - name: harpy downsample fastq
         shell: micromamba-shell {0}
         run: harpy downsample -d 1 --quiet test/fastq/sample1.*
       - name: harpy hpc
diff --git a/harpy/bin/bx_stats.py b/harpy/bin/bx_stats.py
index b46176a5d..fc889a026 100755
--- a/harpy/bin/bx_stats.py
+++ b/harpy/bin/bx_stats.py
@@ -141,8 +141,13 @@ def writestats(x, writechrom, destination):
         }
     else:
         # update the basic alignment info of the molecule
+        if read.is_forward:
+            # +1 for a forward read, whether it is paired or not
+            d[mi]["n"]  += 1
+        elif read.is_reverse and not read.is_paired:
+            # +1 for reverse only if it's unpaired, so the paired read doesn't count twice
+            d[mi]["n"]  += 1
         d[mi]["bp"] += bp
-        d[mi]["n"]  += 1
         d[mi]["insert_len"] += isize
         d[mi]["start"] = min(pos_start, d[mi]["start"])
         d[mi]["end"] = max(pos_end, d[mi]["end"])
diff --git a/harpy/bin/separate_singletons b/harpy/bin/separate_singletons
new file mode 100755
index 000000000..16e3e2d17
--- /dev/null
+++ b/harpy/bin/separate_singletons
@@ -0,0 +1,80 @@
+#! /usr/bin/env python
+
+import os
+import re
+import sys
+import argparse
+import subprocess
+import pysam
+
+parser = argparse.ArgumentParser(
+    prog='separate_singletons',
+    description='Isolate singleton and non-singleton linked-read BAM records into separate files.',
+    usage = "separate_singletons -t threads -b barcode_tag -s singletons.bam input.bam > output.bam",
+    )
+parser.add_argument("-b", dest = "bx_tag", metavar = "barcode_tag", type=str, default = "BX", help="The header tag with the barcode (default: %(default)s)")
+parser.add_argument("-s", dest = "singletons", metavar = "singletons_file", type=str, default = "singletons.bam", help="Name of output singleton file (default: %(default)s)")
+parser.add_argument("-t", dest = "threads", metavar="threads", type=int, default = 4, help="Number of threads to use (default: %(default)s)")
+parser.add_argument('input', type = str, help = "Input bam file")
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+
+args = parser.parse_args()
+if args.threads <1:
+    parser.error("Threads supplied to -t ({args.threads}) must be positive (e.g. >1)")
+if not os.path.exists(args.input):
+    parser.error(f"{args.input} was not found")
+if len(args.bx_tag) != 2 or args.bx_tag.isalnum():
+    parser.error(f"The header tag supplied to -b ({args.bx_tag}) must be alphanumeric and exactly two characters long")
+
+invalid_pattern = re.compile(r'[AaBbCcDd]00')
+sorted_bam = f"{args.input[:-4]}.bxsort.bam"
+subprocess.run(f"samtools sort -@ {args.threads} -o {sorted_bam} -t {args.bx_tag} {args.input}".split(), stderr=sys.stderr)
+with (
+    pysam.AlignmentFile(sorted_bam, "rb", check_sq=False) as infile,
+    pysam.AlignmentFile(sys.stdout, "wb", template=infile) as nonsingleton,
+    pysam.AlignmentFile(args.singletons, "wb", template=infile) as singleton,                      
+):
+    record_store = []
+    read_count = 0
+    last_barcode = None
+    for record in infile:
+        try:
+            barcode = record.get_tag(args.bx_tag)
+            if isinstance(barcode, int):
+                pass # an int from an MI-tharype tag
+            elif invalid_pattern.search(barcode):
+                continue
+        except KeyError:
+            continue
+        # write the stored records when the barcode changes
+        if last_barcode and barcode != last_barcode:
+            if read_count > 1:
+                [nonsingleton.write(i) for i in record_store]
+            else:
+                [singleton.write(i) for i in record_store]
+            # reset the record store and read count
+            record_store = []
+            read_count = 0
+
+        record_store.append(record)
+        if record.is_forward:
+            # +1 for a forward read, whether it is paired or not
+            read_count += 1
+        elif record.is_reverse and not record.is_paired:
+            # +1 for reverse only if it's unpaired, so the paired read doesn't count twice
+            read_count += 1
+        # update the last barcode with the current one
+        last_barcode = barcode
+    # After the for loop ends
+    if record_store:
+        if read_count > 1:
+            for i in record_store:
+                nonsingleton.write(i)
+        else:
+            for i in record_store:
+                singleton.write(i)
+
+# final housekeeping to remove intermediate
+os.remove(sorted_bam)
\ No newline at end of file
diff --git a/harpy/bin/separate_validbx b/harpy/bin/separate_validbx
index 795e7ff7c..b1b431c96 100755
--- a/harpy/bin/separate_validbx
+++ b/harpy/bin/separate_validbx
@@ -1,9 +1,9 @@
 #! /usr/bin/env bash
 
 if [[ -z "$1" ]]; then
-    echo -e "\n  Split a BAM file with BX:Z tags into 2 files, one with valid ACBD barcodes (stdout), one with invalid ACBD barcodes (stderr)."
-    echo -e "\n [usage] separate_validbx input.bam > valid.bam 2> invalid.bam"
+    echo -e "\n  Split a BAM file with BX:Z tags into 2 files, one with valid ACBD barcodes (stdout), one with invalid ACBD barcodes."
+    echo -e "\n [usage] separate_validbx invalid.bam input.bam > valid.bam"
     exit
 fi
 
-samtools view -e '[BX]!~"[ABCD]0{2,4}"' --unoutput /dev/stderr $1
\ No newline at end of file
+samtools view -e '[BX]!~"[ABCD]0{2,4}"' --unoutput $1 $2
\ No newline at end of file
diff --git a/harpy/downsample.py b/harpy/downsample.py
index e0b2c8a1c..62beccb40 100644
--- a/harpy/downsample.py
+++ b/harpy/downsample.py
@@ -47,8 +47,8 @@ def downsample(input, prefix, downsample, invalid, bx_tag, random_seed, threads,
     - `drop`: don't output any invalid/missing barcodes
     """
     # validate input files as either 1 bam or 2 fastq
-    if len(bx_tag) != 2:
-        raise click.BadParameter(f'\'{bx_tag}\' is not a valid SAM tag. Tags for --bx-tag must be exactly 2 characters, e.g. "BX"')
+    if len(bx_tag) != 2 or not bx_tag.isalnum():
+        raise click.BadParameter(f'\'{bx_tag}\' is not a valid SAM tag. Tags for --bx-tag must be alphanumeric and exactly 2 characters, e.g. "BX"')
     if len(input) > 2:
         raise click.BadParameter('inputs must be 1 BAM file or 2 FASTQ files.')
     if len(input) == 1:
diff --git a/harpy/reports/align_bxstats.Rmd b/harpy/reports/align_bxstats.Rmd
index df37ea4b5..4c21ce673 100644
--- a/harpy/reports/align_bxstats.Rmd
+++ b/harpy/reports/align_bxstats.Rmd
@@ -55,8 +55,8 @@ process_input <- function(infile){
   tb[tb$valid != "invalidBX", "valid"] <- "validBX"
   tb$valid <- gsub("BX", " BX", tb$valid)
   # isolate non-singletons b/c molecules with 1 read pair aren't linked reads
-  multiplex_df <- filter(tb, valid == "valid BX", reads > 2)
-  singletons <- sum(tb$reads <= 2 & tb$valid == "valid BX")
+  multiplex_df <- filter(tb, valid == "valid BX", reads >= 2)
+  singletons <- sum(tb$reads < 2 & tb$valid == "valid BX")
   tot_uniq_bx <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+")
   tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx$V1[1]) |> as.integer()
   tot_mol <- sum(tb$valid == "valid BX")
@@ -116,7 +116,8 @@ if(nrow(aggregate_df) == 0){
 This report aggregates the barcode-specific information from the alignments
 that were created using `harpy align`. Detailed information for any one sample
 can be found in that sample's individual report. The table below is an aggregation
-of data for each sample based on their `*.bxstats.gz` file.
+of data for each sample based on their `*.bxstats.gz` file. Every column after `% valid bx`
+ignores singletons in its calculations. 
 
 - `avg` refers to the average (arithmetic mean)
 - `SEM` refers to the Standard Error of the mean
diff --git a/harpy/reports/align_stats.Rmd b/harpy/reports/align_stats.Rmd
index 78b6a4653..d956fcd86 100644
--- a/harpy/reports/align_stats.Rmd
+++ b/harpy/reports/align_stats.Rmd
@@ -77,7 +77,7 @@ totuniqBX <- gsub("#total unique barcodes: ", "", totuniqBX) |> as.integer()
 tot_valid <- sum(valids$reads)
 tot_invalid <- sum(invalids$reads)
 
-non_singletons <- valids[valids$reads >2, ]
+non_singletons <- valids[valids$reads >= 2, ]
 n_non_singleton_mol <- nrow(non_singletons)
 ```
 
@@ -131,7 +131,7 @@ valueBox(scales::comma(tot_invalid), caption = "Invalid BX Records", color = "wa
 ### singletons
 ```{r valuebox_singletons}
 if (VALID_PRESENT){
-  valueBox(round(sum(valids$reads <= 2)/nrow(valids), 2), caption = "% Singletons")
+  valueBox(round(sum(valids$reads < 2)/nrow(valids), 2), caption = "% Singletons")
 } else {
   valueBox("NA", caption = "% Singletons")
 }