Skip to content

Commit

Permalink
Improved singleton counting (#170)
Browse files Browse the repository at this point in the history
* fix name

* keep stderr open for common convention

* add useful script

* update how singletons are counted

* fix the last missing bc

* alphanumeric check
  • Loading branch information
pdimens authored Dec 6, 2024
1 parent 2b29a56 commit e1867c2
Show file tree
Hide file tree
Showing 7 changed files with 98 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,7 @@ jobs:
- name: harpy downsample bam
shell: micromamba-shell {0}
run: harpy downsample -d 1 --random-seed 699 --quiet test/bam/sample1.bam
- name: harpy downsample bam
- name: harpy downsample fastq
shell: micromamba-shell {0}
run: harpy downsample -d 1 --quiet test/fastq/sample1.*
- name: harpy hpc
Expand Down
7 changes: 6 additions & 1 deletion harpy/bin/bx_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,13 @@ def writestats(x, writechrom, destination):
}
else:
# update the basic alignment info of the molecule
if read.is_forward:
# +1 for a forward read, whether it is paired or not
d[mi]["n"] += 1
elif read.is_reverse and not read.is_paired:
# +1 for reverse only if it's unpaired, so the paired read doesn't count twice
d[mi]["n"] += 1
d[mi]["bp"] += bp
d[mi]["n"] += 1
d[mi]["insert_len"] += isize
d[mi]["start"] = min(pos_start, d[mi]["start"])
d[mi]["end"] = max(pos_end, d[mi]["end"])
Expand Down
80 changes: 80 additions & 0 deletions harpy/bin/separate_singletons
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#! /usr/bin/env python

import os
import re
import sys
import argparse
import subprocess
import pysam

parser = argparse.ArgumentParser(
prog='separate_singletons',
description='Isolate singleton and non-singleton linked-read BAM records into separate files.',
usage = "separate_singletons -t threads -b barcode_tag -s singletons.bam input.bam > output.bam",
)
parser.add_argument("-b", dest = "bx_tag", metavar = "barcode_tag", type=str, default = "BX", help="The header tag with the barcode (default: %(default)s)")
parser.add_argument("-s", dest = "singletons", metavar = "singletons_file", type=str, default = "singletons.bam", help="Name of output singleton file (default: %(default)s)")
parser.add_argument("-t", dest = "threads", metavar="threads", type=int, default = 4, help="Number of threads to use (default: %(default)s)")
parser.add_argument('input', type = str, help = "Input bam file")
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)

args = parser.parse_args()
if args.threads <1:
parser.error("Threads supplied to -t ({args.threads}) must be positive (e.g. >1)")
if not os.path.exists(args.input):
parser.error(f"{args.input} was not found")
if len(args.bx_tag) != 2 or args.bx_tag.isalnum():
parser.error(f"The header tag supplied to -b ({args.bx_tag}) must be alphanumeric and exactly two characters long")

invalid_pattern = re.compile(r'[AaBbCcDd]00')
sorted_bam = f"{args.input[:-4]}.bxsort.bam"
subprocess.run(f"samtools sort -@ {args.threads} -o {sorted_bam} -t {args.bx_tag} {args.input}".split(), stderr=sys.stderr)
with (
pysam.AlignmentFile(sorted_bam, "rb", check_sq=False) as infile,
pysam.AlignmentFile(sys.stdout, "wb", template=infile) as nonsingleton,
pysam.AlignmentFile(args.singletons, "wb", template=infile) as singleton,
):
record_store = []
read_count = 0
last_barcode = None
for record in infile:
try:
barcode = record.get_tag(args.bx_tag)
if isinstance(barcode, int):
pass # an int from an MI-tharype tag
elif invalid_pattern.search(barcode):
continue
except KeyError:
continue
# write the stored records when the barcode changes
if last_barcode and barcode != last_barcode:
if read_count > 1:
[nonsingleton.write(i) for i in record_store]
else:
[singleton.write(i) for i in record_store]
# reset the record store and read count
record_store = []
read_count = 0

record_store.append(record)
if record.is_forward:
# +1 for a forward read, whether it is paired or not
read_count += 1
elif record.is_reverse and not record.is_paired:
# +1 for reverse only if it's unpaired, so the paired read doesn't count twice
read_count += 1
# update the last barcode with the current one
last_barcode = barcode
# After the for loop ends
if record_store:
if read_count > 1:
for i in record_store:
nonsingleton.write(i)
else:
for i in record_store:
singleton.write(i)

# final housekeeping to remove intermediate
os.remove(sorted_bam)
6 changes: 3 additions & 3 deletions harpy/bin/separate_validbx
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#! /usr/bin/env bash

if [[ -z "$1" ]]; then
echo -e "\n Split a BAM file with BX:Z tags into 2 files, one with valid ACBD barcodes (stdout), one with invalid ACBD barcodes (stderr)."
echo -e "\n [usage] separate_validbx input.bam > valid.bam 2> invalid.bam"
echo -e "\n Split a BAM file with BX:Z tags into 2 files, one with valid ACBD barcodes (stdout), one with invalid ACBD barcodes."
echo -e "\n [usage] separate_validbx invalid.bam input.bam > valid.bam"
exit
fi

samtools view -e '[BX]!~"[ABCD]0{2,4}"' --unoutput /dev/stderr $1
samtools view -e '[BX]!~"[ABCD]0{2,4}"' --unoutput $1 $2
4 changes: 2 additions & 2 deletions harpy/downsample.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ def downsample(input, prefix, downsample, invalid, bx_tag, random_seed, threads,
- `drop`: don't output any invalid/missing barcodes
"""
# validate input files as either 1 bam or 2 fastq
if len(bx_tag) != 2:
raise click.BadParameter(f'\'{bx_tag}\' is not a valid SAM tag. Tags for --bx-tag must be exactly 2 characters, e.g. "BX"')
if len(bx_tag) != 2 or not bx_tag.isalnum():
raise click.BadParameter(f'\'{bx_tag}\' is not a valid SAM tag. Tags for --bx-tag must be alphanumeric and exactly 2 characters, e.g. "BX"')
if len(input) > 2:
raise click.BadParameter('inputs must be 1 BAM file or 2 FASTQ files.')
if len(input) == 1:
Expand Down
7 changes: 4 additions & 3 deletions harpy/reports/align_bxstats.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ process_input <- function(infile){
tb[tb$valid != "invalidBX", "valid"] <- "validBX"
tb$valid <- gsub("BX", " BX", tb$valid)
# isolate non-singletons b/c molecules with 1 read pair aren't linked reads
multiplex_df <- filter(tb, valid == "valid BX", reads > 2)
singletons <- sum(tb$reads <= 2 & tb$valid == "valid BX")
multiplex_df <- filter(tb, valid == "valid BX", reads >= 2)
singletons <- sum(tb$reads < 2 & tb$valid == "valid BX")
tot_uniq_bx <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+")
tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx$V1[1]) |> as.integer()
tot_mol <- sum(tb$valid == "valid BX")
Expand Down Expand Up @@ -116,7 +116,8 @@ if(nrow(aggregate_df) == 0){
This report aggregates the barcode-specific information from the alignments
that were created using `harpy align`. Detailed information for any one sample
can be found in that sample's individual report. The table below is an aggregation
of data for each sample based on their `*.bxstats.gz` file.
of data for each sample based on their `*.bxstats.gz` file. Every column after `% valid bx`
ignores singletons in its calculations.

- `avg` refers to the average (arithmetic mean)
- `SEM` refers to the Standard Error of the mean
Expand Down
4 changes: 2 additions & 2 deletions harpy/reports/align_stats.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ totuniqBX <- gsub("#total unique barcodes: ", "", totuniqBX) |> as.integer()
tot_valid <- sum(valids$reads)
tot_invalid <- sum(invalids$reads)
non_singletons <- valids[valids$reads >2, ]
non_singletons <- valids[valids$reads >= 2, ]
n_non_singleton_mol <- nrow(non_singletons)
```

Expand Down Expand Up @@ -131,7 +131,7 @@ valueBox(scales::comma(tot_invalid), caption = "Invalid BX Records", color = "wa
### singletons
```{r valuebox_singletons}
if (VALID_PRESENT){
valueBox(round(sum(valids$reads <= 2)/nrow(valids), 2), caption = "% Singletons")
valueBox(round(sum(valids$reads < 2)/nrow(valids), 2), caption = "% Singletons")
} else {
valueBox("NA", caption = "% Singletons")
}
Expand Down

0 comments on commit e1867c2

Please sign in to comment.