Skip to content

Commit

Permalink
Parse barcodes from read IDs in demultiplexed mode (#26)
Browse files Browse the repository at this point in the history
  • Loading branch information
mtomko authored Oct 30, 2023
1 parent 2a34081 commit ec90da6
Show file tree
Hide file tree
Showing 10 changed files with 135 additions and 22 deletions.
4 changes: 3 additions & 1 deletion src/main/scala/org/broadinstitute/gpp/poolq3/PoolQ.scala
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,10 @@ object PoolQ {
ExactReference(referenceData.mappings, identity, includeAmbiguous = false)
}

val colBarcodePolicyOrLength: Either[Int, BarcodePolicy] = colBarcodePolicyOpt.toRight(colReference.barcodeLength)

val barcodes: CloseableIterable[Barcodes] =
barcodeSource(config.input, rowBarcodePolicy, revRowBarcodePolicyOpt, colBarcodePolicyOpt, umiInfo.map(_._2))
barcodeSource(config.input, rowBarcodePolicy, revRowBarcodePolicyOpt, colBarcodePolicyOrLength, umiInfo.map(_._2))

lazy val unexpectedSequenceCacheDir: Option[Path] =
if (config.skipUnexpectedSequenceReport) None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,12 @@ final case class PoolQConfig(
def isPairedEnd =
reverseRowBarcodePolicyStr.isDefined &&
(input.readsSourceE match {
case Right(ReadsSource.PairedEnd(_, _, _)) => true
case _ => false
case Right(ReadsSource.PairedEnd(_, _, _)) => true
case Right(ReadsSource.DmuxedPairedEnd(_, _)) => true
case Right(ReadsSource.SelfContained(_)) => false
case Right(ReadsSource.Split(_, _)) => false
case Right(ReadsSource.Dmuxed(_)) => false
case Left(_) => false
})

}
Expand Down
18 changes: 18 additions & 0 deletions src/main/scala/org/broadinstitute/gpp/poolq3/barcode/Dmuxed.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/*
* Copyright (c) 2022 The Broad Institute, Inc. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.broadinstitute.gpp.poolq3.barcode

/** Helpers for working with reads that come from demultiplexed files. */
object Dmuxed {

  /** Builds a function that tries to read a barcode of exactly `length` bases off the end of a read ID.
    *
    * The returned function only yields a barcode when the entire ID matches the following shape: a leading `@`, any
    * run of characters, one character that is not `A`, `C`, `G`, `T` or `N`, and finally exactly `length` characters
    * drawn from `A`, `C`, `G`, `T` and `N`. Because a non-base character must sit immediately before the captured
    * bases, a trailing run of bases that is longer or shorter than `length` produces `None`.
    *
    * @param length
    *   the exact number of bases the trailing barcode must contain
    * @return
    *   a function from read ID to the barcode found at its end, if any
    */
  private[barcode] def barcodeFromId(length: Int): String => Option[FoundBarcode] = {
    // the pattern is built once here and then reused by every invocation of the returned function
    val trailingBarcode = s"@.*[^ACGTN]([ACGTN]{$length})$$".r

    readId =>
      readId match {
        // NOTE(review): the second argument is always 0 here; presumably an offset - confirm against FoundBarcode
        case trailingBarcode(bases) => Some(FoundBarcode(bases.toCharArray, 0))
        case _                      => None
      }
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,15 @@ package org.broadinstitute.gpp.poolq3.barcode
import org.broadinstitute.gpp.poolq3.parser.{CloseableIterable, CloseableIterator, DmuxedIterable}
import org.broadinstitute.gpp.poolq3.types.Read

final class DmuxedBarcodeSource(parser: DmuxedIterable, rowPolicy: BarcodePolicy, umiPolicyOpt: Option[BarcodePolicy])
extends CloseableIterable[Barcodes] {
final class DmuxedBarcodeSource(
parser: DmuxedIterable,
rowPolicy: BarcodePolicy,
umiPolicyOpt: Option[BarcodePolicy],
colBarcodeLength: Int
) extends CloseableIterable[Barcodes] {

// used to attempt to parse barcodes out of ids if the file has no associated barcode
private val colBarcodeParser = Dmuxed.barcodeFromId(colBarcodeLength)

private def colBarcodeOpt = parser.indexBarcode

Expand All @@ -20,7 +27,7 @@ final class DmuxedBarcodeSource(parser: DmuxedIterable, rowPolicy: BarcodePolicy
val nextRead = iterator.next()
val rowBarcodeOpt = rowPolicy.find(nextRead)
val umiBarcodeOpt = umiPolicyOpt.flatMap(_.find(nextRead))
Barcodes(rowBarcodeOpt, None, colBarcodeOpt, umiBarcodeOpt)
Barcodes(rowBarcodeOpt, None, colBarcodeOpt.orElse(colBarcodeParser(nextRead.id)), umiBarcodeOpt)
}

override def close(): Unit = iterator.close()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class DmuxedPairedEndBarcodeSource(
rowPolicy: BarcodePolicy,
revRowPolicy: BarcodePolicy,
umiPolicyOpt: Option[BarcodePolicy],
readIdCheckPolicy: ReadIdCheckPolicy
readIdCheckPolicy: ReadIdCheckPolicy,
colBarcodeLength: Int
) extends CloseableIterable[Barcodes] {

// the index barcode _is_ the column barcode; we get it from the row parser
Expand All @@ -24,6 +25,9 @@ class DmuxedPairedEndBarcodeSource(
private[this] class BarcodeIterator(rowIterator: CloseableIterator[Read], revRowIterator: CloseableIterator[Read])
extends CloseableIterator[Barcodes] {

// used to attempt to parse barcodes out of ids if the file has no associated barcode
private val colBarcodeParser = Dmuxed.barcodeFromId(colBarcodeLength)

final override def hasNext: Boolean = rowIterator.hasNext && revRowIterator.hasNext

final override def next(): Barcodes = {
Expand All @@ -33,7 +37,7 @@ class DmuxedPairedEndBarcodeSource(
val rowBarcodeOpt = rowPolicy.find(nextRow)
val revRowBarcodeOpt = revRowPolicy.find(nextRevRow)
val umiBarcodeOpt = umiPolicyOpt.flatMap(_.find(nextRow))
Barcodes(rowBarcodeOpt, revRowBarcodeOpt, colBarcodeOpt, umiBarcodeOpt)
Barcodes(rowBarcodeOpt, revRowBarcodeOpt, colBarcodeOpt.orElse(colBarcodeParser(nextRow.id)), umiBarcodeOpt)
}

final override def close(): Unit =
Expand Down
18 changes: 10 additions & 8 deletions src/main/scala/org/broadinstitute/gpp/poolq3/barcode/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ package object barcode {
config: PoolQInput,
rowBarcodePolicy: BarcodePolicy,
revRowBarcodePolicyOpt: Option[BarcodePolicy],
colBarcodePolicyOpt: Option[BarcodePolicy],
colBarcodePolicyOpt: Either[Int, BarcodePolicy],
umiBarcodePolicyOpt: Option[BarcodePolicy]
): CloseableIterable[Barcodes] =
(config.readsSource, revRowBarcodePolicyOpt, colBarcodePolicyOpt) match {
case (ReadsSource.Split(index, forward), None, Some(colBarcodePolicy)) =>
case (ReadsSource.Split(index, forward), None, Right(colBarcodePolicy)) =>
new TwoFileBarcodeSource(
parserFor(forward.toList),
parserFor(index.toList),
Expand All @@ -39,7 +39,7 @@ package object barcode {
umiBarcodePolicyOpt,
config.readIdCheckPolicy
)
case (ReadsSource.PairedEnd(index, forward, reverse), Some(revRowBarcodePolicy), Some(colBarcodePolicy)) =>
case (ReadsSource.PairedEnd(index, forward, reverse), Some(revRowBarcodePolicy), Right(colBarcodePolicy)) =>
new ThreeFileBarcodeSource(
parserFor(forward.toList),
parserFor(reverse.toList),
Expand All @@ -50,22 +50,24 @@ package object barcode {
umiBarcodePolicyOpt,
config.readIdCheckPolicy
)
case (ReadsSource.SelfContained(paths), None, Some(colBarcodePolicy)) =>
case (ReadsSource.SelfContained(paths), None, Right(colBarcodePolicy)) =>
new SingleFileBarcodeSource(parserFor(paths.toList), rowBarcodePolicy, colBarcodePolicy, umiBarcodePolicyOpt)
case (ReadsSource.Dmuxed(read1), _, _) =>
case (ReadsSource.Dmuxed(read1), _, Left(colBarcodeLength)) =>
new DmuxedBarcodeSource(
DmuxedIterable(read1.toList, parserFor(_).iterator),
rowBarcodePolicy,
umiBarcodePolicyOpt
umiBarcodePolicyOpt,
colBarcodeLength
)
case (ReadsSource.DmuxedPairedEnd(read1, read2), Some(revRowBarcodePolicy), _) =>
case (ReadsSource.DmuxedPairedEnd(read1, read2), Some(revRowBarcodePolicy), Left(colBarcodeLength)) =>
new DmuxedPairedEndBarcodeSource(
DmuxedIterable(read1.toList, parserFor(_).iterator),
DmuxedIterable(read2.toList, parserFor(_).iterator),
rowBarcodePolicy,
revRowBarcodePolicy,
umiBarcodePolicyOpt,
config.readIdCheckPolicy
config.readIdCheckPolicy,
colBarcodeLength
)
case _ =>
throw new IllegalArgumentException("Incompatible reads and barcode policy settings")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,12 @@ object DmuxedIterable {
val data2: List[(Option[String], List[Read])] = data.map { case (bco, seqs) =>
(bco, seqs.zipWithIndex.map { case (seq, i) => Read(i.toString, seq) })
}
new DmuxedIterableImpl(data2, CloseableIterator.ofList)
DmuxedIterable.forReads(data2)
}

/** Builds a [[DmuxedIterable]] from in-memory data, where each entry pairs an optional barcode string with a list of
  * already-constructed reads.
  *
  * Unlike callers that only supply sequences, this lets the caller control each read's ID.
  *
  * @param data
  *   groups of reads, each tagged with `Some(barcode)` or `None` (presumably `None` means the group has no associated
  *   barcode - confirm against `DmuxedIterable.indexBarcode`)
  */
def forReads(data: List[(Option[String], List[Read])]): DmuxedIterable =
  new DmuxedIterableImpl[List[Read]](data, CloseableIterator.ofList)

private class DmuxedIterableImpl[A](src: Iterable[(Option[String], A)], makeIterator: A => CloseableIterator[Read])
extends DmuxedIterable {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package org.broadinstitute.gpp.poolq3.barcode
import cats.syntax.all._
import munit.FunSuite
import org.broadinstitute.gpp.poolq3.parser.DmuxedIterable
import org.broadinstitute.gpp.poolq3.types.Read

class DmuxedBarcodeSourceTest extends FunSuite {

Expand All @@ -26,7 +27,7 @@ class DmuxedBarcodeSourceTest extends FunSuite {
)
)

val src = new DmuxedBarcodeSource(iterable, rowPolicy, None)
val src = new DmuxedBarcodeSource(iterable, rowPolicy, None, 8)
assertEquals(
src.toList,
List(
Expand All @@ -40,9 +41,26 @@ class DmuxedBarcodeSourceTest extends FunSuite {
)
}

test("barcodes from read IDs") {
  // a group with no barcode of its own; every read ID ends in a 6-base sequence
  val unbarcodedGroup: (Option[String], List[Read]) =
    None -> List(Read("@eeeeee ACGTAA", "AAAAAAAAAA"), Read("@eeeeee ACTCAG", "CCCCCCCCCC"))

  // a group that carries its own barcode; these read IDs do not end in a barcode
  val barcodedGroup: (Option[String], List[Read]) =
    Some("AACCTG") -> List(Read("@a read", "GGGGGGGGGG"), Read("@another read", "TTTTTTTTTT"))

  val reads = DmuxedIterable.forReads(List(unbarcodedGroup, barcodedGroup))
  val source = new DmuxedBarcodeSource(reads, rowPolicy, umiPolicyOpt = None, colBarcodeLength = 6)

  val obtained = source.toList
  val expected = List(
    // first group: the expected barcode is the one at the end of each read ID
    fb("ACGTAA", "AAAAAAAAAA"),
    fb("ACTCAG", "CCCCCCCCCC"),
    // second group: the expected barcode is the group's own
    fb("AACCTG", "GGGGGGGGGG"),
    fb("AACCTG", "TTTTTTTTTT")
  )
  assertEquals(obtained, expected)
}

test("nothing works") {
val iterable = DmuxedIterable(Nil)
val src = new DmuxedBarcodeSource(iterable, rowPolicy, None)
val src = new DmuxedBarcodeSource(iterable, rowPolicy, None, 8)
assertEquals(src.toList, Nil)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ package org.broadinstitute.gpp.poolq3.barcode
import cats.syntax.all._
import munit.FunSuite
import org.broadinstitute.gpp.poolq3.parser.DmuxedIterable
import org.broadinstitute.gpp.poolq3.types.ReadIdCheckPolicy
import org.broadinstitute.gpp.poolq3.types.{Read, ReadIdCheckPolicy}

class DmuxedPairedEndBarcodeSourceTest extends FunSuite {

Expand All @@ -32,7 +32,7 @@ class DmuxedPairedEndBarcodeSourceTest extends FunSuite {

val iter2 = DmuxedIterable(List(None -> List("AGA", "CTC", "GAG"), Some("CTCGAG") -> List("TGT", "CAC", "TCT")))

val src = new DmuxedPairedEndBarcodeSource(iter1, iter2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Lax)
val src = new DmuxedPairedEndBarcodeSource(iter1, iter2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Lax, 8)
assertEquals(
src.toList,
List(
Expand All @@ -46,10 +46,31 @@ class DmuxedPairedEndBarcodeSourceTest extends FunSuite {
)
}

test("barcodes from read IDs") {
  // forward reads: one group without a barcode (IDs end in 6 bases) and one group tagged AACCTG
  val forwardGroups: List[(Option[String], List[Read])] = List(
    None -> List(Read("@eeeeee ACGTAA", "AAAA"), Read("@eeeeee ACTCAG", "CCCC")),
    Some("AACCTG") -> List(Read("@a read", "GGGG"), Read("@another read", "TTTT"))
  )

  // reverse reads: same IDs and grouping as the forward reads, with shorter sequences
  val reverseGroups: List[(Option[String], List[Read])] = List(
    None -> List(Read("@eeeeee ACGTAA", "AAA"), Read("@eeeeee ACTCAG", "CCC")),
    Some("AACCTG") -> List(Read("@a read", "GGG"), Read("@another read", "TTT"))
  )

  val forward = DmuxedIterable.forReads(forwardGroups)
  val reverse = DmuxedIterable.forReads(reverseGroups)

  val source = new DmuxedPairedEndBarcodeSource(
    forward,
    reverse,
    rowPolicy,
    revRowPolicy,
    umiPolicyOpt = None,
    readIdCheckPolicy = ReadIdCheckPolicy.Lax,
    colBarcodeLength = 6
  )

  val obtained = source.toList
  val expected = List(
    // unbarcoded group: the expected barcode is the one at the end of each read ID
    fb("ACGTAA", "AAAA", "AAA"),
    fb("ACTCAG", "CCCC", "CCC"),
    // barcoded group: the expected barcode is the group's own
    fb("AACCTG", "GGGG", "GGG"),
    fb("AACCTG", "TTTT", "TTT")
  )
  assertEquals(obtained, expected)
}

test("nothing works") {
val i1 = DmuxedIterable(Nil)
val i2 = DmuxedIterable(Nil)
val src = new DmuxedPairedEndBarcodeSource(i1, i2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Illumina)
val src = new DmuxedPairedEndBarcodeSource(i1, i2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Illumina, 8)
assertEquals(src.toList, Nil)
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright (c) 2022 The Broad Institute, Inc. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.broadinstitute.gpp.poolq3.barcode

import munit.FunSuite

/** Exercises `Dmuxed.barcodeFromId` against Illumina-style read IDs. */
class DmuxedTest extends FunSuite {

  // every case below looks for a barcode of exactly 8 bases at the end of the read ID
  private val parseBarcode: String => Option[FoundBarcode] = Dmuxed.barcodeFromId(8)

  test("extracting a barcode with Ns from an illumina read") {
    val readId = "@A01379:680:HC37HDRX3:1:2101:1163:1000 1:N:0:GGNGNANT"
    assertEquals(parseBarcode(readId), Some(FoundBarcode("GGNGNANT".toCharArray, 0)))
  }

  test("extracting a barcode with no Ns from an illumina read") {
    val readId = "@A01379:680:HC37HDRX3:1:2101:3224:1000 1:N:0:AAATGCGA"
    assertEquals(parseBarcode(readId), Some(FoundBarcode("AAATGCGA".toCharArray, 0)))
  }

  test("ignore a barcode-like sequence that's too long") {
    // 10 trailing bases: the character before the last 8 is itself a base, so nothing is extracted
    val readId = "@A01379:680:HC37HDRX3:1:2101:3224:1000 1:N:0:AAATGCGAGG"
    assertEquals(parseBarcode(readId), None)
  }

  test("ignore a barcode-like sequence that's too short") {
    // only 7 trailing bases follow the final ':'
    val readId = "@A01379:680:HC37HDRX3:1:2101:3224:1000 1:N:0:TGCGAGG"
    assertEquals(parseBarcode(readId), None)
  }

}

0 comments on commit ec90da6

Please sign in to comment.