diff --git a/DonorBamRead.py b/DonorBamRead.py
index 5a9d38d..b501ca1 100644
--- a/DonorBamRead.py
+++ b/DonorBamRead.py
@@ -2,88 +2,312 @@
class DonorBamRead:
- # Saves the required BAM read data.
- def __init__(self, readid, readpn, readchrom, readstart,
- readlen, readseq, readquals, mapqual):
+ """The DonorBamRead is used to save data from an aligned read extracted from a BAM or CRAM file.
+
+ DonorBamRead can be used to save the data from aligned reads that were extracted from a BAM or CRAM file.
+
+ Attributes
+ ----------
+ bam_read_id : str
+ Read identifier
+ bam_read_flag : int
+ The read bitwise flag value
+ bam_read_pairnum : str
+ Read pair number (1 or 2)
+ bam_read_chrom : str
+ Chromosome name the read is located on
+ bam_read_ref_pos : int
+ Leftmost genomic position of the read
+ bam_read_length : int
+ Read length
+ bam_read_end_pos : int
+ Read rightmost position (determined by alignment)
+ bam_read_cigar : str
+ Read CIGAR string
+ bam_read_rnext : str
+ Chromosome name of the read mate
+ bam_read_pnext : int
+ Leftmost genomic position of the read mate
+ bam_read_tlen : int
+ Read TLEN value
+ bam_read_seq : str
+ Read sequence
+ bam_read_qual : str
+ Read ASCII quality
+ bam_read_map_qual : int
+ Read MAPQ value
+ """
+
+ def __init__(self, readid, readflag, readpn, readchrom, readstart, readlen, readend, readcigar, readrnext,
+ readpnext, readtlen, readseq, readquals, readmapq):
+ """Saves all required data of an aligned read from a BAM or CRAM file.
+
+ Parameters
+ ----------
+ readid : str
+ The read identifier
+ readflag : str
+ Aligned read bitwise flag
+ readpn : str
+ The read pair number
+ readchrom : str
+ The chromosome name the read is located on
+ readstart : int
+ The leftmost genomic position of the mapped read
+ readlen : int
+ The total length of the read
+ readcigar : str
+ Aligned read CIGAR string
+ readrnext : str
+ Chromsome name the read mate is located on
+ readpnext : int
+ Leftmost genomic position of the read mate
+ readtlen : int
+ Aligned read TLEN value
+ readseq : str
+ The read sequence
+ reqdquals : str
+ The qualities of the read in ascii
+ mapqual : int
+ The MAPQ mapping quality
+ """
self.bam_read_id = readid
+ self.bam_read_flag = readflag
self.bam_read_pairnum = readpn
self.bam_read_chrom = readchrom
self.bam_read_ref_pos = readstart
self.bam_read_length = readlen
+ self.bam_read_end_pos = readend
+ self.bam_read_cigar = readcigar
+ self.bam_read_rnext = readrnext
+ self.bam_read_pnext = readpnext
+ self.bam_read_tlen = readtlen
self.bam_read_seq = readseq
self.bam_read_qual = readquals
- self.bam_read_map_qual = mapqual
+ self.bam_read_map_qual = readmapq
# ===METHODS TO GET SAVED DATA FROM THE DONORBAMREAD=======================
- # Returns the BAM read identifier.
def get_bam_read_id(self):
+ """Returns the identifier of the read.
+
+ Returns
+ -------
+ self.bam_read_id : str
+ The read identifier
+ """
return self.bam_read_id
- # Returns the BAM read pair number (1 or 2).
+ def get_bam_read_flag(self):
+ """Returns the read alignment flag.
+
+ Returns
+ -------
+ self.bam_read_flag : str
+ Read alignment flag
+ """
+
def get_bam_read_pair_number(self):
+ """Returns the pair number of the read.
+ This is 1 for forward, or R1, reads and 2 for reverse, or R2, reads.
+
+ Returns
+ -------
+ self.bam_read_pairnum : str
+ The read pair number
+ """
return self.bam_read_pairnum
- # Returns the BAM read chromosome.
def get_bam_read_chrom(self):
+ """Returns the name of the chromosome where the read has been mapped.
+
+ Returns
+ -------
+ self.bam_read_chrom : str
+ The chromosome name where the read is mapped
+ """
return self.bam_read_chrom
- # Returns the BAM read starting position on the reference sequence.
def get_bam_read_ref_pos(self):
+ """Returns the genomic mapping position (the left most position) of the read.
+
+ Returns
+ -------
+ self.bam_read_ref_pos : int
+ The read leftmost genomic position
+ """
return self.bam_read_ref_pos
- # Returns the BAM read length.
def get_bam_read_length(self):
+ """Returns the length of the read. In VaSeBuilder this length is calculated by means of the CIGAR string by
+ using the pysam method infer_read_length()
+
+ Returns
+ -------
+ self.bam_read_length : int
+ The length of the read
+ """
return self.bam_read_length
- # Returns the BAM read ending position on the reference (calculated
- # as starting position + the length of the read).
def get_bam_read_ref_end(self):
- if self.bam_read_length is not None:
- return self.bam_read_ref_pos + self.bam_read_length
- return -1
+ """Returns the rightmost genomic position of the read.
+
+ Returns
+ -------
+ self.bam_read_end_pos : int
+ The rightmost position of the read
+ """
+ return self.bam_read_end_pos
+
+ def get_bam_read_cigar(self):
+ """Returns the read alignment CIGAR string
+
+ Returns
+ -------
+ self.bam_read_cigar : str
+ Read alignment CIGAR string
+ """
+ return self.bam_read_cigar
+
+ def get_bam_read_rnext(self):
+ """Returns the chromosome name the read mate is located on.
+
+ Returns
+ -------
+ self.bam_read_rnext : str
+ Chromosome name the read mate is located on
+ """
+ return self.bam_read_rnext
+
+ def get_bam_read_pnext(self):
+ """Returns the leftmost genomic position of the read mate.
+
+ Returns
+ -------
+ self.bam_read_pnext : ibnt
+ Leftmost genomic position of the read mate
+ """
+ return self.bam_read_pnext
+
+ def get_bam_read_tlen(self):
+ """Returns the read alignment TLEN value.
+
+ Returns
+ -------
+ self.bam_read_tlen : int
+ Read alignment TLEN value
+ """
- # Returns the BAM read sequence.
def get_bam_read_sequence(self):
+ """Returns the read sequence as a String.
+
+ Returns
+ -------
+ self.bam_read_seq : str
+ The sequence of the read
+ """
return self.bam_read_seq
- # Returns the BAM read quality scores.
def get_bam_read_qual(self):
+ """Returns the read quality.
+
+ Returns
+ -------
+ self.bam_read_qual : str
+ Read quality
+ """
return self.bam_read_qual
- # Returns the BAM read quality as an array of Q-Scores.
def get_bam_read_q_scores(self):
+ """Converts the String of ASCII quality scores to a list Q-Scores. The Q-Score for each ASCII quality character
+ are calculated by obtaining the unicode code point and subtracting 33.
+
+ Returns
+ -------
+ qscores : list of int
+ Q-Score of the read
+ """
qscores = []
for qualSymbol in self.bam_read_qual:
qscores.append(ord(qualSymbol)-33)
return qscores
- # Returns the maping quality of the BAM read.
def get_mapping_qual(self):
+ """Returns the mapping quality that was assigned to the read.
+
+ Returns
+ -------
+ self.bam_read_map_qual : int
+ The mapping quality (MAPQ)
+ """
return self.bam_read_map_qual
+ def read_has_hardclipped_bases(self):
+ """Returns whether the read has hardclipped bases.
+
+ The CIGAR string of the read is checked whether it contains an 'H' (Hardclipped bases).
+
+ Returns
+ -------
+ bool
+ True if read CIGAR contains 'H', False if not
+ """
+ if self.bam_read_cigar is not None:
+ return "H" in self.bam_read_cigar
+ return False
+
# ===METHOD TO GET STATISTICS DATA FROM THE DONORBAMREAD===================
- # Returns the average Q-Score.
def get_average_qscore(self):
+ """Calculates and returns the mean Q-Score of the read.
+
+ Returns
+ -------
+ float
+ The mean Q-Score
+ """
qscores = self.get_bam_read_q_scores()
return statistics.mean(qscores)
- # Returns the median Q-Score.
def get_median_qscore(self):
+ """Calculates and returns the median Q-Score of the read.
+
+ Returns
+ -------
+ int
+ The median Q-Score of the read
+ """
qscores = self.get_bam_read_q_scores()
return statistics.median(qscores)
# ===METHODS TO CHECK WHETHER THE BAM READ IS R1 OR R2=====================
- # Returns if the BAM read is the first (forward) read.
def is_read1(self):
+ """Returns whether the read is the forward/R1 read by checking if the pair number is set to '1'.
+
+ Returns
+ -------
+ bool
+ True if the read has pair number 1, otherwise False
+ """
return self.bam_read_pairnum == "1"
- # Returns if the BAM read is the second (reverse) read.
def is_read2(self):
+ """Returns whether the read is the reverse/R2 read by checking if the pair number is set to '2'.
+
+ Returns
+ -------
+ bool
+ True if the read has pair number 2, otherwise False
+ """
return self.bam_read_pairnum == "2"
# ===METHODS TO RETURN A STRING REPRESENTATION OF THE DONORBAMREAD OBJECT==
- # Returns a String representation.
def to_string(self):
+ """Returns a String with all the saved data, separated by tabs, of the read.
+
+ Returns
+ -------
+ str
+ String representation of the read.
+ """
return (str(self.bam_read_id) + "\t"
+ str(self.bam_read_pairnum) + "\t"
+ str(self.bam_read_chrom) + "\t"
@@ -93,8 +317,17 @@ def to_string(self):
+ str(self.bam_read_qual) + "\t"
+ str(self.bam_read_map_qual))
- # Returns the BAM read as a fastq sequence.
def get_as_fastq_seq(self, addpairnum=False):
+ """Returns the read as a FastQ entry.
+
+ If the 'addpairnum' argument is set to True the read pair number will be added at the end of the read
+ identifier as '/1' or '/2', depending on whether the read is the forward/R1 or reverse/R2 read.
+
+ Returns
+ -------
+ str
+ The read as FastQ entry
+ """
if addpairnum:
return ("@" + str(self.bam_read_id) + "/" + str(self.bam_read_pairnum) + "\n"
+ str(self.bam_read_seq) + "\n"
@@ -105,3 +338,16 @@ def get_as_fastq_seq(self, addpairnum=False):
+ str(self.bam_read_seq) + "\n"
+ "+\n"
+ str(self.bam_read_qual) + "\n")
+
+ def get_as_bam_read(self):
+ """Returns the read as a SAM/BAM/CRAM file entry.
+
+ Returns
+ -------
+ bamcram_entry : str
+ Read as SAM/BAM/CRAM entry
+ """
+ bamcram_entry = f"{self.bam_read_id}\t{self.bam_read_flag}\t{self.bam_read_chrom}\t{self.bam_read_ref_pos}" \
+ f"\t{self.bam_read_map_qual}t{self.bam_read_cigar}\t{self.bam_read_rnext}\t{self.bam_read_pnext}" \
+ f"\t{self.bam_read_tlen}\t{self.bam_read_seq}\t{self.bam_read_qual}"
+ return bamcram_entry
diff --git a/OverlapContext.py b/OverlapContext.py
index a65e106..e0863ca 100644
--- a/OverlapContext.py
+++ b/OverlapContext.py
@@ -2,10 +2,52 @@
class OverlapContext:
- # Saves the data associated with the context overlapping with the
- # variant.
+ """The OverlapContext saves an acceptor or donor context.
+
+ The OverlapContext can be used to save an acceptor or donor context with reads that overlap with the variant
+ directly.
+
+ Attributes
+ ----------
+ context_id : str
+ Identifier of the context
+ sample_id : str
+ Sample name/identifier the context was derived from
+ context_chrom : str
+ The chromosome name that the context is located on
+ context_origin : int
+ The variant genomic position that constructed the context
+ context_start : int
+ The leftmost genomic position of the context
+ context_end : int
+ The rightmost genomic position of the context
+ context_bam_reads : list of DonorBamRead
+ List of aligned reads
+ unmapped_read_mate_ids : list of str
+ List of read identifiers that have unmapped mates
+ """
+
def __init__(self, variantid, sampleid, ovconchrom, ovconorigin,
ovconstart, ovconend, bamreads):
+ """Saves the required data for an acceptor or donor context.
+
+ Parameters
+ ----------
+ variantid : str
+ Variant context identifier
+ sampleid : str
+ Sample name/identifier
+ ovconchrom : str
+ Chromosome name the context is located on
+ ovconorigin : int
+ Variant genomic position the context is constructed from
+ ovconstart : int
+ Leftmost genomic position of the context
+ ovconend : int
+ Rightmost genomic position of the context
+ bamreads : list of DonorBamRead
+ Reads and read mates overlapping with the context
+ """
self.context_id = variantid
self.sample_id = sampleid
self.context_chrom = ovconchrom
@@ -16,135 +58,307 @@ def __init__(self, variantid, sampleid, ovconchrom, ovconorigin,
self.unmapped_read_mate_ids = []
# ===METHODS TO GET DATA OF THE OVERLAP CONTEXT============================
- # Returns the context identifier.
+ def get_context(self):
+ """Returns the essential data of the context.
+
+ This data consists of the chromosome name the context is located on, the variant position the context is based
+ on and the leftmost (start) and rightmost (end) genomic position of the context.
+
+ Returns
+ -------
+ list
+ List of essential context data
+ """
+ return [self.context_chrom, self.context_origin, self.context_start, self.context_end]
+
def get_context_id(self):
+ """Returns the context identifier.
+
+ Returns
+ -------
+ self.context_id : str
+ The context identifier
+ """
return self.context_id
- # Returns the sample identifier the context was based on.
def get_sample_id(self):
+ """Returns the identifier of the sample the context is
+
+ Returns
+ -------
+ self.sample_id : str
+ The sample name/identifier
+ """
return self.sample_id
- # Returns the context chromosome name the context is located on.
def get_context_chrom(self):
+ """Returns the chromosome name that the context is located on.
+
+ Returns
+ -------
+ self.context_chrom : str
+ The context chromosome name
+ """
return self.context_chrom
- # Returns the origin position of the context.
def get_context_origin(self):
+ """Returns the the position of the variant that was used to construct the context.
+
+ Returns
+ -------
+ self.context_origin : int
+ Variant position the context is based on
+ """
return self.context_origin
- # Returns the context start position.
def get_context_start(self):
+ """Returns the left most genomic position of the context.
+
+ Returns
+ -------
+ self.context_start : int
+ Context starting position
+ """
return self.context_start
- # Returns the end position of the context.
def get_context_end(self):
+ """Returns the right most genomic position of the context.
+
+ Returns
+ -------
+ self.context_end : int
+ Context ending position
+ """
return self.context_end
- # Returns the bam reads associated with the context as a list of
- # BamRead objects.
def get_context_bam_reads(self):
+ """Returns the list of reads and their mates overlapping with the context.
+
+ Mates of reads overlapping with the context might not overlap with the context themselves. Each read is returned
+ as a DonorBamRead object."""
return self.context_bam_reads
- # Returns the list of BAM read ids that have an unmapped mate.
def get_unmapped_read_mate_ids(self):
+ """Returns a list with identifiers of reads that have an unmapped mate.
+
+ Returns
+ -------
+ self.unmapped_read_mate_ids : list of str
+ List with read identifiers that have unmapped mates
+ """
return self.unmapped_read_mate_ids
- # ===METHODS TO GET DATA (REQUIRING SOME CALCULATION) FROM THE=============
- # ===OVERLAP CONTEXT===
- # Returns the lengt of the context.
+ # ===METHODS TO GET DATA (REQUIRING SOME CALCULATION) FROM THE OVERLAP CONTEXT=============
def get_context_length(self):
+ """Subtracts the left most genomic position of the context from the right most and returns the difference.
+
+ Returns
+ -------
+ int
+ Context length
+ """
return abs(self.context_end - self.context_start)
- # Returns the distance of the context start from the context origin.
def get_start_distance_from_origin(self):
+ """Subtracts the left most position of the context from the variant position and returns the difference.
+
+ Returns
+ -------
+ int
+ Context start distance from variant position
+ """
return abs(self.context_origin - self.context_start)
- # Returns the distance of the context end from the context origin.
def get_end_distance_from_origin(self):
+ """Subtracts the variant position from the rightmost position of the context and returns the difference.
+
+ Returns
+ -------
+ int
+ Context end distance form variant position
+ """
return abs(self.context_end - self.context_origin)
# ===METHODS TO OBTAIN CONTEXT READ INFORMATION============================
- # Returns the number of saved context reads.
def get_number_of_context_reads(self):
+ """Returns the number of reads associated with the context.
+
+ Returns
+ -------
+ int
+ Number of context reads
+ """
if self.context_bam_reads is None:
return 0
return len(self.context_bam_reads)
- # Returns a list of BAM read identifiers in the current context.
def get_context_bam_read_ids(self):
+ """Returns a list of the identifiers of reads associated with the context.
+
+ Returns
+ -------
+ list of str
+ Identifiers of all reads
+ """
if self.context_bam_reads is None:
return [None]
return list(set([x.get_bam_read_id() for x in self.context_bam_reads]))
- # Returns a list of all left positions for all BAM reads.
def get_context_bam_read_starts(self):
+ """Returns a list of all let most position of all reads associated with the context.
+
+ Returns
+ -------
+ list of int or None
+ Leftmost positions of all reads
+ """
if self.context_bam_reads is None:
return [None]
return [x.get_bam_read_ref_pos() for x in self.context_bam_reads]
- # Returns a list of all left positions for all R1 BAM reads.
def get_context_bam_read_left_positions(self):
+ """Returns a list with the left most positions for all forward/R1 reads associated with the context.
+
+ Returns
+ -------
+ list of int or None
+ Leftmost genomic positions of all R1 reads
+ """
if self.context_bam_reads is None:
return [None]
return [x.get_bam_read_ref_pos()
for x in self.context_bam_reads if (x.is_read1())]
- # Returns a list of BAM read ending positions for all BAM reads.
def get_context_bam_read_ends(self):
+ """Returns a list with the rightmost genomic positions of all reads associated with the context.
+
+ Returns
+ -------
+ list of int or None
+ Rightmost genomic positions of all reads
+ """
if self.context_bam_reads is None:
return [None]
return [x.get_bam_read_ref_end() for x in self.context_bam_reads]
- # Returns a list of all right positions for all R2 BAM reads.
def get_context_bam_read_right_positions(self):
+ """Returns a list with the rightmost genomic positions for all reverse/R2 reads associated with the context.
+
+ Returns
+ -------
+ list of int
+ Rightmost genomic positions of all reverse/R2 reads
+ """
if self.context_bam_reads is None:
return [None]
return [x.get_bam_read_ref_end()
for x in self.context_bam_reads if (x.is_read2())]
- # Returns a list of all lengths for all BAM reads.
def get_context_bam_read_lengths(self):
+ """Returns a list with the lengths of all reads associated with the context.
+
+ Returns
+ -------
+ list of int
+ Lengths of all reads
+ """
return [x.get_bam_read_length() for x in self.context_bam_reads]
- # Returns a list of BAM read sequences in the current context.
def get_context_bam_read_seqs(self):
+ """Returns a list with the sequences of all reads associated with the context.
+
+ Returns
+ -------
+ list of str
+ Sequences of all reads
+ """
return [x.get_bam_read_sequence() for x in self.context_bam_reads]
- # Returns a list of qualities of all BAM reads.
def get_context_bam_read_qualities(self):
+ """Returns a list with the qualities lines of all reads associated with the context.
+
+ Return
+ ------
+ list of str
+ Quality line for all reads
+ """
return [x.get_bam_read_qual() for x in self.context_bam_reads]
- # Returns a list of Q-scores of all BAM reads.
def get_context_bam_read_q_scores(self):
+ """Returns a list with the Q-Scores of all reads associated with the context.
+
+ The Q-Scores for each read is an array of integers. The returned list is therefore a list with lists.
+
+
+ Returns
+ -------
+ list
+ Lists of Q-Scores for all reads
+ """
return [x.get_bam_read_q_scores() for x in self.context_bam_reads]
- # Returns a list of all BAM read MapQ values.
def get_context_bam_read_map_qs(self):
+ """Collects and returns the list of MAPQ values of all reads.
+
+ Returns
+ -------
+ list of int
+ List with MAPQ values
+ """
return [x.get_mapping_qual() for x in self.context_bam_reads]
- # Returns whether a BAM read is in the context based on the provided
- # read identifier.
def read_is_in_context(self, readid):
+ """Checks whether a read specified by an id is associated with the context.
+
+ Returns
+ -------
+ bool
+ True if the read in the context, False if not
+ """
if self.context_bam_reads is None:
return False
return readid in self.get_context_bam_read_ids()
# ===METHODS TO ADD/SET CONTEXT DATA=======================================
- # Adds the read id of a BAM read with an unmapped mate.
def add_unmapped_mate_id(self, ureadid):
+ """Adds the identifier of a read with an unmapped mate to the context unmapped mates list.
+
+ Parameters
+ -------
+ ureadid : str
+ Identifier of the read with an unmapped mate
+ """
self.unmapped_read_mate_ids.append(ureadid)
- # Sets the list of unmapped read mate ids.
def set_unmapped_mate_ids(self, mateids):
+ """Sets the provided read id list as the unmapped mate id list associated with the context.
+
+ Parameters
+ ----------
+ mateids : list of str
+ Read identifiers with unmapped mates
+ """
self.unmapped_read_mate_ids = mateids
- # Returns whether a BAM read in the context has an unmapped mate.
def read_has_unmapped_mate(self, readid):
+ """Checks if a read specified by a read identifier has an unmapped mate and returns True or False.
+
+ Returns
+ -------
+ bool
+ True if read has unmapped mate, False if not
+ """
return readid in self.unmapped_read_mate_ids
# ===STATISTICS METHODS FOR A VARIANT CONTEXT==============================
- # Returns the average and median read length.
def get_average_and_median_read_length(self):
+ """Calculates and return the mean and median red length of all reads associated with the context.
+
+ Returns
+ -------
+ list of int
+ Mean and median read length
+ """
if self.context_bam_reads is None:
return [None, None]
avgmedlen = []
@@ -153,8 +367,14 @@ def get_average_and_median_read_length(self):
avgmedlen.append(contextread.get_bam_read_length())
return [statistics.mean(avgmedlen), statistics.median(avgmedlen)]
- # Returns the average and median read quality.
def get_average_and_median_read_qual(self):
+ """Calculates and returns the mean and median Q-Score of all reads associated with the context.
+
+ Returns
+ -------
+ list of int
+ Mean and median read Q-Score
+ """
if self.context_bam_reads is None:
return [None, None]
avgmedqual = []
@@ -162,8 +382,14 @@ def get_average_and_median_read_qual(self):
avgmedqual.append(contextread.get_average_qscore())
return [statistics.mean(avgmedqual), statistics.median(avgmedqual)]
- # Returns the average and median read MapQ of this variant context.
def get_average_and_median_read_map_q(self):
+ """Calculates the mean and median MAPQ value of all reads associated with the context.
+
+ Returns
+ -------
+ list of int
+ Mean and median read MAPQ
+ """
if self.context_bam_reads is None:
return [None, None]
avgmedmapq = []
@@ -172,8 +398,14 @@ def get_average_and_median_read_map_q(self):
return [statistics.mean(avgmedmapq), statistics.median(avgmedmapq)]
# ===SOME OTHER METHODS====================================================
- # Returns a string representation of the overlap context.
def to_string(self):
+ """Assembles and returns a String representation of the context.
+
+ Returns
+ -------
+ str
+ String representation of the context
+ """
if self.context_bam_reads is None:
bamids = None
countbamreads = 0
@@ -189,8 +421,14 @@ def to_string(self):
+ str(countbamreads) + "\t"
+ str(bamids))
- # Returns a statistics string representation of the overlap context.
def to_statistics_string(self):
+ """Calculates some basic statistics of the context in a tab separated String.
+
+ Returns
+ -------
+ str
+ Line with statistics to write to file
+ """
avgmedlens = self.get_average_and_median_read_length()
avgmedquals = self.get_average_and_median_read_qual()
avgmedmapq = self.get_average_and_median_read_map_q()
@@ -198,9 +436,23 @@ def to_statistics_string(self):
f"{avgmedquals[0]}\t{avgmedquals[1]}\t{avgmedmapq[0]}\t"
f"{avgmedmapq[1]}")
- # Compares the current OverlapContext to another OverlapContext and
- # returns the differences.
def compare(self, other_overlap_context):
+ """Compares the current context to a provided context and returns the differences.
+
+ The context is compared to another context on each aspect. If they differ, the difference is saved in a
+ dictionary. The key is a numeric value, the difference an array. The first entry is the value of the current
+ context, the second entry is the value of the other context.
+
+ Parameters
+ ----------
+ other_overlap_context : OverlapContext
+ Acceptor/Donor context to compare this acceptor/donor context with
+
+ Returns
+ -------
+ differences : dict
+ Set of differences between the compared acceptor/donor contexts.
+ """
differences = {}
if self.context_id != other_overlap_context.get_context_id():
differences[1] = [self.context_id,
diff --git a/ParamChecker.py b/ParamChecker.py
index 3b2d568..e272c92 100644
--- a/ParamChecker.py
+++ b/ParamChecker.py
@@ -4,11 +4,46 @@
class ParamChecker:
- # Constructor that creates two empty arrays that
+ """Checks and saves the values of provided parameters by the user.
+
+ Attributes
+ ----------
+ vaselogger : Logger
+ Logs VaSeBuilder activity
+ vcf_filelist : str
+ Path to the variant list file
+ bam_filelist : str
+ Path to the alignment list file
+ acceptorbam : str
+ Path to the alignment file to use as acceptor
+ fastq_in1 : list of str
+ Paths to R1 fastq files to us as template
+ fastq_in2 : list of str
+ Paths to R2 fastq files to use as template
+ outdir : str
+ Path to directory to write output files to
+ reference_file : str
+ Path to the genome reference in fasta format
+ fastq_out_location : str
+ Path to write the validation fastq files to
+ varcon_out_location : str
+ Output name for the variant context file
+ log_location : str
+ Output path to write the log file to
+ variantlist_location : str
+ runmode : str
+ The mode to run VaSeBuilder in
+ varconin : str
+ Path to a variant context input file
+ donorfqlist : str
+ required_mode_parameters : dict
+ Contains the required parameters for each runmode
+ """
+
def __init__(self):
self.vaselogger = logging.getLogger("VaSe_Logger")
- self.vcf_filelist = []
- self.bam_filelist = []
+ self.vcf_filelist = ""
+ self.bam_filelist = ""
self.acceptorbam = ""
self.fastq_in1 = ""
self.fastq_in2 = ""
@@ -21,9 +56,8 @@ def __init__(self):
self.runmode = ""
self.varconin = ""
self.donorfqlist = ""
- self.required_mode_parameters = {"A": ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin",
- "out"],
- "AC": ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin",
+ self.random_seed = 2
+ self.required_mode_parameters = {"AC": ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin",
"out"],
"D": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"],
"DC": ["runmode", "donorvcf", "donorbam", "out", "reference",
@@ -35,13 +69,24 @@ def __init__(self):
"P": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"],
"PC": ["runmode", "donorvcf", "donorbam", "out", "reference",
"varconin"],
- "X": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"],
- "XC": ["runmode", "donorvcf", "donorbam", "out", "reference",
- "varconin"]}
- self.optional_parameters = ["fastqout", "varcon", "variantlist"]
+ "X": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"]}
+ self.optional_parameters = ["fastqout", "varcon", "variantlist", "seed"]
- # Checks whether the required parameters for a run mode have been set.
def required_parameters_set(self, runmode, vase_arg_vals):
+ """Checks and returns whether all required run mode parameters have been set.
+
+ Parameters
+ ----------
+ runmode : str
+ Runmode to check parameters for
+ vase_arg_vals : dict
+ Dictionary with parameters values
+
+ Returns
+ -------
+ bool
+ True if all parameters are correct, False if not
+ """
if runmode in self.required_mode_parameters:
for reqparam in self.required_mode_parameters[runmode]:
if vase_arg_vals[reqparam] is None:
@@ -53,15 +98,37 @@ def required_parameters_set(self, runmode, vase_arg_vals):
self.vaselogger.debug("Invalid run mode selected.")
return False
- # Checks whether the optional parameters have been set or not. If not, they will be set to None
def optional_parameters_set(self, vase_arg_vals):
+ """Checks and sets optional parameters if they have not been set.
+
+ Parameters
+ ----------
+ vase_arg_vals:
+ Dictionary with parameter values
+
+ Returns
+ -------
+ vase_arg_vals : dict
+ Updated dictionary with parameter values
+ """
for optparam in self.optional_parameters:
if optparam not in vase_arg_vals:
vase_arg_vals[optparam] = None
return vase_arg_vals
- # Check the logging parameter to determine where to write the logfile to.
def check_log(self, logparam):
+ """Checks the log parameter value and returns a proper output location to write the log file to.
+
+ Parameters
+ ----------
+ logparam : str
+ The parameter value for the log file
+
+ Returns
+ -------
+ self.log_location : str
+ Output path to write the log file to
+ """
logloc = "VaSeBuilder.log"
if logparam is not None:
@@ -77,8 +144,24 @@ def check_log(self, logparam):
self.log_location = logloc
return self.log_location
- # Checks whether provided folders exist.
def check_folders_exist(self, paramvals, file_exts):
+ """Checks and returns whether folder containing specified file types exist.
+
+ For each provided folder, it checks whether the folder contains at least one file with the specified file type.
+ If so, the folder is added to a list.
+
+ Parameters
+ ----------
+ paramvals : list of str
+ Folders to check for containing specified file type
+ file_exts : str
+ Extension of file types to check for
+
+ Returns
+ -------
+ existing_folders : list of str
+ Folders containing files with the specified extension
+ """
existing_folders = []
# Check whether the provided folders exist.
@@ -103,8 +186,20 @@ def check_folders_exist(self, paramvals, file_exts):
existing_folders.append(foldername)
return existing_folders
- # Checks whether at least one file with a provided extension (.vcf or .bam) is present.
def check_folder_contents(self, folder_to_check, file_exts):
+ """Counts and returns the number of files with a specified extension in a specified folder.
+
+ Parameters
+ ----------
+ folder_to_check:
+ Folder to check for files with
+ file_exts:
+ File extension to check files on
+ Returns
+ -------
+ vb_count : int
+ Number of specified files in folder
+ """
vb_count = 0
for vbfile in os.listdir(folder_to_check):
if vbfile.endswith(file_exts):
@@ -113,8 +208,19 @@ def check_folder_contents(self, folder_to_check, file_exts):
f"{file_exts[0][1:4].upper()} files")
return vb_count
- # Checks whether a provided file exists.
def check_file_exists(self, fileloc):
+ """Checks and returns whether one or more files exist.
+
+ Parameters
+ ----------
+ fileloc : str or list of str
+ Path to file or list of paths to files
+
+ Returns
+ -------
+ bool
+ True if file(s) exists.
+ """
if type(fileloc) == str:
fileloc = [fileloc]
for this_file in fileloc:
@@ -124,14 +230,36 @@ def check_file_exists(self, fileloc):
self.vaselogger.debug("Files all exist.")
return True
- # Return the directory name of an output location.
def is_valid_output_location(self, outfilename):
+ """Checks and returns whether the output location is valid.
+
+ Parameters
+ ----------
+ outfilename : str
+ Output file path to check
+
+ Returns
+ -------
+ """
return os.path.isdir(os.path.dirname(outfilename))
- # Checks whether the values of the parameters are correct (do files/folders exist for example).
- # [Function should perhaps be split into smaller functions]
def check_parameters(self, vase_arg_vals):
-
+ """Checks and returns whether the set parameters are ok.
+
+ For parameters with paths to input files, it is checked whether the file exists. For the output directory it is
+ checked whether the folder exists. Optional parameters, if in the parameter value set, are set to a default
+ value if none is given.
+
+ Parameters
+ ----------
+ vase_arg_vals : dict
+ Parameter values
+
+ Returns
+ -------
+ bool
+ True if set parameters are correct, False if not
+ """
# Loop over the provided parameters.
for param in vase_arg_vals:
self.vaselogger.debug(f"Checking parameter {param}")
@@ -217,7 +345,7 @@ def check_parameters(self, vase_arg_vals):
self.vaselogger.warning("Variant list parameter used but supplied variant list file "
f"{vase_arg_vals[param]} does not exist")
- # Checks if the provided donor fastq list file.
+ # Checks if the provided donor fastq list file exists.
if param == "donorfastqs":
if vase_arg_vals[param] is not None:
if os.path.isfile(vase_arg_vals[param]):
@@ -225,10 +353,36 @@ def check_parameters(self, vase_arg_vals):
else:
self.vaselogger.warning("Donor fastqs parameter used but supplied donorfastq list file"
f"{vase_arg_vals[param]} does not exist")
+
+ # Checks if the provided seed value is an integer or float
+ if param == "seed":
+ if vase_arg_vals[param] is not None:
+ if type(vase_arg_vals[param]) is int or type(vase_arg_vals[param]) is float:
+ self.random_seed = vase_arg_vals[param]
+ else:
+ self.random_seed = 2
return True
# Only checks whether the required runmode parameters are ok using 'check_parameters()'
def check_required_runmode_parameters(self, runmode, vaseargvals):
+ """Checks and returns whether all required run modes parameters are set and correct.
+
+ The parameter value set is first subsetted to only include all required and optional parameters. For run mode
+ 'X', the 'templatefq1' parameter is not required and would therefore be filtered out by this method even if it
+ has been set.
+
+ Parameters
+ ----------
+ runmode : str
+ Selected run mode
+ vaseargvals : dict
+ Parameter values
+
+ Returns
+ -------
+ bool
+ True if all required parameters are ok, False if not
+ """
if runmode in self.required_mode_parameters:
required_params = self.required_mode_parameters[runmode]
@@ -239,14 +393,46 @@ def check_required_runmode_parameters(self, runmode, vaseargvals):
return self.check_parameters(filtered_vaseargvals)
return False
- # Returns the name of the folder name of a parameter value (if the parameter value is ).
def get_folder_name(self, foldername):
+ """Returns the name of the folder for a given path.
+
+ If the provided path is a folder the the provided value is returned. If the provided path is a file, the path to
+ the parent folder is returned.
+
+ Parameters
+ ----------
+ foldername : str
+ Path to a valid folder
+
+ Returns
+ -------
+ str
+ Foldername parameter if foldername is a folder, the path to the parent folder if foldername is a file
+ """
if os.path.isfile(foldername) or (not os.path.isdir(foldername)):
return os.path.dirname(foldername)
return foldername
# Returns the name of an output file (is used for parameters fastqout, varcon, donorbread and acceptorbread).
def get_output_name(self, outfilename, defaultoutname):
+ """Checks and returns the name of an output file.
+
+ The output name is first checked whether it is a path. If so, only the last part (the filename) is returned. If
+ no filename or path has been provided, the provided default output name is returned. Only the name, and not
+ path, is returned as the output folder to write to is determined by the 'out' parameter.
+
+ Parameters
+ ----------
+ outfilename : str
+ The output file name
+ defaultoutname : str
+ The default output name to provide if non has been set/given
+
+ Returns
+ -------
+ str
+ Returns the output name
+ """
if outfilename is not None:
if "/" in outfilename:
return outfilename.split("/")[-1]
@@ -255,60 +441,221 @@ def get_output_name(self, outfilename, defaultoutname):
return outfilename
return defaultoutname
- # Returns the list of valid VCF folders.
def get_valid_vcf_filelist(self):
+ """Returns the location of the text file containing a list of donor variant files to use.
+
+ Returns
+ -------
+ self.vcf_filelist : str
+ Path to the variant list file
+ """
return self.vcf_filelist
- # Returns the list of valid BAM folders.
def get_valid_bam_filelist(self):
+ """Returns the location of the text file containing a list of donor alignment files to use.
+
+ Returns
+ -------
+ self.bamfile_list : str
+ Path to the alignment list file
+ """
return self.bam_filelist
- # Returns the location of the acceptor BAM file location.
def get_acceptor_bam(self):
+ """Returns the location of the BAM/CRAM file that will be used as the acceptor.
+
+ Returns
+ -------
+ self.acceptorbam : str
+ Path to the alignment file to use as acceptor
+ """
return self.acceptorbam
- # Returns the location and name of the first (R1) fastq input file location.
def get_first_fastq_in_location(self):
+ """Returns the paths to R1 fastq files that serve as the template for the validation set.
+
+ Returns
+ -------
+ self.fastq_in1 : list of str
+ Paths to R1 template fastq files
+ """
return self.fastq_in1
- # Returns the location and name of the second (R2) fastq input file location.
def get_second_fastq_in_location(self):
+ """Returns the paths to R2 fastq files that serve as the template for the validation set.
+
+ Returns
+ -------
+ self.fastq_in2 : list of str
+ Paths to R2 template fastq files
+ """
return self.fastq_in2
- # Returns the location(s) and names of the two (R1 and R2) fastq input files.
def get_fastq_in_locations(self):
+ """
+
+ Returns
+ -------
+ list
+ """
return [self.fastq_in1, self.fastq_in2]
- # Returns the location to write the output to.
def get_out_dir_location(self):
+ """Returns the directory to write the output files. Adds '/' if not present in the ouput directory location.
+
+ Returns
+ -------
+ str
+ Path to the output directory
+ """
if self.outdir.endswith("/"):
return self.outdir
return self.outdir + "/"
- # Returns the location of the reference fasta file
def get_reference_file_location(self):
+ """Returns the location of the genomic fasta reference file.
+
+ Returns
+ -------
+ self.reference_file : str
+ Path to the genomic reference fasta file
+ """
return self.reference_file
# Returns the location of the FastQ file that will be produced by VaSeBuilder.
def get_fastq_out_location(self):
+ """Returns the fastq out location and suffix.
+
+ Before returning the path and output suffix, it is checked whether the path to the outdir location ends with a
+ '/'. If not this is added. The path and suffix is not a full output path. The full output path is constructed
+ when writing the new validation fastq files.
+
+ Returns
+ -------
+ str
+ Path and suffix for the fastq out files
+ """
if self.outdir.endswith("/"):
return self.outdir + self.fastq_out_location
return self.outdir + "/" + self.fastq_out_location
# Returns the location of file that will contain the variants and their context start and stops.
def get_variant_context_out_location(self):
+ """Constructs and returns the output path for the variant context file.
+
+ Returns
+ -------
+ str
+ Path to the variant context output location
+ """
if self.outdir.endswith("/"):
return self.outdir + self.varcon_out_location
return self.outdir + "/" + self.varcon_out_location
- # Retuns the location to write the log file(s) to.
def get_log_file_location(self):
+ """Returns the location the log file will be written to.
+
+ Returns
+ -------
+ self.log_location : str
+ Path to write the log file to
+ """
return self.log_location
- # Returns the variant list location
def get_variant_list_location(self):
+ """Returns the path to the file containing variants to use.
+
+ Returns
+ -------
+ self.variantlist_location : str
+ Path to the variant list file
+ """
return self.variantlist_location
- # Returns the location of the list file containing donor fastq files
def get_donorfqlist(self):
+ """Returns the location of the list file containing donor fastq files.
+
+ Returns
+ -------
+ self.donorfqlist : str
+ Path to the donor fastq list file
+ """
return self.donorfqlist
+
+ def get_runmode(self):
+ """Returns the value of the run mode parameter.
+
+ Returns
+ -------
+ self.runmode : str
+ The set runmode
+ """
+ return self.runmode
+
+ def get_random_seed_value(self):
+ """Returns the random seed value.
+
+ Returns
+ -------
+ self.random_seed : int or float
+ Value to use as seed for semi random distribution of donor reads of validation fastq files
+ """
+ return self.random_seed
+
+ def get_variantcontext_infile(self):
+ """Returns path to the variant context input file.
+
+ Returns
+ -------
+ self.varconin : str
+ Path to the variant context input file
+ """
+ return self.varconin
+
+ def get_required_runmode_parameters(self, runmode):
+ """Returns the required parameters for a specified run mode.
+
+ Parameters
+ ----------
+ runmode : str
+ The specified run mode
+
+ Returns
+ -------
+ list of str
+ List with the names of the required parameters
+ """
+ if runmode.upper() in self.required_mode_parameters:
+ return self.required_mode_parameters[runmode.upper()]
+ return []
+
+ def get_optional_parameters(self):
+ """Returns the optional parameters
+
+ Returns
+ -------
+ optional_parameters : list of str
+ The list of optional program parameters
+ """
+ return self.optional_parameters
+
+ def filter_provided_parameters(self, runmode, vase_parameters):
+ """Filters and returns a parameter set with only the required and optional parameters
+
+ Parameters
+ ----------
+ runmode : str
+ The specified run mode
+ vase_parameters : str
+ Dictionary with parameter values (not used)
+
+ Returns
+ -------
+ filtered_parameter_set : dict
+ Parameter setf
+ """
+ filtered_parameter_set = {}
+ runmode_reqparams = self.get_optional_parameters()
+ if runmode in self.required_mode_parameters:
+ runmode_reqparams.extend(self.required_mode_parameters[runmode])
+ return filtered_parameter_set
diff --git a/ReadIdObject.py b/ReadIdObject.py
index b057958..58f2742 100644
--- a/ReadIdObject.py
+++ b/ReadIdObject.py
@@ -1,10 +1,76 @@
class ReadIdObject:
+ """The ReadIdObject is used to save the read identifiers.
+
+ This class is used when a VariantContextFile is read. DonorBamRead can't be used as most data is missing.
+
+ Attributes
+ ----------
+ read_id : str
+ Identifier of the read
+ pair_number : str
+ Pair number of the read
+ """
def __init__(self, readid):
+ """Constructor saves the provided read identifier and sets the pairnumber to a default empty value."""
self.read_id = readid
+ self.pair_number = ""
- # Returns the name of the
def get_bam_read_id(self):
+ """Returns the identifier of the read.
+
+ Returns
+ -------
+ self.read_id : str
+ Identifier of the read
+ """
return self.read_id
def set_bam_read_id(self, readid):
+ """Sets the identifier of the read. Overwrites any already existing value.
+
+ Parameters
+ ----------
+ readid : str
+ The read identifier to set
+ """
self.read_id = readid
+
+ def get_bam_read_pair_number(self):
+ """Returns the read pair number of the read.
+
+ Returns
+ -------
+ self.pair_number : str
+ The read pair number
+ """
+ return self.pair_number
+
+ def set_bam_read_pair_number(self, pairnum):
+ """Sets the read pair number of the read. Overwrites any already existing value.
+
+ Parameters
+ ----------
+ pairnum : str
+ The pair number to set
+ """
+ self.pair_number = pairnum
+
+ def is_read1(self):
+ """Returns whether the read is the forward/R1 read.
+
+ Returns
+ -------
+ bool
+ True if read pair number is 1., False if not
+ """
+ return self.pair_number == "1"
+
+ def is_read2(self):
+ """Returns whether the read is the reverse/R2 read.
+
+ Returns
+ -------
+ bool
+ True if read pair number is 2, Fale if not
+ """
+ return self.pair_number == "2"
diff --git a/VaSeBuilder.py b/VaSeBuilder.py
index f788fc7..c5489e6 100644
--- a/VaSeBuilder.py
+++ b/VaSeBuilder.py
@@ -6,15 +6,31 @@
import numpy as np
import pysam
import time
+import random
# Import VaSe specific classes.
from VcfBamScanner import VcfBamScanner
from DonorBamRead import DonorBamRead
from VariantContextFile import VariantContextFile
from VcfVariant import VcfVariant
+from VariantContext import VariantContext
+from OverlapContext import OverlapContext
class VaSeBuilder:
+ """Creates the variant contexts, builds validation sets and writes the output files.
+
+ Attributes
+ ----------
+ vb_scanner : VcfBamScanner
+ Scans and extracts info from variant and alignment files
+ creation_id : str
+ Unique (uuid) identifier to identify VaSeBuilder runs
+ creation_time : str
+ The date and time of creation to identify VaSeBuilder runs
+ vaselogger : Logger
+ VaSeBuilder logger to log VaSeBuilder activity
+ """
# Constructor that saves the identifier, date, and time of the current run.
def __init__(self, vaseid):
self.vaselogger = logging.getLogger("VaSe_Logger")
@@ -22,11 +38,10 @@ def __init__(self, vaseid):
self.creation_id = str(vaseid)
self.creation_time = datetime.now()
self.vaselogger.info(
- f"VaSeBuilder: {self.creation_id} ; {self.creation_time}"
- )
+ f"VaSeBuilder: {self.creation_id} ; {self.creation_time}"
+ )
- # VariantContextFile that saves the acceptor, donor,
- # and variant contexts with their associated data.
+ # VariantContextFile that saves the acceptor, donor, and variant contexts with their associated data.
self.contexts = VariantContextFile()
# Dictionary used for debug messages.
@@ -40,6 +55,17 @@ def __init__(self, vaseid):
"car": "Gathering combined context acceptor reads",
"done": "Variant complete. Processing"}
+ self.cigar_tuple_table = {"M": 0,
+ "I": 1,
+ "D": 2,
+ "N": 3,
+ "S": 4,
+ "H": 5,
+ "P": 6,
+ "-": 7,
+ "X": 8,
+ "B": 9}
+
# Method to print debug messages.
def debug_msg(self, step, variant_id, t0=False):
process = self.debug_dict[step]
@@ -68,6 +94,30 @@ def build_varcon_set(self, sampleidlist,
variant locus, and read IDs per variant in an external file, along with
other relevant information. Optionally outputs additional statistics
files.
+
+ Parameters
+ ----------
+ sampleidlist : list of str
+ Sample names/identifiers to process
+ vcfsamplemap : dict
+ Variant files per sample
+ bamsamplemap : dict
+ Alignment files per sample
+ acceptorbamloc : str
+ Path to the alignment file to use as acceptor
+ outpath : str
+ Path to folderto write outut files to
+ reference_loc : str
+ Path to genome reference fasta file
+ varcon_outpath : str
+ Path and suffix to write variant context output file to
+ variant_list : str
+ Path to file containing variants to process
+
+ Returns
+ -------
+ self.contexts : dict
+ Variant contexts per variant context identifier
"""
self.vaselogger.info("Begin building the variant context file.")
donor_vcfs_used = []
@@ -76,9 +126,9 @@ def build_varcon_set(self, sampleidlist,
# Open the acceptor bam, or exit if this fails.
try:
acceptorbamfile = pysam.AlignmentFile(
- acceptorbamloc,
- reference_filename=reference_loc
- )
+ acceptorbamloc,
+ reference_filename=reference_loc
+ )
except IOError:
self.vaselogger.critical("Could not open acceptor BAM file. "
"Exitting.")
@@ -95,9 +145,9 @@ def build_varcon_set(self, sampleidlist,
if sampleid in variant_list:
sample_variant_filter = variant_list[sampleid]
samplevariants = self.get_sample_vcf_variants(
- vcfsamplemap[sampleid],
- sample_variant_filter
- )
+ vcfsamplemap[sampleid],
+ sample_variant_filter
+ )
# Skip this sample if all of its variants got filtered out.
if not samplevariants:
@@ -109,9 +159,9 @@ def build_varcon_set(self, sampleidlist,
# for this sample, or skip this samples if this fails.
try:
bamfile = pysam.AlignmentFile(
- bamsamplemap[sampleid],
- reference_filename=reference_loc
- )
+ bamsamplemap[sampleid],
+ reference_filename=reference_loc
+ )
self.vaselogger.debug("Opened BAM file "
f"{bamsamplemap[sampleid]}")
except IOError:
@@ -135,44 +185,35 @@ def build_varcon_set(self, sampleidlist,
vchr = vcfvar.get_variant_chrom()
vpos = vcfvar.get_variant_pos()
- # Determine whether the variant is SNP or indel,
- # based on the length of its alleles.
+ # Determine whether the variant is SNP or indel, based on the length of its alleles.
self.debug_msg("vw", variantid)
vtype = vcfvar.get_variant_type()
-
- self.vaselogger.debug(f"Variant {variantid} "
- f"determined to be a {vtype}.")
+ self.vaselogger.debug(f"Variant {variantid} determined to be a {vtype}.")
# Determine the length of the search window
# based on the length of the variant.
searchwindow = self.determine_read_search_window(vtype, vcfvar)
- self.vaselogger.debug(
- "Search window determined to be "
- f"{vchr}:{searchwindow[0]+1}-{searchwindow[1]}."
- )
+ self.vaselogger.debug(f"Search window determined to be {vchr}:{searchwindow[0]+1}-{searchwindow[1]}.")
- # Check if this window overlaps any already in-use contexts;
- # If so, skip this variant.
+ # Check if this window overlaps any already in-use contexts; If so, skip this variant.
if self.contexts.variant_is_in_context(vtype, vchr, *searchwindow):
- self.vaselogger.debug(f"Variant {variantid} is located in "
- "an existing variant context.")
+ self.vaselogger.debug(f"Variant {variantid} is located in an existing variant context.")
continue
# Begin fetching reads and determining contexts.
try:
# === DONOR =====================================
- # Obtain all donor BAM reads at the variant
- # position, as well as their mates.
+ # Obtain all donor BAM reads at the variant position, as well as their mates.
self.debug_msg("dr", variantid)
t0 = time.time()
donor_context_reads = (
- self.get_variant_reads(variantid,
- vchr,
- *searchwindow,
- bamfile,
- True,
- donor_unmapped)
- )
+ self.get_variant_reads(variantid,
+ vchr,
+ *searchwindow,
+ bamfile,
+ True,
+ donor_unmapped)
+ )
self.debug_msg("dr", variantid, t0)
# If no donor reads were found at this position,
@@ -181,7 +222,7 @@ def build_varcon_set(self, sampleidlist,
self.vaselogger.info(
f"No reads found for variant {variantid}"
f" in donor {sampleid}; Skipping."
- )
+ )
continue
# Determine the donor context based on
@@ -190,7 +231,7 @@ def build_varcon_set(self, sampleidlist,
t0 = time.time()
donor_context = (
self.determine_context(donor_context_reads, vpos, vchr)
- )
+ )
self.debug_msg("dc", variantid, t0)
# === ACCEPTOR ============================================
@@ -199,24 +240,24 @@ def build_varcon_set(self, sampleidlist,
self.debug_msg("ar", variantid)
t0 = time.time()
acceptor_context_reads = (
- self.get_variant_reads(variantid,
- vchr,
- *searchwindow,
- acceptorbamfile,
- True,
- acceptor_unmapped)
- )
+ self.get_variant_reads(variantid,
+ vchr,
+ *searchwindow,
+ acceptorbamfile,
+ True,
+ acceptor_unmapped)
+ )
self.debug_msg("ar", variantid, t0)
# Only perform the following if no reads were
# found in the acceptor at the variant location.
if not acceptor_context_reads:
self.vaselogger.warning(
- f"No reads found for variant {variantid} in "
- "acceptor. Acceptor and donor sequencing may "
- "have been performed with different methods. "
- "Proceeding anyway."
- )
+ f"No reads found for variant {variantid} in "
+ "acceptor. Acceptor and donor sequencing may "
+ "have been performed with different methods. "
+ "Proceeding anyway."
+ )
# Create a dummy list of reads.
acceptor_context_reads = None
# Temporarily set acceptor context equal to donor.
@@ -233,7 +274,7 @@ def build_varcon_set(self, sampleidlist,
self.determine_context(acceptor_context_reads,
vpos,
vchr)
- )
+ )
self.debug_msg("ac", variantid, t0)
# === COMBINED CONTEXT ====================================
@@ -243,20 +284,20 @@ def build_varcon_set(self, sampleidlist,
self.debug_msg("cc", variantid)
t0 = time.time()
variant_context = (
- self.determine_largest_context(vpos,
- acceptor_context,
- donor_context)
- )
+ self.determine_largest_context(vpos,
+ acceptor_context,
+ donor_context)
+ )
self.debug_msg("cc", variantid, t0)
# If this widest context overlaps an
# existing variant context, skip it.
if self.contexts.context_collision(variant_context):
self.vaselogger.debug(
- f"Variant context {variantid} overlaps "
- "with an already existing variant context; "
- "Skipping."
- )
+ f"Variant context {variantid} overlaps "
+ "with an already existing variant context; "
+ "Skipping."
+ )
continue
# Obtain all donor reads overlapping the
@@ -264,14 +305,14 @@ def build_varcon_set(self, sampleidlist,
self.debug_msg("cdr", variantid)
t0 = time.time()
variant_context_donor_reads = (
- self.get_variant_reads(variantid,
- variant_context[0],
- variant_context[2],
- variant_context[3],
- bamfile,
- True,
- varcon_unmapped_d)
- )
+ self.get_variant_reads(variantid,
+ variant_context[0],
+ variant_context[2],
+ variant_context[3],
+ bamfile,
+ True,
+ varcon_unmapped_d)
+ )
self.debug_msg("cdr", variantid, t0)
# Obtain all acceptor reads overlapping the
@@ -279,14 +320,14 @@ def build_varcon_set(self, sampleidlist,
self.debug_msg("car", variantid)
t0 = time.time()
variant_context_acceptor_reads = (
- self.get_variant_reads(variantid,
- variant_context[0],
- variant_context[2],
- variant_context[3],
- acceptorbamfile,
- True,
- varcon_unmapped_a)
- )
+ self.get_variant_reads(variantid,
+ variant_context[0],
+ variant_context[2],
+ variant_context[3],
+ acceptorbamfile,
+ True,
+ varcon_unmapped_a)
+ )
self.debug_msg("car", variantid, t0)
# If still no acceptor reads were found in the combined
@@ -297,45 +338,45 @@ def build_varcon_set(self, sampleidlist,
# Add the combined, donor, and acceptor contexts along
# with their reads to the current list of contexts.
self.contexts.add_variant_context(
- variantid,
- sampleid,
- *variant_context,
- variant_context_acceptor_reads,
- variant_context_donor_reads
- )
+ variantid,
+ sampleid,
+ *variant_context,
+ variant_context_acceptor_reads,
+ variant_context_donor_reads
+ )
self.contexts.add_donor_context(
- variantid,
- sampleid,
- *donor_context,
- donor_context_reads
- )
+ variantid,
+ sampleid,
+ *donor_context,
+ donor_context_reads
+ )
self.contexts.add_acceptor_context(
- variantid,
- sampleid,
- *acceptor_context,
- acceptor_context_reads
- )
+ variantid,
+ sampleid,
+ *acceptor_context,
+ acceptor_context_reads
+ )
# Add the read identifiers of reads with
# unmapped mates.
self.contexts.set_unmapped_acceptor_mate_ids(
- variantid,
- varcon_unmapped_a
- )
+ variantid,
+ varcon_unmapped_a
+ )
self.contexts.set_unmapped_donor_mate_ids(
- variantid,
- varcon_unmapped_d
- )
+ variantid,
+ varcon_unmapped_d
+ )
self.contexts.set_acceptor_context_unmapped_mate_ids(
- variantid,
- acceptor_unmapped
- )
+ variantid,
+ acceptor_unmapped
+ )
self.contexts.set_donor_context_unmapped_mate_ids(
- variantid,
- donor_unmapped
- )
+ variantid,
+ donor_unmapped
+ )
except IOError:
self.vaselogger.warning("Could not obtain BAM reads from "
@@ -357,7 +398,7 @@ def build_varcon_set(self, sampleidlist,
# Write the variant contexts to file.
self.vaselogger.info("Writing variant contexts to "
f"{varcon_outpath}")
- self.contexts.write_variant_context_file(varcon_outpath)
+ self.contexts.write_variant_context_file(varcon_outpath, self.creation_id)
# Write the variant context statistics file.
self.vaselogger.info("Writing variant context statistics to "
@@ -388,7 +429,7 @@ def build_varcon_set(self, sampleidlist,
if self.vaselogger.getEffectiveLevel() == 10:
self.write_optional_output_files(outpath, self.contexts)
acceptorbamfile.close()
- return
+ return self.contexts
def build_donor_from_varcon(self, varc_file, bamsamplemap, reference_loc):
"""Reads an existing variant context file and fetches donor reads from
@@ -398,6 +439,12 @@ def build_donor_from_varcon(self, varc_file, bamsamplemap, reference_loc):
set from a pre-built variant context file. Does not output a new
variant context file, and does not store any read information for the
acceptor except for read IDs per variant.
+
+ Parameters
+ ----------
+ varc_file
+ bamsamplemap
+ reference_loc
"""
# Reads in an existing variant context file.
self.contexts = VariantContextFile(varc_file)
@@ -433,7 +480,7 @@ def build_donor_from_varcon(self, varc_file, bamsamplemap, reference_loc):
context.variant_context_start,
context.variant_context_end,
bamfile)
- )
+ )
# Close the sample's BAM file when done.
bamfile.close()
return
@@ -450,7 +497,6 @@ def build_validation_set(self, run_mode, acceptor_bam,
Depending on mode, will either output combined acceptor/donor FastQs,
or donor-only FastQs.
"""
-
# Per-variant Fq output functionality.
if "P" in run_mode:
self.vaselogger.info(f"Begin writing variant FastQ files.")
@@ -470,8 +516,8 @@ def build_validation_set(self, run_mode, acceptor_bam,
if "F" in run_mode:
# Set up a set of all acceptor fastq reads to skip.
skip_list = set(
- self.contexts.get_all_variant_context_acceptor_read_ids()
- )
+ self.contexts.get_all_variant_context_acceptor_read_ids()
+ )
for i, fq_i in zip(["1", "2"], [fq1_in, fq2_in]):
# Write the fastq files.
@@ -488,8 +534,26 @@ def build_validation_set(self, run_mode, acceptor_bam,
self.vaselogger.info("Finished writing FastQ files.")
return
- # Checks whether a value is in a filter list (array or set)
def passes_filter(self, val_to_check, filter_to_use, is_exclude_filter=False):
+ """Checks whether a value is in a filter list.
+
+ A value can be checked against either an inclusion or exclusion filter. An inclusion filter should be the values
+ that should be included and used, an exclusion filter for values to be excluded and not used.
+
+ Parameters
+ ----------
+ val_to_check : str
+ Value to check against filter
+ filter_to_use : list of str
+ Values to use as filter
+ is_exclude_filter : bool
+ Is the filter an exclusion filter (false for inclusion filter)
+
+ Returns
+ -------
+ bool
+ True if value in an inclusive filter or not in an exclusive filter, False otherwise
+ """
if filter_to_use is not None:
if is_exclude_filter:
if val_to_check in filter_to_use:
@@ -503,6 +567,20 @@ def passes_filter(self, val_to_check, filter_to_use, is_exclude_filter=False):
# Returns a list of VcfVariant objects from a VCF file. A filter can be used to select specific variants.
def get_sample_vcf_variants(self, vcf_fileloc, filterlist=None):
+ """Reads and returns read variants from a variant file.
+
+ Parameters
+ ----------
+ vcf_fileloc : str
+ Path to variant file to read
+ filterlist : list of tuple
+ Variants to include
+
+ Returns
+ -------
+ sample_variant_list : list of VcfVariant
+ Read variants fro the variant file
+ """
sample_variant_list = []
try:
vcf_file = pysam.VariantFile(vcf_fileloc, "r")
@@ -517,8 +595,24 @@ def get_sample_vcf_variants(self, vcf_fileloc, filterlist=None):
finally:
return sample_variant_list
- # Returns whether a variant is a SNP or indel.
def determine_variant_type(self, vcfvariantref, vcfvariantalts):
+ """Determines and returns the variant type.
+
+ The type of variant is determined based on the lengths of the reference and alternative allele(s). Currently
+ only SNP or indel is returned aas variant type.
+
+ Parameters
+ ----------
+ vcfvariantref : str
+ Variant reference allele(s)
+ vcfvariantalts : tuple of str
+ Variant alternative allele(s)
+
+ Returns
+ -------
+ str
+ Variant type
+ """
# Determine the maximum reference allele length.
maxreflength = max([len(x) for x in vcfvariantref.split(",")])
# Determine the maximum alternative allele length.
@@ -532,9 +626,25 @@ def determine_variant_type(self, vcfvariantref, vcfvariantalts):
return "indel"
return "?"
- # Determines the start and end positions to use for searching reads
- # overlapping with the variant.
def determine_read_search_window(self, varianttype, vcfvariant):
+ """Determines and returns the search window for fetching reads.
+
+ The determined search window depends on the provided variant type. SNPs return a search window of SNP position
+ -1 and +1. Indels return a search window based indel lefmost position and length. If the variant type is
+ neither SNP or indel, the search window returned will be [-1, -1].
+
+ Parameters
+ ----------
+ varianttype : str
+ Variant type to determine search window for
+ vcfvariant : VcfVariant
+ Variant to determine search window with
+
+ Returns
+ -------
+ list of int
+ Search window start and stop, -1 and -1 if variant type is invalid
+ """
if varianttype == "snp":
return [vcfvariant.get_variant_pos() - 1, vcfvariant.get_variant_pos() + 1]
elif varianttype == "indel":
@@ -543,80 +653,232 @@ def determine_read_search_window(self, varianttype, vcfvariant):
vcfvariant.get_variant_alt_alleles())
return [-1, -1]
- # Returns the search start and stop to use for searching BAM reads
- # overlapping with the range of the indel.
+ # Returns the search start and stop to use for searching BAM reads overlapping with the range of the indel.
def determine_indel_read_range(self, variantpos, variantref, variantalts):
+ """Determines and returns the search start and stop to use for an indel.
+
+ Parameters
+ ----------
+ variantpos : int
+ Leftmost genomic position of the variant
+ variantref : str
+ Variant reference allele(s)
+ variantalts : tuple
+ Variant alternative allele(s)
+
+ Returns
+ -------
+ list of int
+ Search window start and stop
+ """
searchstart = variantpos
searchstop = variantpos + max(
- max([len(x) for x in variantref.split(",")]),
- max([len(x) for x in variantalts])
- )
+ max([len(x) for x in variantref.split(",")]),
+ max([len(x) for x in variantalts])
+ )
return [searchstart, searchstop]
- # Returns the BAM reads containing the specific vcf variant as well
- # as their read mate.
def get_variant_reads(self, contextid, variantchrom,
variantstart, variantend,
bamfile,
write_unm=False, umatelist=None):
- # Obtain all the variant reads overlapping with the variant and
- # their mate reads.
+ """Fetches and returns reads overlapping with a specified variant.
+
+ First reads overlapping directly with the variant position are fetched. Then the read mates are fetched using
+ the RNEXT and PNEXT values of each read. Lastly, it is ensured that each read only occurs once.
+
+ Parameters
+ ----------
+ contextid : str
+ Identifier of the context to fetch reads for
+ variantchrom : str
+ Chromosome name to fetch reads from
+ variantstart : int
+ Leftmost genomic position to use for fetching
+ variantend : int
+ Rightmost genomic position to use for fetching
+ bamfile : pysam.AlignmentFile
+ Already opened pysam AlignmentFile
+ write_unm : bool
+ Save read identifiers with an unmapped mate
+ umatelist : list of str
+ Identifiers of reads with an unmapped mate
+
+ Returns
+ -------
+ variantreads : list of DonorBamRead
+ Fetched reads and their read mates
+ """
+ hardclipped_read_num = 0
+ duplicate_read_num = 0
+ secondary_read_num = 0
+
+ read_objects = []
variantreads = []
- rpnext = {}
+ clipped_reads = []
+ list_r1 = []
+ list_r2 = []
for vread in bamfile.fetch(variantchrom, variantstart, variantend):
- variantreads.append(DonorBamRead(
- vread.query_name,
- self.get_read_pair_num(vread),
- vread.reference_name,
- vread.reference_start,
- vread.infer_read_length(),
- vread.get_forward_sequence(),
- "".join([chr((x + 33))
- for x in vread.get_forward_qualities()]),
- vread.mapping_quality
- ))
- rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name]
-
- # Obtain the read mate (this must be done after the initial fetch not during!)
- for read1 in rpnext.values():
- materead = self.fetch_mate_read(*read1, bamfile)
- if materead is not None:
- variantreads.append(materead)
- else:
- if write_unm:
- self.vaselogger.debug("Could not find mate for "
- f"{read1[2]} ; "
- "mate is likely unmapped.")
- umatelist.append(read1[2])
-
- # Make sure the list only contains each BAM read once (if a read
- # and mate both overlap with a variant, they have been added
- # twice to the list).
- uniq_variantreads = []
+ if vread.is_duplicate:
+ duplicate_read_num += 1
+ if vread.is_secondary:
+ secondary_read_num += 1
+ if self.read_is_hard_clipped(vread):
+ hardclipped_read_num += 1
+ clipped_reads.append(vread)
+ continue
+ read_objects.append(vread)
+
+ self.vaselogger.debug(f"Fetched {hardclipped_read_num} reads with hardclipped bases")
+ for clipped in clipped_reads:
+ read_objects.append(self.fetch_primary_from_secondary(clipped, bamfile))
+
+ for vread in read_objects:
+ if vread.is_read1:
+ list_r1.append(vread)
+ elif vread.is_read2:
+ list_r2.append(vread)
+
+ list_r1_ids = [x.query_name for x in list_r1]
+ list_r2_ids = [x.query_name for x in list_r2]
+
+ for r1 in list_r1:
+ if r1.query_name not in list_r2_ids:
+ list_r2.append(self.fetch_mate_read(r1.query_name, r1.next_reference_name, r1.next_reference_start,
+ self.get_read_pair_num(r1), bamfile))
+ for r2 in list_r2:
+ if r2.query_name not in list_r1_ids:
+ list_r1.append(self.fetch_mate_read(r2.query_name, r2.next_reference_name, r2.next_reference_start,
+ self.get_read_pair_num(r2), bamfile))
+
+ for vread in list_r1 + list_r2:
+ variantreads.append(DonorBamRead(vread.query_name, vread.flag, self.get_read_pair_num(vread),
+ vread.reference_name, vread.reference_start, vread.infer_read_length(),
+ vread.reference_end, vread.cigarstring, vread.next_reference_name,
+ vread.next_reference_start,
+ vread.template_length, vread.get_forward_sequence(),
+ "".join([chr((x + 33)) for x in vread.get_forward_qualities()]),
+ vread.mapping_quality))
+
+ variantreads = self.uniqify_variant_reads(variantreads)
+ return variantreads
+
+ def fetch_primary_from_secondary(self, secondary_read, bamfile):
+ """Fetches and returns the primary alignment of a read based on the position recorded in its SA tag.
+
+ The SA tag is read from a provided Pysam read object. Then, reads at the recorded alignment position are
+ fetched. The first non-secondary alignment with a matching read ID and pair number is returned.
+
+ Parameters
+ ----------
+ secondary_read : pysam.AlignedSegment
+ Secondary alignment whose primary alignment will be located
+ bamfile : pysam.AlignmentFile
+ Already opened pysam AlignmentFile
+
+ Returns
+ -------
+ primary_read : pysam.AlignedSegment
+ Primary alignment with the same read ID and pair number as the input secondary alignment.
+ """
+ primary_locus = secondary_read.get_tag("SA").split(",")[0:2]
+ for fetched in bamfile.fetch(primary_locus[0], int(primary_locus[1]) - 1, int(primary_locus[1])):
+ if ((fetched.query_name == secondary_read.query_name)
+ and (fetched.is_read1 == secondary_read.is_read1)
+ and not fetched.is_secondary):
+ primary_read = fetched
+ break
+ return primary_read
+
+# =============================================================================
+# def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umatelist=None):
+# """Fetches and returns read mates for a set of reads.
+#
+# Read mates are fetched from an already opened pysam alignment file using the RNEXT and PNEXT values as well as
+# the read identifier.
+#
+# Parameters
+# ----------
+# rpnextmap : dict
+# bamfile : pysam.AlignmentFile
+# Already opened pysam alingment file to fetch read mates from
+# variantreadlist : list of DonorBamRead
+# Reads to fetch read mates for
+# write_unm : bool
+# Write identifiers of reads with unmapped mates
+# umatelist : list of str
+# Identifiers with reads that have an unmapped mate
+#
+# Returns
+# -------
+# variantreadlist : list of DonorBamRead
+# Updated list of reads and the added read mates
+# """
+# hardclipped_read_num = 0
+# for read1 in rpnextmap.values():
+# materead = self.fetch_mate_read(*read1, bamfile)
+# if materead is not None:
+# if materead.read_has_hardclipped_bases():
+# hardclipped_read_num += 1
+# variantreadlist.append(materead)
+# else:
+# if write_unm:
+# self.vaselogger.debug(f"Could not find mate for read {read1[2]} ; mate is likely unmapped.")
+# umatelist.append(read1[2])
+# self.vaselogger.debug(f"Fetched {hardclipped_read_num} read mates with hardclipped bases")
+# return variantreadlist
+# =============================================================================
+
+ def uniqify_variant_reads(self, variantreads):
+ """Ensures each read only occurs once and returns the updates set.
+
+ Parameters
+ ----------
+ variantreads : list of DonorBamRead
+ Sets of reads to process
+
+ Returns
+ -------
+ unique_variantreads : list of DonorBamRead
+ Set of reads with each read occuring only once
+ """
+ unique_variantreads = []
checklist = []
for fetched in variantreads:
id_pair = (fetched.get_bam_read_id(), fetched.get_bam_read_pair_number())
if id_pair not in checklist:
- uniq_variantreads.append(fetched)
+ unique_variantreads.append(fetched)
checklist.append(id_pair)
- variantreads = uniq_variantreads
-
- # Filter to keep only read pairs.
- variantreads = self.filter_variant_reads(variantreads)
- self.vaselogger.debug(f"Found a total of {len(variantreads)} "
- "BAM reads.")
- return variantreads
-
- # Returns the mate read using the fetch() method
- def fetch_mate_read(self, rnext, pnext, readid, bamfile):
+ return unique_variantreads
+
+ def fetch_mate_read(self, readid, rnext, pnext, pair_num, bamfile):
+ """Fetches and returns the mate read of a specified read.
+
+ The mate read is fetched from an opened alignment file using the RNEXT and PNEXT value of the read. Of the
+ fetched reads, only the read with the same identifier is returned.
+
+ Parameters
+ ----------
+ rnext : str
+ Chromosome name the mate read is located on
+ pnext : int
+ Leftmost genomic position of the mate read
+ readid : str
+ Identifier of the read to search the mate for
+ bamfile : pysam.AlignmentFile
+ Already openend pysam alignment file
+
+ Returns
+ -------
+ DonorBamRead or None
+ The mate read if found, None if not
+ """
for bamread in bamfile.fetch(rnext, pnext, pnext + 1):
- if bamread.query_name == readid and bamread.reference_start == pnext:
- return DonorBamRead(bamread.query_name, self.get_read_pair_num(bamread), bamread.reference_name,
- bamread.reference_start, bamread.infer_read_length(),
- bamread.get_forward_sequence(),
- "".join([chr((x + 33)) for x in bamread.get_forward_qualities()]),
- bamread.mapping_quality)
+ if (bamread.query_name == readid
+ and bamread.reference_start == pnext
+ and self.get_read_pair_num(bamread) != pair_num):
+ return bamread
return None
# Filters the donor reads to keep only reads that occur twice.
@@ -624,14 +886,44 @@ def filter_variant_reads(self, bamreads):
return [bread for bread in bamreads
if (self.read_occurence(bread.get_bam_read_id(), bamreads) == 2)]
- # Returns the number of occurences of a certain read in the list of
- # BAM reads (should be two ideally).
def read_occurence(self, readid, readlist):
+ """Checks and returns the occurence of a specified read.
+
+ Parameters
+ ----------
+ readid : str
+ Identifier of the read to check
+ readlist : list of DonorBamRead
+ Set of reads to check occurence of read in
+
+ Returns
+ -------
+ int
+ Occurences of the specified read
+ """
return sum([bamread.get_bam_read_id() == readid for bamread in readlist])
- # Determines the start and stops of the variant context (please see
- # the documentation for more information).
def determine_context(self, contextreads, contextorigin, contextchr):
+ """Determines and returns an acceptor/donor context.
+
+ The acceptor/donor context is determined from a set of reads overlapping with the variant including their read
+ mates. The leftmost and rightmost genomic positions of all reads are collected and filtered for outliers.
+ The context start (leftmost genomic) position is then determined by taking minimum and maximum leftmost and
+ rightmost position respectively.
+
+ Parameters
+ ----------
+ contextreads: list of DonorBamRead
+ contextorigin : int
+ Variant genomic position the context will be based on
+ contextchr : str
+ Chromosomae name the context is located on
+
+ Returns
+ -------
+ list of str and int
+ Essential context data (chromosome, variant pos, start, end)
+ """
# Check whether there are reads to determine the context for.
if not contextreads:
return []
@@ -644,7 +936,7 @@ def determine_context(self, contextreads, contextorigin, contextchr):
stops = [conread.get_bam_read_ref_end()
for conread in contextreads
- if conread.get_bam_read_chrom() == contextchr]
+ if conread.get_bam_read_chrom() == contextchr and conread.get_bam_read_ref_end() is not None]
# Number of mates filtered for mapping to different chr.
num_diff_chr = len(contextreads) - len(starts)
@@ -666,15 +958,23 @@ def determine_context(self, contextreads, contextorigin, contextchr):
# Set variant context as chr, min start, max end.
contextstart = min(filtered_starts)
contextend = max(filtered_stops)
- self.vaselogger.debug("Context is "
- + str(contextchr) + ", "
- + str(contextstart) + ", "
- + str(contextend))
return [contextchr, contextorigin, contextstart, contextend]
- # Filters outliers from a list of start/stop positions using
- # Tukey's Fences method.
def filter_outliers(self, pos_list, k=3):
+ """Filters outliers from a list of start or stop positions and returns the filtered list.
+
+ Outlier start/stop positions are filtered from the list using Tukey's Fences method. For more info please see
+ https://en.wikipedia.org/wiki/Outlier#Tukey's_fences
+
+ Parameters
+ ----------
+ pos_list : list of int
+ Start/stop positions
+ k : int
+ Factor to determine outlier
+ Returns
+ -------
+ """
# First and third quartile values of the positions.
q1 = np.percentile(pos_list, 25)
q3 = np.percentile(pos_list, 75)
@@ -686,10 +986,24 @@ def filter_outliers(self, pos_list, k=3):
if ((q1 - (k * iq)) <= x <= (q3 + (k * iq)))]
return filtered
- # Determines the size of the variant context based on both the
- # acceptor and donor reads.
def determine_largest_context(self, contextorigin, acceptor_context,
donor_context):
+ """Determines the size of the variant context based on both the acceptor and donor reads.
+
+ Parameters
+ ----------
+ contextorigin : int
+ Variant position that the context is based on
+ acceptor_context : list of str and int
+ Essential acceptor context data
+ donor_context : list of str and int
+ Essential donor context data
+
+ Returns
+ -------
+ list of str and int
+ Context (chromosome, variant pos, start, end)
+ """
largest_context = [donor_context[0]]
largest_context.append(contextorigin)
# Determine and save the smallest context start.
@@ -767,9 +1081,20 @@ def write_vase_fastq(self, acceptor_infq, fastq_outpath,
exit()
def build_donor_fq(self, donorbamreaddata, fr, fastq_outpath):
+ """Build and writes a fastq file containing only donor reads.
+
+ Parameters
+ ----------
+ donorbamreaddata : list of tuple
+ Donor reads to write to fastq file
+ fr : str
+ Write forward ('1') or reverse ('2') fastq file
+ fastq_outpath : str
+ Path and name to write donor fastq file to
+ """
# Write the new VaSe FastQ file.
- vasefq_outname = self.set_fastq_out_path(fastq_outpath, fr, 1)
- fqgz_outfile = io.BufferedWriter(open(vasefq_outname, "wb"))
+ # vasefq_outname = self.set_fastq_out_path(fastq_outpath, fr, 1)
+ fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb"))
donorbamreaddata.sort(key=lambda dbr: dbr[0], reverse=False)
for bamread in donorbamreaddata:
@@ -783,29 +1108,72 @@ def build_donor_fq(self, donorbamreaddata, fr, fastq_outpath):
fqgz_outfile.flush()
fqgz_outfile.close()
- # Checks if a read is read 1 (R1) or read 2 (R2).
def is_required_read(self, bamread, fr):
+ """Checks and returns whether the current read is the one that is required.
+
+ When writing the validation fastq files only R1, or forward (F), reads should go to the R1 validation fastq
+ file. Similarly, the R2 reads should only go into the R2 validation fastq file.
+
+ Parameters
+ ----------
+ bamread : DonorBamRead
+ Read to check
+ fr : str
+ Whether the read is required for R1 or R2
+
+ Returns
+ -------
+ bool
+ True if the read is required, False if not
+ """
if fr == "F":
return bamread.is_read1()
return bamread.is_read2()
- # Returns the name for the fastq out file.
def set_fastq_out_path(self, outpath, fr, lnum):
+ """Sets and returns the fastq output path and filename.
+
+ Parameters
+ ----------
+ outpath : str
+ Path and suffix to write fastq file to
+ fr: str
+ Will the fastq file be R1 ('1') or R2 ('2')
+ lnum: int
+ Lane number (i.e.: 1, 2, 3 or 4)
+
+ Returns
+ -------
+ str
+ Full path to write fastq file to
+ """
if fr == "1":
return f"{outpath}_{datetime.now().date()}_L{lnum}_R1.fastq"
return f"{outpath}_{datetime.now().date()}_L{lnum}_R2.fastq"
# ===METHODS TO OBTAIN SOME DATA OF THE VASEBUILDER OBJECT=================
- # Returns the identifier of the current VaSeBuilder object.
def get_creation_id(self):
+ """Returns the identifier of the current VaSeBuilder object.
+
+ Returns
+ -------
+ self.creation_id : str
+ VaSeBuilder creation identifier
+ """
return self.creation_id
# Returns the date the current VaSeBuilder object has been made.
# def get_creation_date(self):
- # return self.creation_date
+ # return self.creation_date
- # Returns the time the current VaSeBuilder object has been made.
def get_creation_time(self):
+ """Returns the date and time the current VaSeBuilder object has been made.
+
+ Returns
+ -------
+ str
+ VaSeBuilder creation date and time
+ """
return self.creation_time
# Returns all variant contexts.
@@ -845,123 +1213,156 @@ def get_searchchrom_prefix(self, bamfile):
return ""
# ===METHODS TO WRITE OUTPUT FILES=========================================
- # Writes the used donor vcf files to a file
def write_used_donor_files(self, outfileloc, filesamplemap,
- used_donor_files):
+ used_donor_files, vbuuid):
+ """Writes the donor alignment or variant files used in constructing variant contexts to an output file.
+
+ Parameters
+ ----------
+ outfileloc : str
+ Path to write the output file to
+ filesamplemap : dict
+ Donor files per sample
+ used_donor_files : list
+ Donor alignment or variant files used to construct varian contexts
+ """
try:
with open(outfileloc, "w") as outfile:
+ outfile.write(f"#VBUUID: {vbuuid}\n")
outfile.write("#SampleId\tDonorFile\n")
for sampleid, samplefile in filesamplemap.items():
- if (samplefile in used_donor_files):
+ if samplefile in used_donor_files:
outfile.write(f"{sampleid}\t{samplefile}\n")
except IOError as ioe:
self.vaselogger.critical("Could not write used donor files to "
f"{outfileloc}")
- # Writes optional debug level output files.
def write_optional_output_files(self, outpath, contextfile):
+ """Writes optional output files to a specified output location.
+
+ The optional output files contain data about acceptor and donor contexts as well as output files containing the
+ reads with unmapped mates per variant context. The optional output files are only produced when VaSeBuilder is
+ run with with debug set to True.
+
+ Parameters
+ ----------
+ outpath: str
+ Path to write the optional output file to
+ contextfile: VariantContextFile
+ Variant context data
+ """
# Write the optional acceptor context files; acceptor contexts,
# read ids with unmapped mate and left/right positions.
self.vaselogger.debug("Writing acceptor contexts to "
f"{outpath}acceptorcontexts.txt")
- contextfile.write_acceptor_context_file(f"{outpath}acceptorcontexts.txt")
+ contextfile.write_acceptor_context_file(f"{outpath}acceptorcontexts.txt", self.creation_id)
self.vaselogger.debug("Writing acceptor context statistics to "
f"{outpath}acceptorcontextstats.txt")
- contextfile.write_acceptor_context_stats(
- f"{outpath}acceptorcontextstats.txt"
- )
+ contextfile.write_acceptor_context_stats(f"{outpath}acceptorcontextstats.txt", self.creation_id)
self.vaselogger.debug("Writing acceptor context read identifiers with "
"unmapped mates to"
f"mates to {outpath}acceptor_unmapped.txt")
- contextfile.write_acceptor_unmapped_mates(
- f"{outpath}acceptor_unmapped.txt"
- )
+ contextfile.write_acceptor_unmapped_mates(f"{outpath}acceptor_unmapped.txt", self.creation_id)
self.vaselogger.debug("Writing left and right most read positions of "
"each acceptor context to "
f"{outpath}acceptor_positions.txt")
- contextfile.write_acceptor_left_right_positions(
- f"{outpath}acceptor_positions.txt"
- )
+ contextfile.write_acceptor_left_right_positions(f"{outpath}acceptor_positions.txt", self.creation_id)
- # Write the optional donor context files; donor contexts,
- # read ids with unmapped mate and left/right positions.
+ # Write the optional donor context files; donor contexts, read ids with unmapped mate and left/right positions.
self.vaselogger.debug("Writing donor contexts to "
f"{outpath}donorcontexts.txt")
- contextfile.write_donor_context_file(f"{outpath}donorcontexts.txt")
+ contextfile.write_donor_context_file(f"{outpath}donorcontexts.txt", self.creation_id)
self.vaselogger.debug("Writing donor context statistics to "
f"{outpath}donorcontextstats.txt")
- contextfile.write_donor_context_stats(f"{outpath}donorcontextstats.txt")
+ contextfile.write_donor_context_stats(f"{outpath}donorcontextstats.txt", self.creation_id)
self.vaselogger.debug("Writing donor context read identifiers "
- "with unmapped mates to "
- f"{outpath}donor_unmapped.txt")
- contextfile.write_donor_unmapped_mates(f"{outpath}donor_unmapped.txt")
+ f"with unmapped mates to {outpath}donor_unmapped.txt")
+ contextfile.write_donor_unmapped_mates(f"{outpath}donor_unmapped.txt", self.creation_id)
self.vaselogger.debug("Writing left and right most read positions "
- "of each donor context to "
- f"{outpath}donor_positions.txt")
- contextfile.write_donor_left_right_positions(
- f"{outpath}donor_positions.txt"
- )
-
- # Write the optional variant context files; acceptor & donor
- # unmapped mates and left/right positions.
+ f"of each donor context to {outpath}donor_positions.txt")
+ contextfile.write_donor_left_right_positions(f"{outpath}donor_positions.txt", self.creation_id)
+
+ # Write the optional variant context files; acceptor & donor unmapped mates and left/right positions.
self.vaselogger.debug("Writing variant context acceptor read "
- "identifiers with unmapped mates to "
- f"{outpath}varcon_unmapped_acceptor.txt")
- contextfile.write_reads_with_unmapped_mate(
- "acceptor",
- f"{outpath}varcon_unmapped_acceptor.txt"
- )
+ f"identifiers with unmapped mates to {outpath}varcon_unmapped_acceptor.txt")
+ contextfile.write_reads_with_unmapped_mate("acceptor", f"{outpath}varcon_unmapped_acceptor.txt",
+ self.creation_id)
self.vaselogger.debug("Writing variant context donor read identifiers "
- "with unmapped mates to "
- f"{outpath}varcon_unmapped_donor.txt")
- contextfile.write_reads_with_unmapped_mate(
- "donor",
- f"{outpath}varcon_unmapped_donor.txt"
- )
+ f"with unmapped mates to {outpath}varcon_unmapped_donor.txt")
+ contextfile.write_reads_with_unmapped_mate("donor", f"{outpath}varcon_unmapped_donor.txt", self.creation_id)
self.vaselogger.debug("Writing variant context left and right most "
- "read positions of acceptor reads to "
- f"{outpath}varcon_positions_acceptor.txt")
+ f"read positions of acceptor reads to {outpath}varcon_positions_acceptor.txt")
contextfile.write_left_right_positions(
- "acceptor",
- f"{outpath}varcon_positions_acceptor.txt"
- )
+ "acceptor",
+ f"{outpath}varcon_positions_acceptor.txt", self.creation_id
+ )
self.vaselogger.debug("Writing variant context left and right most "
"read positions of donor reads to "
f"{outpath}varcon_positions_donor.txt")
contextfile.write_left_right_positions(
- "donor",
- f"{outpath}varcon_positions_donor.txt"
- )
-
- # Writes a BED file for the variant context data
- def write_bed_file(self, variantcontextdata, bedoutloc):
+ "donor",
+ f"{outpath}varcon_positions_donor.txt", self.creation_id
+ )
+
+ def write_bed_file(self, variantcontextdata, bedoutloc, vbuuid):
+ """Writes variant contexts as a BED file
+
+ Parameters
+ ----------
+ variantcontextdata : list of VariantContext
+ Variants contexts to write to BED file
+ bedoutloc : str
+ Path to write the BED file to
+ """
try:
with open(bedoutloc, "w") as bedoutfile:
+ bedoutfile.write(f"#VBUUID: {vbuuid}\n")
for varcon in variantcontextdata:
bedoutfile.write(f"{varcon.get_variant_context_chrom()}\t{varcon.get_variant_context_start()}\t"
f"{varcon.get_variant_context_end()}\t{varcon.get_variant_context_id()}\n")
except IOError:
self.vaselogger.warning(f"Could not write variant context data to BED file: {bedoutloc}")
- # Checks that thet sequence names are the same
def check_sequence_names(self, referencefile, alignmentfile):
+ """Checks and returns whether the chromosome names in the genome reference and alignment file are the same.
+
+ Parameters
+ ----------
+ referencefile:
+ alignmentfile:
+
+ Returns
+ -------
+
+ """
reference_seqnames = self.get_reference_sequence_names(referencefile)
alignment_seqnames = self.vb_scanner.get_alignment_sequence_names(alignmentfile)
shared_seqnames = reference_seqnames & alignment_seqnames
if len(shared_seqnames) < len(reference_seqnames) or len(shared_seqnames) < len(alignment_seqnames):
self.vaselogger.warning("Reference file and alignment file do not contain the same sequence names")
- # Returns the sequence names from the reference genome fasta file
def get_reference_sequence_names(self, reference_fileloc):
+ """Returns the sequence names from the reference genome fasta file
+
+ Parameters
+ ----------
+ reference_fileloc: str
+ Path to genomic reference fasta file
+
+ Returns
+ -------
+ reference_seqnames : list of str
+ Extracted chromosome names
+ """
reference_seqnames = set()
try:
with open(reference_fileloc, "r") as reference_file:
@@ -974,8 +1375,28 @@ def get_reference_sequence_names(self, reference_fileloc):
exit()
return reference_seqnames
- # Adds the donor fastq file while making sure each read is only added once
def add_donor_fastq3(self, opened_outfile, donorfastqfile, donoridlist):
+ """Adds a donor fastq file to an already opened validation fastq file.
+
+ Reads from a specified donor fastq file are added to an opened validation fastq file if the reads had not been
+ added to a validation fastq file. If already added (the read identifier is in the set of donor read ids), the
+ donor read is skipped. After adding a donor read is written to the validation fastq file, the read identifier is
+ added to the donor id list, ensuring every read is only added once.
+
+ Parameters
+ ----------
+ opened_outfile : File
+ Already opened validation fastq file to write data to
+ donorfastqfile : str
+ Path to donor fastq file to add
+ donoridlist : set of str
+ Set of donor read identifiers already added to the validation fastq files
+
+ Returns
+ -------
+ donoridlist : set of str
+ Updated set of donor read identifiers already added to validation set
+ """
try:
with io.BufferedReader(gzip.open(donorfastqfile, "rb")) as donorfastq:
for fileline in donorfastq:
@@ -1000,16 +1421,42 @@ def add_donor_fastq3(self, opened_outfile, donorfastqfile, donoridlist):
finally:
return donoridlist
- # Builds a validation set using existing donor fastq files
def build_validation_from_donor_fastqs(self, afq1_in, afq2_in, dfq1_in, dfq2_in, varconfileloc, outpath):
+ """Builds a validation set using existing donor fastq files.
+
+ Parameters
+ ----------
+ afq1_in : list of str
+ R1 acceptor fastq files to use as template
+ afq2_in : list of str
+ R2 acceptor fastq files to use as template
+ dfq1_in : list of str
+ R1 donor fsatq files to add
+ dfq2_in : list of str
+ R2 donor fastq files to add
+ varconfileloc : str
+ Path to existing variant context file to build validation set
+ outpath :str
+ """
varconfile = VariantContextFile(varconfileloc)
skip_list = varconfile.get_all_variant_context_acceptor_read_ids()
skip_list.sort()
for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [dfq1_in, dfq2_in]):
self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath)
- # Splits a list of donor fastq files over a number of acceptors
def divide_donorfastqs_over_acceptors(self, list_of_donorfqs, num_of_acceptors):
+ """Divides a list of donor fastq files over a number of acceptor fastq files.
+
+ Donor fastq files are divided equally into a specified number of groups. The number of groups is specified by
+ the number of acceptor fastq files to use as template.
+
+ Parameters
+ ----------
+ list_of_donorfqs: list of str
+ Paths to donor fastq files
+ num_of_acceptors: int
+ Number of template fastq files
+ """
cycler = 0
split_donors = [[] for x in range(num_of_acceptors)]
copy_list_donorfqs = list(list_of_donorfqs)
@@ -1019,15 +1466,36 @@ def divide_donorfastqs_over_acceptors(self, list_of_donorfqs, num_of_acceptors):
cycler += 1
return split_donors
- # BUILDS A SET OF R1/R2 VALIDATION FASTQS WITH ALREADY EXISTING
+ # BUILDS A SET OF R1/R2 VALIDATION FASTQS WITH ALREADY EXISTING DONOR FASTQS
def build_fastqs_from_donors(self, acceptor_fqin, donor_fqin, acceptor_reads_toexclude, forward_reverse, outpath):
+ """Builds a set of R1/R2 validation fastq files using already existing donor fastq files.
+
+ The provided acceptor fastq files are filtered and used as template for the valdiation fastq files. Already
+ existing fastq files are added.
+
+ Parameters
+ ----------
+ acceptor_fqin: str
+ Path to acceptor fastq file to use as template
+ donor_fqin: list of str
+ Paths to donor fastq files to add
+ acceptor_reads_toexclude: liust of str
+ Acceptor reads to exclude from validation set
+ forward_reverse: str
+ Write R1 or R2
+ outpath: str
+ Path to write the validation fastq file to
+ """
donor_sets = self.divide_donorfastqs_over_acceptors(donor_fqin, len(acceptor_fqin))
donorids = set()
+
+ self.vaselogger.info(f"Start building validation R{forward_reverse} fastq files")
for x in range(len(acceptor_fqin)):
fqoutname = self.set_fastq_out_path(outpath, forward_reverse, x+1)
fqoutfile = io.BufferedWriter(open(fqoutname, "wb"))
# Start writing the filtered acceptor file to a new output file
+ self.vaselogger.info(f"Start building fastq file {fqoutname}")
try:
acceptorfastq = io.BufferedReader(gzip.open(acceptor_fqin[x], "rb"))
for fileline in acceptorfastq:
@@ -1050,7 +1518,1050 @@ def build_fastqs_from_donors(self, acceptor_fqin, donor_fqin, acceptor_reads_toe
except IOError:
self.vaselogger.critical(f"Acceptor fastq {acceptor_fqin[x]} could not be opened")
- # Add the donor fastq files
+ # Add the donor fastq files.
+ self.vaselogger.info(f"Add {len(donor_sets[x])} donor fastq files to {fqoutname}")
for donorfastqfile in donor_sets[x]:
+ self.vaselogger.debug(f"Adding {donorfastqfile} to {fqoutname}")
self.add_donor_fastq3(fqoutfile, donorfastqfile, donorids)
fqoutfile.close()
+
+ # A-mode: Filters acceptors and adds already existing donor fastq files
+ def run_a_mode(self, afq1_in, afq2_in, dfq1_in, dfq2_in, varconfileloc, outpath):
+ self.vaselogger.info("Running VaeBuilder A-mode")
+ varconfile = VariantContextFile(varconfileloc)
+ skip_list = varconfile.get_all_variant_context_acceptor_read_ids()
+ skip_list.sort()
+ for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [dfq1_in, dfq2_in]):
+ self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath)
+
+ def run_ac_mode(self, afq1_in, afq2_in, dfqs, varconfile, outpath):
+ """Runs VaSeBuilder AC-mode.
+
+ This run mode builds a set of validation fastq files by adding already existing fastq files to acceptor fastq
+ files used as template. The variant context file used to construct the variant contexts of the donor fastq files
+ is required to filter out the acceptor reads.to be replaced with the donor data.
+
+ Parameters
+ ----------
+ afq1_in : list of str
+ R1 acceptor fastq files to use as template
+ afq2_in : list of str
+ R2 acceptor fastq files to use as template
+ dfqs : list of list of str
+ R1 and R2 donor fastq files to add
+ varconfile: VariantContextFile
+ Variant context to use for filtering out acceptor reads
+ outpath: str
+ Path to folder to write the output to
+ """
+ self.vaselogger.info("Running VaSeBuilder A-mode")
+ # Split the donor fastqs into an R1 and R2 group
+ r1_dfqs = [dfq[0] for dfq in dfqs]
+ r2_dfqs = [dfq[1] for dfq in dfqs]
+
+ skip_list = varconfile.get_all_variant_context_acceptor_read_ids()
+ skip_list.sort()
+
+ for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [r1_dfqs, r2_dfqs]):
+ self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath)
+
+ def run_d_mode(self, variantcontextfile, fq_out):
+ """Runs VaSeBuilder D-mode.
+
+ This run mode produces one R1 and R2 file with donor reads of all variant contexts. This mode does not produce
+ validation fastq files.
+
+ Parameters
+ ----------
+ variantcontextfile : VariantContextFile
+ Variants context to use
+ fq_out : str
+ Path and suffix to write donor fastq files to
+ """
+ self.vaselogger.info("Running VaSeBuilder D-mode")
+ # Combine all donor reads from all variant contexts
+ add_list = variantcontextfile.get_all_variant_context_donor_reads()
+ self.vaselogger.info("Writing FastQ files.")
+ # Write the donor fastq files.
+ for i in ["1", "2"]:
+ self.vaselogger.info(f"Start writing the R{i} FastQ files.")
+ fq_starttime = time.time()
+ fqoutpath = self.set_fastq_out_path(fq_out, i, 1)
+ self.build_donor_fq(add_list, i, fqoutpath)
+ self.vaselogger.info(f"Wrote all R{i} FastQ files.")
+ self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.")
+ self.vaselogger.info("Finished writing donor FastQ files.")
+
+ def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out, random_seed):
+ """Runs VaSeBuilder F-mode.
+
+ This run mode creates a full set of validation fastq files from established variant contexts and a set of
+ acceptor fastq files to use as template to create the validation fastq files.
+
+ Parameters
+ ----------
+ variantcontextfile : VariantContextFile
+ Variant contexts to use
+ fq1_in : list of str
+ R1 fastq files to use as template
+ fq2_in : list of str
+ R2 fastq files to use as template
+ fq_out : list of str
+ Path and suffix to write validation fastq files to
+ """
+ self.vaselogger.info("Running VaSeBuilder F-mode")
+
+ # Combine all donor reads from all variant contexts
+ add_list = variantcontextfile.get_all_variant_context_donor_reads()
+ self.vaselogger.info("Writing FastQ files.")
+ skip_list = set(variantcontextfile.get_all_variant_context_acceptor_read_ids())
+ # skip_list = list(set(variantcontextfile.get_all_variant_context_acceptor_read_ids()))
+
+ donor_read_add_data = {}
+ for i, fq_i in zip(["1", "2"], [fq1_in, fq2_in]):
+ # Write the fastq files.
+ self.vaselogger.info(f"Start writing the R{i} FastQ files.")
+ fq_starttime = time.time()
+ self.build_fastq_v2(fq_i, skip_list, add_list, i, fq_out, random_seed, donor_read_add_data)
+ self.vaselogger.info(f"Wrote all R{i} FastQ files.")
+ self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.")
+ self.vaselogger.info("Finished writing FastQ files.")
+ self.write_donor_insert_positions_v2(donor_read_add_data, f"{fq_out}_donor_read_insert_positions.txt")
+
+ def run_p_mode(self, variantcontextfile, outpath, fq_out):
+ """Runs VaSeBuilder P-mode.
+
+ This run mode produces an R1 and R2 fastq file with donor reads for each variant context. This mode does not
+ create or write validation fastq files.
+
+ Parameters
+ ----------
+ variantcontextfile : VariantContextFile
+ Established variant contexts
+ outpath : str
+ Path to folder to write output files to
+ fq_out : str
+ Path and suffix to write fastq out files to
+ """
+ context_fq_links = {}
+
+ self.vaselogger.info("Running VaSeBuilder P-mode")
+ self.vaselogger.info("Begin writing variant FastQ files.")
+ variantcontexts = variantcontextfile.get_variant_contexts()
+ for context in variantcontexts:
+ add_list = context.get_donor_read_strings()
+ self.vaselogger.debug(f"Writing variant FastQs for variant {context.context_id}.")
+
+ r1_donorfq = self.set_fastq_out_path(fq_out + context.context_id, "1", 1)
+ r2_donorfq = self.set_fastq_out_path(fq_out + context.context_id, "2", 1)
+ self.build_donor_fq(add_list, "1", r1_donorfq)
+ self.build_donor_fq(add_list, "2", r2_donorfq)
+ context_fq_links[context.context_id] = [r1_donorfq, r2_donorfq]
+ self.vaselogger.info("Finished writing variant FastQ files.")
+
+ # Write the P-mode link file (links context to fastq files for the current run)
+ self.write_pmode_linkfile(outpath, context_fq_links)
+
+ def run_x_mode(self, sampleidlist, donorvcfs, donorbams, acceptorbam, genomereference, outdir, varconout,
+ variantlist):
+ """Runs VaSeBuilder X-mode.
+
+ This run mode only produces the variant contexts and writes the associated output files. This mode does not
+ create or write validation fastq files.
+
+ Parameters
+ ----------
+ sampleidlist : list of str
+ Names/identifiers of samples to process
+ donorvcfs : dict
+ Donor variant files per sample
+ donorbams : dict
+ Donor alignment files per sample
+ acceptorbam : str
+ Path to alignment file to use as acceptor
+ genomereference : str
+ Path to genome reference fasta file
+ outdir : str
+ Path to write output files to
+ varconout : str
+ Path to folder to write the variant context file to
+ variantlist : dict
+ Variants to process per sample
+ """
+ self.vaselogger.info("Running VaSeBuilder X-mode")
+ self.build_varcon_set(sampleidlist, donorvcfs, donorbams, acceptorbam, outdir, genomereference, varconout,
+ variantlist)
+
+ # =====SPLITTING THE BUILD_VARCON_SET() INTO MULTIPLE SMALLER METHODS=====
+ def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath, reference_loc, varcon_outpath,
+ variantlist):
+ """Builds and returns a variant context file.
+
+ The variant context file is build from the provided variants and acceptor and donors alignment files.
+
+ Parameters
+ ----------
+ sampleidlist : list of str
+ Sample name/identifier
+ vcfsamplemap : dict
+ Variant files per sample
+ bamsamplemap : dict
+ Alignment files per sample
+ acceptorbamloc : str
+ Path to alignment file to use as acceptor
+ outpath : str
+ Path to folder to write output files to
+ reference_loc : str
+ Path to the genomic reference fasta file
+ varcon_outpath : str
+ Path and name to write variant context file to
+ variantlist : dict
+ Variants to use per sample
+
+ Returns
+ -------
+ variantcontexts : VariantContextFile
+ Established variant contexts
+ """
+ donor_vcfs_used = []
+ donor_bams_used = []
+ variantcontexts = VariantContextFile()
+
+ try:
+ acceptorbamfile = pysam.AlignmentFile(acceptorbamloc, reference_filename=reference_loc)
+ except IOError:
+ self.vaselogger.critical("Could not open Acceptor BAM/CRAM")
+ exit()
+
+ # Start iterating over the samples
+ for sampleid in sampleidlist:
+ self.vaselogger.debug(f"Start processing sample {sampleid}")
+ sample_variant_filter = self.bvcs_set_variant_filter(sampleid, variantlist)
+ samplevariants = self.get_sample_vcf_variants(vcfsamplemap[sampleid], sample_variant_filter)
+
+ if not samplevariants:
+ self.vaselogger.warning(f"No variants obtained for sample {sampleid}. Skipping sample")
+ continue
+
+ # Call the method that will process the sample
+ self.bvcs_process_sample(sampleid, variantcontexts, acceptorbamfile, bamsamplemap[sampleid],
+ reference_loc, samplevariants)
+
+ # Add the used donor VCF and BAM to the lists of used VCF and BAM files
+ donor_bams_used.append(vcfsamplemap[sampleid])
+ donor_vcfs_used.append(bamsamplemap[sampleid])
+
+ # Check if there are no variant contexts.
+ if variantcontexts.get_number_of_contexts() <= 0:
+ self.vaselogger.info("No variant contexts were created. No output files will be written.")
+ return None
+
+ # Write the output variant context output data
+ self.bvcs_write_output_files(outpath, varcon_outpath, variantcontexts, vcfsamplemap, bamsamplemap,
+ donor_vcfs_used, donor_bams_used)
+
+ # Checks whether the program is running on debug and, if so, write some extra output files.
+ if self.vaselogger.getEffectiveLevel() == 10:
+ self.write_optional_output_files(outpath, variantcontexts)
+ return variantcontexts
+
+ def bvcs_process_sample(self, sampleid, variantcontextfile, abamfile, dbamfileloc, referenceloc, samplevariants):
+ """Processes a sample and adds variant contexts to a variant context file.
+
+ Parameters
+ ----------
+ sampleid : str
+ Sample name/identifier
+ variantcontextfile : VariantContextFile
+ Variant context file that saves variant contexts
+ abamfile: pysam.AlignmentFile
+ Already opened pysam AlignmentFile to use as acceptor
+ dbamfileloc: str
+ Path to the alignment file to use as donor
+ referenceloc: str
+ Path to the genomic reference fasta file
+ samplevariants : list of VcfVariants
+ Variants to process for the specified sample
+ """
+ try:
+ donorbamfile = pysam.AlignmentFile(dbamfileloc, reference_filename=referenceloc)
+ except IOError:
+ self.vaselogger.warning(f"Could not open {dbamfileloc} ; Skipping {sampleid}")
+ return
+
+ # Iterate over the sample variants
+ for samplevariant in samplevariants:
+ variantcontext = self.bvcs_process_variant(sampleid, variantcontextfile, samplevariant, abamfile,
+ donorbamfile, True)
+ if not variantcontext:
+ self.vaselogger.info(f"Could not establish variant context ; Skipping.")
+ continue
+
+ # If this widest context overlaps an existing variant context, skip it.
+ if self.contexts.context_collision(variantcontext.get_context()):
+ self.vaselogger.debug(f"Variant context {variantcontext.get_variant_context_id()} overlaps with an"
+ f"already existing variant context; Skipping.")
+ continue
+ variantcontextfile.add_existing_variant_context(variantcontext.get_variant_context_id(), variantcontext)
+
+ # Processes a sample variant by establishing the contexts.
+ def bvcs_process_variant(self, sampleid, variantcontextfile, samplevariant, abamfile, dbamfile, write_unm=False):
+ """Processes a variant and returns the established variant context.
+
+ Parameters
+ ----------
+ sampleid : str
+ Sample name/identifier
+ variantcontextfile: VariantContextFile
+ Variant context file that saves the variant contexts
+ samplevariant : VcfVariant
+ Variant to process and for which to establish an accpetor, donor and variant context
+ abamfile : pysam.AlignmentFile
+ Already opened pysam AlignmentFile to use as acceptor
+ dbamfile : pysam.AlignmentFile
+ Already opened pysam AlignmentFile to use as donor
+ write_unm : bool
+ Whether to save read identifiers with unmapped read mates
+
+ Returns
+ -------
+ vcontext : VariantContext or None
+ The established variant context, None if context could not be established
+ """
+ # Establish the donor context
+ variantid = samplevariant.get_variant_id()
+ variantchrom = samplevariant.get_variant_chrom()
+ variantpos = samplevariant.get_variant_pos()
+ varianttype = samplevariant.get_variant_type()
+
+ self.vaselogger.debug(f"Processing variant {variantid}.")
+ self.debug_msg("vw", variantid)
+ self.vaselogger.debug(f"Variant {variantid} determined to be {varianttype}")
+ searchwindow = self.determine_read_search_window(varianttype, samplevariant)
+ self.vaselogger.debug(f"Search window determined to be {variantchrom}:{searchwindow[0]+1}-{searchwindow[1]}")
+
+ # Check whether the variant overlaps with an existing variant context
+ if variantcontextfile.variant_is_in_context(varianttype, variantchrom, *searchwindow):
+ self.vaselogger.debug(f"Variant {variantid} is located in an existing context.")
+ return None
+
+ # Determine the donor context.
+ self.debug_msg("dc", variantid)
+ t0 = time.time()
+ dcontext = self.bvcs_establish_context(sampleid, variantid, variantchrom, variantpos, searchwindow, dbamfile,
+ write_unm)
+ if not dcontext:
+ self.vaselogger.info(f"Could not establish donor context ; Skipping variant {variantid}")
+ return None
+ self.debug_msg("dc", variantid, t0)
+ self.vaselogger.debug(f"Donor context determined to be {dcontext.get_context_chrom()}:"
+ f"{dcontext.get_context_start()}-{dcontext.get_context_end()}")
+
+ # Determine the acceptor context.
+ self.debug_msg("ac", variantid)
+ t0 = time.time()
+ acontext = self.bvcs_establish_context(sampleid, variantid, variantchrom, variantpos, searchwindow, abamfile,
+ write_unm, dcontext.get_context())
+ self.debug_msg("ac", variantid, t0)
+ self.vaselogger.debug(f"Acceptor context determined to be {acontext.get_context_chrom()}:"
+ f"{acontext.get_context_start()}-{acontext.get_context_end()}")
+
+ # Determin the variant context.
+ self.debug_msg("cc", variantid)
+ t0 = time.time()
+ vcontext = self.bvcs_establish_variant_context(sampleid, variantcontextfile, variantid, variantchrom,
+ variantpos, acontext, dcontext, abamfile, dbamfile, write_unm)
+ self.debug_msg("cc", variantid, t0)
+ self.vaselogger.debug(f"Combined context determined to be {vcontext.get_variant_context_chrom()}:"
+ f"{vcontext.get_variant_context_start()}-{vcontext.get_variant_context_end()}")
+ return vcontext
+
+ # Establishes a variant context from an acceptor and donor context and fetches acceptor and donor reads again.
+ def bvcs_establish_variant_context(self, sampleid, variantcontextfile, variantid, variantchrom, variantpos,
+ acontext, dcontext, abamfile, dbamfile, write_unm=False):
+ """Establishes and returns a variant context.
+
+ The variant context window is established based on the acceptor and donor contexts. Reads and mates overlapping
+ with the variant context are also fetched and added to the variant context.
+
+ Parameters
+ ----------
+ sampleid : str
+ Sample name/identifier
+ variantcontextfile : VariantContextFile
+ Variant context file that saves variant contexts
+ variantid : str
+ Variant identifier
+ variantchrom : str
+ Chromosome the
+ variantpos: int
+ Leftmost genomic position of the variant
+ acontext : OverlapContext
+ Established acceptor context
+ dcontext : OverlapContext
+ Established donor context
+ abamfile : pysam.AlignmentFile
+ Already opened pysam AlignmentFile used as acceptor
+ dbamfile : pysam.AlignmentFile
+ Already opened pysam AlignmentFile used as donor
+ write_unm : bool
+ Whether to save identifiers of reads with unmapped mates
+
+ Returns
+ -------
+ variant_context : VariantContext
+ Established variant context with all associated data
+ """
+ unmapped_alist = []
+ unmapped_dlist = []
+
+ # Determine the variant context from the acceptor and donor context
+ vcontext_window = self.determine_largest_context(variantpos, acontext.get_context(), dcontext.get_context())
+ if variantcontextfile.context_collision(vcontext_window):
+ self.vaselogger.debug(f"Variant context {variantid} overlaps with an already existing context; Skipping.")
+ return None
+
+ # Gather variant context donor reads.
+ self.debug_msg("cdr", variantid)
+ t0 = time.time()
+ vcontext_dreads = self.get_variant_reads(variantid, vcontext_window[0], vcontext_window[2], vcontext_window[3],
+ dbamfile, write_unm, unmapped_dlist)
+ self.debug_msg("cdr", variantid, t0)
+
+ # Gather variant context acceptor reads.
+ self.debug_msg("car", variantid)
+ t0 = time.time()
+ vcontext_areads = self.get_variant_reads(variantid, vcontext_window[0], vcontext_window[2], vcontext_window[3],
+ abamfile, write_unm, unmapped_alist)
+ self.debug_msg("car", variantid, t0)
+
+ variant_context = VariantContext(variantid, sampleid, *vcontext_window, vcontext_areads, vcontext_dreads,
+ acontext, dcontext)
+
+ # Set the variant context unmapped read ids
+ variant_context.set_unmapped_acceptor_mate_ids(unmapped_alist)
+ variant_context.set_unmapped_donor_mate_ids(unmapped_dlist)
+ return variant_context
+
+ # Establishes an acceptor/donor context by fetching reads (and their mates) overlapping directly with the variant
+ def bvcs_establish_context(self, sampleid, variantid, variantchrom, variantpos, searchwindow, bamfile,
+ write_unm=False,
+ fallback_window=None):
+ """Establishes and returns an acceptor/donor context.
+
+ The context window is established from fetched reads and their mates overlapping with the variant.
+
+ Parameters
+ ----------
+ sampleid : str
+ Sample name/identifier
+ variantid : str
+ Variant indentifier
+ variantchrom : str
+ Chromosome name the variant is located on
+ variantpos : int
+ Leftmost genomic position of the variant
+ searchwindow: list of int
+ Sreach window to use for fetching reads
+ bamfile : pysam.AlignmentFile
+ Already opened pysam AlignmentFile
+ write_unm: bool
+ Save the read identifiers with unmapped mates
+ fallback_window : list
+ Window to set if no window could be set/determined
+
+ Returns
+ -------
+ OverlapContext or None
+ Acceptor/donor context if context is established, None if not
+ """
+ unmappedlist = []
+
+ # Fetch context reads and establish the context window
+ self.vaselogger.debug("Fetching reads.")
+ t0 = time.time()
+ context_reads = self.get_variant_reads(variantid, variantchrom, *searchwindow, bamfile, write_unm, unmappedlist)
+ self.vaselogger.debug(f"Fetching reads took {time.time() - t0} seconds")
+
+ if not context_reads:
+ self.vaselogger.debug("No reads were found.")
+ context_window = self.determine_context(context_reads, variantpos, variantchrom)
+
+ # Check whether the context_window is valid and, if not, whether a fallback window has been set
+ if not context_window:
+ self.vaselogger.debug("Context window could not be determined.")
+ if fallback_window:
+ context_window = fallback_window
+ self.vaselogger.debug("Context window is set to the fall back window.")
+ else:
+ return None
+
+ # Create the context object containing all context data
+ adcontext = OverlapContext(variantid, sampleid, *context_window, context_reads)
+ adcontext.set_unmapped_mate_ids(unmappedlist)
+ return adcontext
+
+ # Sets the variant list for a sample if applicable, otherwise return None
+ def bvcs_set_variant_filter(self, sampleid, variant_list):
+ """Sets the variant list for a sample if applicable, otherwise return None
+
+ Parameters
+ ----------
+ sampleid : str
+ Sample name/identifier
+ variant_list : dict
+ Variant tuples with chromosome name and genomic position per sample
+
+ Returns
+ -------
+ list of tuple
+ Variants tuples with chromosome name and position
+ """
+ sample_variant_filter = None
+ if variant_list is not None:
+ if sampleid in variant_list:
+ sample_variant_filter = variant_list[sampleid]
+ else:
+ sample_variant_filter = []
+ return sample_variant_filter
+
+ def bvcs_write_output_files(self, outpath, varcon_outpath, variantcontextfile, vcfsamplemap, bamsamplemap,
+ used_donor_vcfs, used_donor_bams):
+ """Writes VaSeBuilder output files.
+
+ These output files are the standard output files when building a variant context file. Output files include
+ the variant context file, variant context statistics file, variant context BED file, used donor variant files
+ and used donor alignment files.
+
+ Parameters
+ ----------
+ outpath: str
+ Path to folder to write the output files to
+ varcon_outpath : str
+ Path to write variant context file to
+ variantcontextfile : VariantContextFile
+ Variant contexts to write to file
+ vcfsamplemap : dict
+ Variant files per sample
+ bamsamplemap : dict
+ Alignment files per sample
+ used_donor_vcfs : list of str
+ Donor variant files used to create the variant contexts
+ used_donor_bams : list of str
+ Donor alignment files used to create the variant contexts
+ """
+ # Write the variant context file itself
+ self.vaselogger.info(f"Writing variant contexts to {varcon_outpath}")
+ variantcontextfile.write_variant_context_file(varcon_outpath, self.creation_id)
+
+ # Write the variant context statistics file
+ self.vaselogger.info(f"Writing variant context statistics to {outpath}varconstats.txt")
+ variantcontextfile.write_variant_context_stats(f"{outpath}varconstats.txt", self.creation_id)
+
+ # Write the variant contexts as a BED file
+ self.vaselogger.info(f"Writing the variant contexts to a BED file")
+ self.write_bed_file(variantcontextfile.get_variant_contexts(), f"{outpath}variantcontexts.bed",
+ self.creation_id)
+
+ # Write a listfile of the used variant (VCF/BCF) files
+ self.vaselogger.info(f"Writing the used donor variant files per sample to {outpath}donor_variant_files.txt")
+ self.write_used_donor_files(f"{outpath}donorvcfs.txt", vcfsamplemap, used_donor_vcfs, self.creation_id)
+
+ # Write a listfile of the used alignment (BAM/CRAM) files
+ self.vaselogger.info(f"Writing the used donor alignment files per sample to {outpath}donor_alignment_files.txt")
+ self.write_used_donor_files(f"{outpath}donor_alignment_files.txt", bamsamplemap, used_donor_bams,
+ self.creation_id)
+
+ def write_pmode_linkfile(self, outpath, context_fqs_links):
+ """Writes the link from variant context identifier to fastq output files for P-mode.
+
+ This output file is meant to link a VaSeBuilder run with fastq output files.
+
+ Parameters
+ ----------
+ outpath : str
+ Path to output folder to write the link file to.
+ context_fqs_links : dict of str: list
+ Fastq files per variant context identifier
+ """
+ plinkloc = f"{outpath}plink_{self.creation_id}.txt"
+ try:
+ with open(plinkloc, "w") as plinkfile:
+ plinkfile.write(f"#VBUUID: {self.creation_id}\n")
+ for contextid, fqfiles in context_fqs_links.items():
+ plinkfile.write(f"{contextid}\t{fqfiles[0]}\t{fqfiles[1]}\n")
+ except IOError:
+ self.vaselogger.warning(f"Could not write P-mode link file {plinkloc} for run {self.creation_id}")
+
+ def shuffle_donor_read_identifiers(self, donorreads, s=2):
+ """Shuffles and returns a list of donor read identifiers.
+
+ Prior to shuffling the donor read identifiers, the provided donor read list is copied to preserve the original.
+ The new list is then sorted and a seed is set to ensure that the shuffling can be reproduced if hte same data
+ is used.
+
+ Parameters
+ ----------
+ donorreads :
+ Donor read identifiers to shuffle
+ s: int
+ Seed to set for shuffling (default = 2)
+
+ Returns
+ -------
+ shuffled_donor_reads : list of str
+ Shuffled donor read identifiers
+ """
+ shuffled_donor_reads = donorreads.copy()
+ shuffled_donor_reads.sort()
+
+ random.seed(s)
+ random.shuffle(shuffled_donor_reads)
+ return shuffled_donor_reads
+
+ def shuffle_donor_add_positions(self, num_of_template_reads, num_of_donor_reads, s=2):
+ """Shuffle positions to
+
+ Parameters
+ ----------
+ num_of_template_reads : int
+ Number of template reads in the acceptor
+ num_of_donor_reads : int
+ Number of donor reads to be added
+ s : int
+ Seed to set for shuffling (default = 2)
+
+ Returns
+ -------
+ add_positions : list of int
+ Shuffled positions in fastq file to add donor reads to
+ """
+ random.seed(s)
+ self.vaselogger.debug(f"Semi random donor add positions seed set to {s}")
+ add_positions = []
+
+ # Check whether the number of donor reads to add exceeds the number of acceptor reads in the template.
+ if num_of_donor_reads > num_of_template_reads:
+ self.vaselogger.debug("Found more donor reads to add than")
+ random_sample_size = num_of_donor_reads
+ while random_sample_size > 0:
+ pos_to_add = []
+ if random_sample_size >= num_of_template_reads:
+ pos_to_add = random.sample(range(0, num_of_template_reads), num_of_template_reads)
+ else:
+ pos_to_add = random.sample(range(0, ), random_sample_size)
+ add_positions.extend(pos_to_add)
+ random_sample_size = random_sample_size - num_of_template_reads
+ return add_positions
+ add_positions = random.sample(range(0, num_of_template_reads), num_of_donor_reads)
+ return add_positions
+
+ def write_donor_output_bam(self, bamoutpath, donorreads, acceptorbam):
+ """Writes a set of donor reads as a bam file.
+
+ Parameters
+ ----------
+ bamoutpath : str
+ Path and name to write the output to
+ donorreads : list of DonorBamRead
+ Donor reads to place in the output BAM file
+ acceptorbam
+ """
+ print("Constructing BAM file")
+
+ def read_is_hard_clipped(self, fetchedread):
+ """Returns whether the provided read is hard-clipped
+
+ Parameters
+ ----------
+ fetchedread : pysam.AlignedSegment
+ Read fetched from alignment file with pysam
+
+ Returns
+ -------
+ bool
+ True if read is hardclipped, False if not or read has no cigar string
+ """
+ if fetchedread.cigarstring is not None:
+ return "H" in fetchedread.cigarstring
+ return False
+
+ def check_template_size(self, templatefqloc):
+ """Returns the number of reads in the template fastq file.
+
+ The returned number is divided by 4 as each read entry consists of four lines.
+
+ Parameters
+ ----------
+ templatefqloc : str
+ Path to acceptor fastq file to check
+
+ Returns
+ -------
+ int
+ Number of read entries in the template fastq file
+ """
+ line_count = 0
+ with gzip.open(templatefqloc, "r") as templatefq:
+ for fileline in templatefq:
+ line_count += 1
+ return int(line_count/4)
+
+ def build_fastq_v2(self, acceptorfq_filepaths, acceptorreads_toskip, donor_context_reads, forward_or_reverse,
+ vasefq_outpath, random_seed, donor_read_insert_data):
+ """Builds and writes a set of validation fastq files.
+
+ A set of validation fastq files is build using a set of template/acceptor fastq files. Acceptor reads will be
+ filtered from these file via read identifier and donor reads will be added. Donor readsd reads will be added at
+ semi random positions. The random.seed() method is used to ensure reproducibility as using the same data (in
+ the same order) and the same seed results in the same shuffle.
+
+ Parameters
+ ----------
+ acceptorfq_filepaths : list of str
+ Paths to template fastq files to use
+ acceptorreads_toskip : list of str
+ Identifiers of acceptor reads to skip
+ donor_context_reads : list of tuple
+ Donor reads to add to the validation fastq files
+ forward_or_reverse : str
+ Write forward ('1') or reverse ('2')
+ vasefq_outpath :
+ Path to write VaSeBuilder validation fastq files to
+ """
+ # Split all donor reads to add over the template fastq files
+ donor_read_ids = [x[0] for x in donor_context_reads]
+ donor_read_ids.sort()
+ donor_read_ids = list(set(donor_read_ids))
+ self.vaselogger.debug(f"Donor read ids: {donor_read_ids}")
+ distributed_read_ids = self.divide_donorfastqs_over_acceptors(donor_read_ids, len(acceptorfq_filepaths))
+ self.vaselogger.debug(f"Distributed read ids: {distributed_read_ids}")
+
+ # Iterate over the R1/R2 fastq in files to use as templates for the
+ for x in range(0, len(acceptorfq_filepaths)):
+ # Collect the donor reads to write.
+ add_donor_ids = distributed_read_ids[x]
+ self.vaselogger.debug(f"Distributed read ids: {add_donor_ids}")
+ add_donor_reads = [x for x in donor_context_reads if x[0] in add_donor_ids]
+ self.vaselogger.debug(f"Will add {len(add_donor_ids)} donor reads")
+
+ # Write the new VaSe FastQ file.
+ vasefq_outname = self.set_fastq_out_path(vasefq_outpath, forward_or_reverse, x + 1)
+ self.vaselogger.debug(f"Set FastQ output path to: {vasefq_outname}")
+
+ # Check whether to add the fastq file name into the donor read insert position map
+ if vasefq_outname.split(".")[0][:-3] not in donor_read_insert_data:
+ donor_read_insert_data[vasefq_outname.split(".")[0][:-3]] = {}
+
+ self.write_vase_fastq_v2(acceptorfq_filepaths[x], vasefq_outname,
+ acceptorreads_toskip, add_donor_reads, add_donor_ids,
+ forward_or_reverse, random_seed, donor_read_insert_data)
+
+ # Builds a new FastQ file to be used for validation.
+ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath,
+ acceptorreads_toskip, donorbamreaddata,
+ donor_readids, fr, random_seed, donor_read_insert_data):
+ """Creates and writes a single VaSeBuilder validation fastq file.
+
+ Parameters
+ ----------
+ acceptor_infq : str
+ Path to
+ fastq_outpath : str
+ Path to write VaSeBuilder vaildation fastq file to
+ acceptorreads_toskip : list of str
+ Acceptor read identifiers to exclude from the validation fastq file
+ donorbamreaddata : list of tuple
+ Donor reads to writes
+ fr : str
+ Forward('1') or reverse ('2') fastq file
+ """
+ fastq_prefix = fastq_outpath.split(".")[0][:-3]
+ try:
+ fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb"))
+ self.vaselogger.debug(f"Writing data to validation fastq {fastq_outpath}")
+
+ cur_line_index = 0 # Current read position in the template fastq
+ cur_add_index = 0 # Current read position in the validation fastq
+
+ # Determine where to semi randomly add the donor reads in the fastq
+ num_of_template_reads = self.check_template_size(acceptor_infq)
+ self.vaselogger.debug(f"Template has {num_of_template_reads} reads")
+ donor_add_positions = self.shuffle_donor_add_positions(num_of_template_reads, len(donor_readids),
+ random_seed)
+ # self.vaselogger.debug(f"Add positions for {fastq_outpath} = {donor_add_positions}")
+ donor_reads_to_addpos = self.link_donor_addpos_reads_v2(donor_add_positions, donor_readids,
+ donorbamreaddata)
+ # self.vaselogger.debug(f"Read to add pos for {fastq_outpath} = {donor_reads_to_addpos}")
+
+ # Open the template fastq and write filtered data to a new fastq.gz file.
+ fqgz_infile = io.BufferedReader(gzip.open(acceptor_infq, "rb"))
+ self.vaselogger.debug(f"Opened template FastQ: {acceptor_infq}")
+ for fileline in fqgz_infile:
+
+ # Check if we are located at a read identifier.
+ if fileline.startswith(b"@"):
+ if fileline.decode("utf-8").split()[0][1:] not in acceptorreads_toskip:
+ fqgz_outfile.write(fileline)
+ fqgz_outfile.write(next(fqgz_infile))
+ fqgz_outfile.write(next(fqgz_infile))
+ fqgz_outfile.write(next(fqgz_infile))
+ cur_add_index += 1
+ else:
+ self.vaselogger.debug(f"Skipping acceptor read {fileline}")
+
+ # Check if we need to add a donor read at the current position
+ if cur_line_index in donor_reads_to_addpos:
+ # self.vaselogger.debug(f"{cur_line_index} is in list of positions to add donor")
+ for donorread in donor_reads_to_addpos[cur_line_index]:
+ if donorread[1] == fr:
+ fqlines = ("@" + str(donorread[0]) + "\n"
+ + str(donorread[2]) + "\n"
+ + "+\n"
+ + str(donorread[3]) + "\n")
+ fqgz_outfile.write(fqlines.encode("utf-8"))
+ cur_add_index += 1
+ self.vaselogger.debug(f"Added donor read {donorread[0]}/{donorread[1]} at "
+ f"{cur_add_index}")
+ self.add_donor_insert_data(fastq_prefix, donorread[0], fr, cur_add_index,
+ donor_read_insert_data)
+ cur_line_index += 1
+ fqgz_infile.close()
+
+ fqgz_outfile.flush()
+ fqgz_outfile.close()
+
+ except IOError as ioe:
+ if ioe.filename == acceptor_infq:
+ self.vaselogger.critical("The supplied template FastQ file "
+ "could not be found.")
+ if ioe.filename == fastq_outpath:
+ self.vaselogger.critical("A FastQ file could not be written "
+ "to the provided output location.")
+ exit()
+
+ def link_donor_addpos_reads(self, donor_addpos, donor_reads):
+ """Links and returns donor add positions and donor reads.
+
+ Parameters
+ ----------
+ donor_addpos : list of int
+ Positions in validation fastq to add donor reads at
+ donor_reads : list of tuple
+ Donor reads to add to validation fastq
+
+ Returns
+ -------
+ add_posread_link : dict of list
+ Donor reads to add per add position
+ """
+ add_posread_link = {}
+ for addpos, dread in zip(donor_addpos, donor_reads):
+ if addpos not in add_posread_link:
+ add_posread_link[addpos] = []
+ add_posread_link[addpos].append(dread)
+ # self.vaselogger.debug(f"Added {dread[0]}/{dread[1]} at insert pos {addpos}")
+ return add_posread_link
+
+ def link_donor_addpos_reads_v2(self, donor_addpos, donor_read_ids, donor_reads):
+ """Links and returns donor add positions and donor reads.
+
+ Parameters
+ ----------
+ donor_addpos : list of int
+ Positions in validation fastq to add donor reads at
+ donor_reads : list of tuple
+ Donor reads to add to validation fastq
+
+ Returns
+ -------
+ add_posread_link : dict of list
+ Donor reads to add per add position
+ """
+ add_posread_link = {}
+ for addpos, dread_id in zip(donor_addpos, donor_read_ids):
+ if addpos not in add_posread_link:
+ add_posread_link[addpos] = []
+ add_posread_link[addpos].extend([x for x in donor_reads if x[0] == dread_id])
+ return add_posread_link
+
+ def read_donor_fastq(self, donor_fastq, forward_reverse, donor_read_data):
+ """Reads and saves the donor fastq reads.
+
+ Parameters
+ ----------
+ donor_fastq : str
+ Path to the donor fastq file to read
+ forward_reverse : str
+ Reading forward (R1) or reverse (R2) donor read data
+ donor_read_data : dict
+ Dictionary saving the donor read data
+
+ Returns
+ -------
+ donor_read_data : dict
+ Updated dictionary containing donor read data
+ """
+ try:
+ with gzip.open(donor_fastq, 'rt') as dfqfile:
+ for dfqfileline in dfqfile:
+ if dfqfileline.startswith("@"):
+ dfq_readid = dfqfileline[1:].strip()
+ dfq_readseq = next(dfqfile).strip()
+ dfq_optline = next(dfqfile).strip()
+ dfq_readqual = next(dfqfile).strip()
+ if dfq_readid not in donor_read_data:
+ donor_read_data[dfq_readid] = []
+ donor_read_data[dfq_readid].append((dfq_readid, forward_reverse, dfq_readseq, dfq_readqual))
+ except IOError:
+ self.vaselogger.debug(f"Could not read donor fastq file {donor_fastq}")
+ finally:
+ return donor_read_data
+
+ # Temporary new AC-mode method to test semi random read distribution
+ def run_ac_mode_v2(self, afq1_in, afq2_in, dfqs, varconfile, random_seed, outpath):
+ self.vaselogger.info("Running VaSeBuilder AC-mode")
+ # Split the donor fastqs into an R1 and R2 group
+ r1_dfqs = [dfq[0] for dfq in dfqs]
+ r2_dfqs = [dfq[1] for dfq in dfqs]
+
+ # Get the list of acceptor reads to exclude
+ skip_list = varconfile.get_all_variant_context_acceptor_read_ids()
+ skip_list.sort()
+
+ # Read all the donor read fastq data
+ donor_reads = {}
+ for fr_i, dfq_i in zip(["1", "2"], [r1_dfqs, r2_dfqs]):
+ for x in range(len(dfq_i)):
+ donor_reads = self.read_donor_fastq(dfq_i[x], fr_i, donor_reads)
+ # self.vaselogger.debug(f"ACmodeV2 Donor reads: {donor_reads}")
+
+ # Divide the donor reads equally over the template fastq files
+ donor_read_ids = list(donor_reads.keys())
+ donor_read_ids.sort()
+ # self.vaselogger.debug(f"ACmodeV2 dread ids: {donor_read_ids}")
+
+ # Distribute the read identifiers over the fastq files
+ distributed_donor_read_ids = self.divide_donorfastqs_over_acceptors(donor_read_ids, len(afq1_in))
+ # self.vaselogger.debug(f"ACmodeV2 distr dread ids: {distributed_donor_read_ids}")
+
+ # Iterate over the acceptor fastq files
+ donor_read_add_data = {}
+ for fr_i, afq_i in zip(["1", "2"], [afq1_in, afq2_in]):
+ self.build_fastqs_from_donors_v2(afq_i, skip_list, distributed_donor_read_ids, donor_reads, fr_i,
+ random_seed, outpath, donor_read_add_data)
+ self.write_donor_insert_positions_v2(donor_read_add_data, f"{outpath}_donor_read_insert_positions.txt")
+
+ # Temporary new build_fastqs_from_donors() to test semi random read distribution in AC-mode
+ def build_fastqs_from_donors_v2(self, acceptor_fqsin, acceptor_reads_to_exclude, distributed_donor_reads,
+ donor_reads, forward_reverse, random_seed, outpath, donor_read_insert_data):
+ """Builds a set of R1/R2 validation fastq files using existing donor fastq files.
+
+ Parameters
+ ----------
+ acceptor_fqsin: list of str
+ Template/Acceptor fastq files to use
+ acceptor_reads_to_exclude: list of str
+ distributed_donor_reads : list of list of str
+ donor_reads: dict
+ forward_reverse: str
+ Whether to construct an R1 or R2 fastq file
+ random_seed: int
+ Random seed to use for shuffling donor add positions
+ outpath: str
+ Path top write produced fastq file to
+ donor_read_insert_data:
+ """
+ for x in range(len(acceptor_fqsin)):
+ fqoutname = self.set_fastq_out_path(outpath, forward_reverse, x + 1)
+ self.vaselogger.debug(f"bfmdV2 xdistr dread ids: {distributed_donor_reads[x]}")
+
+ # Add the fastq file entry to the insert positions map
+ if fqoutname.split(".")[0][:-3] not in donor_read_insert_data:
+ donor_read_insert_data[fqoutname.split(".")[0][:-3]] = {}
+
+ # Filter the donor reads specific to the template file
+ selected_donor_reads = [donor_reads[y] for y in distributed_donor_reads[x]]
+ donor_reads_to_add = []
+ [donor_reads_to_add.extend(x) for x in selected_donor_reads]
+
+ # self.vaselogger.debug(f"bfmdV2 dread ids to add: {donor_reads_to_add}")
+ self.write_vase_fastq_v2(acceptor_fqsin[x], fqoutname, acceptor_reads_to_exclude, donor_reads_to_add,
+ distributed_donor_reads[x], forward_reverse, random_seed, donor_read_insert_data)
+
+ def write_donor_insert_positions_v2(self, inserted_position_data, outpath):
+ """Writes the insert positions for each set of reads.
+
+ Insert positions are written per read identifier.
+
+ Parameters
+ ----------
+ inserted_position_data : dict
+ Position data
+ outpath : str
+ Path and name to write the donor read insert position data to
+ """
+ try:
+ with open(outpath, "w") as ipd_outfile:
+ ipd_outfile.write(f"#VBUUID: {self.creation_id}\n")
+ ipd_outfile.write("ReadId\tR1_InsertPos\tFastqR1Out\tR2_InsertPos\tFastqR2Out\n")
+
+ # Iterate over the donor read insert position data and write to file
+ for fastqout in inserted_position_data:
+ readid_list = [x for x in inserted_position_data[fastqout]]
+ readid_list.sort()
+
+ # Iterate over each read identifier for the specified output R1 and R2 fastq set.
+ for readid in readid_list:
+ r1_insertpos = self.get_saved_insert_position("1", inserted_position_data[fastqout][readid])
+ r2_insertpos = self.get_saved_insert_position("2", inserted_position_data[fastqout][readid])
+
+ ipd_outfile.write(f"{readid}\t{r1_insertpos}\t{fastqout}_R1.fastq\t{r2_insertpos}\t"
+ f"{fastqout}_R2.fastq\n")
+ except IOError:
+ self.vaselogger.warning(f"Could not write donor insert position data to {outpath}")
+
+ def get_saved_insert_position(self, readpn, read_insert_data):
+ """Returns the insert position of a read, or 'NA' if not available.
+
+ Parameters
+ ----------
+ readpn : str
+ The readpair number ('1' or '2')
+ read_insert_data : tuple of str and int
+ The read insert data
+
+ Returns
+ -------
+ int or str
+ The insert position, 'NA' if not available
+ """
+ if readpn in read_insert_data:
+ readpn_index = read_insert_data.index(readpn)
+ if readpn_index < len(read_insert_data)-1:
+ return read_insert_data[read_insert_data.index(readpn)+1]
+ return "NA"
+
+ def add_donor_insert_data(self, fqoutname, readid, forward_reverse, insertpos, donor_insert_data):
+ """
+
+ Parameters
+ ----------
+ fqoutname : str
+ Name of fastq file
+ readid : str
+ Read identifier for which to save the insert position
+ forward_reverse : str
+ Indicator whether the read is R1 or R2
+ insertpos : int
+ Position the donor read was inserted at in the validation fastq
+ donor_insert_data: dict
+ Saved donor read insertions into validation fastq
+ """
+ if fqoutname in donor_insert_data:
+ if readid in donor_insert_data[fqoutname]:
+ if forward_reverse not in donor_insert_data[fqoutname][readid]:
+ donor_insert_data[fqoutname][readid] = donor_insert_data[fqoutname][readid] + \
+ (forward_reverse, insertpos)
+ else:
+ donor_insert_data[fqoutname][readid] = (forward_reverse, insertpos)
diff --git a/VaSeUtils/SubsetVcfByVariantContext.py b/VaSeUtils/SubsetVcfByVariantContext.py
new file mode 100644
index 0000000..9168293
--- /dev/null
+++ b/VaSeUtils/SubsetVcfByVariantContext.py
@@ -0,0 +1,11 @@
+import logging
+from VariantContextFile import VariantContextFile
+
+class SubsetVcfByVariantContext:
+ def __init__(self):
+ print("aap")
+
+ def main(self, varconinloc):
+ varconfile = VariantContextFile(varconinloc)
+
+
diff --git a/VaSeUtils/VaSeUtilHelper.py b/VaSeUtils/VaSeUtilHelper.py
index f0f33c2..be02d67 100644
--- a/VaSeUtils/VaSeUtilHelper.py
+++ b/VaSeUtils/VaSeUtilHelper.py
@@ -6,6 +6,39 @@
class VaSeUtilHelper:
def __init__(self):
self.vaseutillogger = logging.getLogger("VaSeUtil_Logger")
+ self.parameter_map = {"-m": "RUNMODE",
+ "--runmode": "RUNMODE",
+ "-v": "DONORVCF",
+ "--donorvcf": "DONORVCF",
+ "-b": "DONORBAM",
+ "--donorbam": "DONORBAM",
+ "-a": "ACCEPTORBAM",
+ "--acceptorbam": "ACCEPTORBAM",
+ "-1": "TEMPLATEFQ1",
+ "--templatefq1": "TEMPLATEFQ1",
+ "-2": "TEMPLATEFQ2",
+ "--templatefq2": "TEMPLATEFQ2",
+ "-o": "OUT",
+ "--out": "OUT",
+ "-r": "REFERENCE",
+ "--reference": "REFERENCE",
+ "-of": "",
+ "--fastqout": "",
+ "-ov": "",
+ "--varcon": "",
+ "-l": "LOG",
+ "--log": "LOG",
+ "-!": "DEBUG",
+ "-vl": "VARIANTLIST",
+ "--variantlist": "VARIANTLIST",
+ "-iv": "VARCONIN",
+ "--varconin": "VARCONIN",
+ "-dq": "DONORFASTQS",
+ "--donorfastqs": "DONORFASTQS",
+ "-c": "CONFIG",
+ "--config": "CONFIG",
+ "-s": "SEED",
+ "--seed": "SEED"}
# Returns whether something is in the filter or not
def passes_filter(self, valtocheck, filterlist):
@@ -116,3 +149,45 @@ def get_read_pair_num(self, pysam_bamread):
if pysam_bamread.is_read1:
return "1"
return "2"
+
+ def determine_variant_type(self, vcfvariantref, vcfvariantalts):
+ """Determines and returns the
+
+ Parameters
+ ----------
+ vcfvariantref : str
+ Variant reference allele(s)
+ vcfvariantalts : tuple of str
+ Variant alternative allele(s)
+
+ Returns
+ -------
+ str
+ Type of variant (snp/indel)
+ """
+ maxreflength = max([len(x) for x in vcfvariantref.split(",")])
+ maxaltlength = max([len(x) for x in vcfvariantalts])
+
+ # Check based on the reference and alternative lengths whether the variant is a SNP or indel.
+ if maxreflength == 1 and maxaltlength == 1:
+ return "snp"
+ elif maxreflength > 1 or maxaltlength > 1:
+ return "indel"
+ return "?"
+
+ def get_config_param_name(self, paramflag):
+ """Returns the config parameter name for a parameter flag.
+
+ Parmeters
+ ---------
+ paramflag : str
+ Parameter flag to obtain config parameter
+
+ Returns
+ -------
+ str
+ Config parameter name if parameter flag is in map, empty string otherwise
+ """
+ if paramflag in self.parameter_map:
+ return self.parameter_map[paramflag]
+ return ""
diff --git a/VaSeUtils/VaSeUtilRunner.py b/VaSeUtils/VaSeUtilRunner.py
new file mode 100644
index 0000000..73bc9c8
--- /dev/null
+++ b/VaSeUtils/VaSeUtilRunner.py
@@ -0,0 +1,113 @@
+import os
+from VariantContextFile import VariantContextFile
+from VaSeUtilHelper import VaSeUtilHelper
+
+
+class VaSeUtilRunner:
+ def __init__(self):
+ self.vuh = VaSeUtilHelper()
+
+ def check_donor_files_exist(self, donorlistfile, samplefilter=None):
+ """Checks whether the donor files used in a VaSeBuilder run still exist.
+
+ Parameters
+ ----------
+ donorlistfile : str
+ File with list of used donor files n VaSeBuilder run
+ samplefilter : list of str
+ Filter with sample names
+ """
+ missing_count = 0
+ total_count = 0
+ try:
+ with open(donorlistfile, "r") as dlfile:
+ next(dlfile) # Skip the header line of the file
+ for fileline in dlfile:
+ filelinedata = fileline.strip().split("\t")
+ donorfiles = filelinedata[1].split(",")
+
+ # Check whether the used donor files exist
+ if self.vuh.passes_filter(filelinedata[0], samplefilter):
+ for dfile in donorfiles:
+ total_count += 1
+ if not os.path.isfile(dfile):
+ missing_count += 1
+ print(f"Donor file {dfile} for sample {filelinedata[0]} could not be found. Maybe it "
+ "has been moved, renamed or deleted?")
+ print(f"Found {total_count - missing_count}/{total_count} donor files.")
+ except IOError:
+ print(f"Could not open {donorlistfile}")
+ finally:
+ print("Finished running VaSe util CheckDonorFilesExist")
+
+ def combine_variant_context_file(self):
+ print("aap")
+
+ def generate_config_file_from_command(self, vasebuilder_command):
+ print("aap")
+
+ def log_info(self, vaselogloc, logfilter):
+ """Displays log entries satisfying the set log level filter.
+
+ Parameters
+ ----------
+ vaselogloc:
+ Path to VaSeBuilder log file
+ logfilter: list of str
+ Filter with log level(s)
+ """
+ try:
+ with open(vaselogloc, 'r') as vaselogfile:
+ for fileline in vaselogfile:
+ fileline_elements = fileline.split("\t")
+ if len(fileline_elements) >= 3:
+ if self.vuh.passes_filter(fileline_elements[2], logfilter):
+ print(fileline.strip())
+ except IOError as ioe:
+ print(f"Could not open log file")
+
+ def subset_vcf(self, varconfile, vcffileloc):
+ """Subsets a single VCF file. The filtered entries are written to a new file.
+
+ varconfile : VariantContextFile
+ Variant contexts to use as filter
+ vcffileloc : str
+ Path to the VCF file to filter
+ """
+ tmp_data = vcffileloc.split(".")
+ out_name = f"{tmp_data[0]}.varconfiltered." + ".".join(tmp_data[1:])
+ try:
+ vcf_file = pysam.VariantFile(vcffileloc, "r")
+ filtered_vcf_file = pysam.VariantFile(out_name, "w", header=vcf_file.header)
+
+ # Iterate over the VCF file that to be filtered nd write variants overlapping with a context
+ for vcfvariant in vcf_file.fetch():
+ varianttype = self.vuh.determine_variant_type(vcfvariant.ref, vcfvariant.alts)
+ max_ref = max(len(x) for x in vcfvariant.ref.split(","))
+ max_alt = max(len(x) for x in vcfvariant.alts)
+ var_endpos = max([max_ref, max_alt])
+ if varconfile.variant_is_in_context(varianttype, vcfvariant.chrom, vcfvariant.start-1,
+ vcfvariant.stop+1):
+ filtered_vcf_file.write(vcfvariant)
+
+ filtered_vcf_file.close()
+ vcf_file.close()
+ except IOError:
+ print(f"Could not process variant file {vcffileloc}")
+
+ def subset_acceptor_vcf(self):
+ print("aap")
+
+ def subset_vcf_by_variant_contexts(self, variantcontextfile, vcffile_list):
+ """Subsets a filter of VCF files using a variant context files.
+
+ Parameters
+ ----------
+ variantcontextfile : VariantContextFile
+ Variant context file to use as filter
+ vcffile_list : list of str
+ List of VCF files to filter
+ """
+ varconfile = VariantContextFile(variantcontextfile)
+ for vcffile in vcffile_list:
+ self.subset_vcf(varconfile, vcffile)
diff --git a/VariantContext.py b/VariantContext.py
index 467350c..ef0d0c5 100644
--- a/VariantContext.py
+++ b/VariantContext.py
@@ -3,64 +3,173 @@
class VariantContext:
- # Sets the variant context data.
+ """Saves a variant context and allows data to be obtained and calculated.
+
+ Attributes
+ ----------
+ context_id : str
+ Identifier of the context
+ sample_id : str
+ The name/identifier of the donor sample used to construct the variant context
+ variant_context_chrom : str
+ Chromosome name the context is lcoated on
+ variant_context_origin : int
+ Variant position the context is constructed from
+ variant_context_start : int
+ Leftmost genomic position of the variant context
+ variant_context_end : int
+ Rightmost genomic position of the variant context
+ variant_context_areads : list of DonorBamRead
+ Acceptor reads associated with the variant context
+ variant_context_dreads : list of DonorBamRead
+ Donor reads associated with the variant context
+ variant_acceptor_context : OverlapContext
+ Acceptor context used to construct the variant context
+ variant_donor_context : OverlapContext
+ Donor context used to construct the variant context
+ unmapped_donor_mate_ids : list of str
+ Identifiers of reads with an unmapped mate
+ """
+
def __init__(self, varconid, sampleid,
varconchrom, varconorigin,
varconstart, varconend,
acceptorreads, donorreads,
acceptor_context=None, donor_context=None):
+ """Saves the provided variant context data.
+
+ Acceptor and donor contexts are optional when creating the variant context.
+
+ Parameters
+ ----------
+ varconid : str
+ Variant context identifier
+ sampleid : str
+ Sample name/identifier
+ varconchrom : str
+ Chromosome name the variant context is located on
+ varconorigin:
+ Variant position the context is constructed from
+ varconstart : int
+ Leftmost genomic position of the variant context
+ varconend : int
+ Rightmost genomic position of the variant context
+ acceptorreads : list of DonorBamRead
+ Variant context acceptor reads
+ donorreads : list of DonorBamRead
+ Variant context donor reads
+ acceptor_context : OverlapContext
+ Acceptor context associated with the variant context
+ donor_context : OverlapContext
+ Donor context associated with the variant context
+ """
self.context_id = varconid
self.sample_id = sampleid
self.variant_context_chrom = varconchrom
self.variant_context_origin = int(varconorigin)
self.variant_context_start = int(varconstart)
self.variant_context_end = int(varconend)
- # Saves the acceptor reads that overlap with the entire variant
- # context.
+ # Saves the acceptor reads that overlap with the entire variant context.
self.variant_context_areads = acceptorreads
- # Saves the donor reads that overlap with the entire variant
- # context.
+ # Saves the donor reads that overlap with the entire variant context.
self.variant_context_dreads = donorreads
- # Saves the acceptor context (this is the context from acceptor
- # reads overlapping with the variant itself).
+ # Saves the acceptor context (this is the context from acceptor reads overlapping with the variant itself).
self.variant_acceptor_context = acceptor_context
- # Saves the donor context (this is the context from donor reads
- # overlapping with the variant itself).
+ # Saves the donor context (this is the context from donor reads overlapping with the variant itself).
self.variant_donor_context = donor_context
self.unmapped_acceptor_mate_ids = []
self.unmapped_donor_mate_ids = []
# ===METHODS TO OBTAIN DATA FROM THE VARIANT CONTEXT DATA==================
- # Returns the variant context identifier.
+ def get_context(self):
+ """Returns the essential data of the variant context.
+
+ Returns
+ -------
+ list of str and int
+ Variant context window
+ """
+ return [self.variant_context_chrom, self.variant_context_origin, self.variant_context_start,
+ self.variant_context_end]
+
def get_variant_context_id(self):
+ """Returns the variant context identifier.
+
+ Returns
+ -------
+ self.context_id : str
+ Variant context identifier
+ """
return self.context_id
- # Returns the variant context sample.
def get_variant_context_sample(self):
+ """Returns the name/identifier of the sample
+
+ Returns
+ -------
+ self.sample_id : str
+ Sample name/identifier
+ """
return self.sample_id
- # Returns the variant context chromosome.
def get_variant_context_chrom(self):
+ """Returns the chromosome name the variant context is located on.
+
+ Returns
+ -------
+ self.variant_context_chrom : str
+ Chromosome name of the variant context
+ """
return self.variant_context_chrom
- # Returns the variant context origin.
def get_variant_context_origin(self):
+ """Returns the genomic position of the variant used to construct the variant context.
+
+ Returns
+ -------
+ self.variant_context_origin : int
+ Variant genomic position
+ """
return self.variant_context_origin
- # Returns the variant context start position.
def get_variant_context_start(self):
+ """Returns the leftmost genomic position of the variant context.
+
+ Returns
+ -------
+ self.variant_context_start
+ Variant context starting position
+ """
return self.variant_context_start
- # Returns the variant context end position.
def get_variant_context_end(self):
+ """Returns the variant context end position.
+
+ Returns
+ -------
+ self.variant_context_end : int
+ Variant context rightmost genomic position
+ """
return self.variant_context_end
- # Returns the variant context acceptor reads.
def get_acceptor_reads(self):
+ """Returns the variant context acceptor reads.
+
+ Returns
+ -------
+ self.variant_context_areads : list of DonorBamRead
+ Variant context acceptor reads
+ """
return self.variant_context_areads
- # Returns the variant context donor reads.
def get_donor_reads(self):
+ """Returns the variant context donor reads.
+
+ Returns
+ -------
+ self.variant_context_dreads : list of DonorBamRead
+ Variant context acceptor reads
+ """
return self.variant_context_dreads
# TODO: New thing for -P mode.
@@ -73,130 +182,264 @@ def get_donor_read_strings(self):
dbr.get_bam_read_qual()))
return list(set(donorreads))
- # Returns the acceptor context (the context from acceptor reads
- # overlapping the variant itself).
def get_acceptor_context(self):
+ """Returns the acceptor context used to construct the variant context.
+
+ Returns
+ -------
+ self.variant_acceptor_context : OverlapContext
+ Acceptor context associated with the variant context
+ """
return self.variant_acceptor_context
- # Returns the donor context (the context from donor reads
- # overlapping the variant itself).
def get_donor_context(self):
+ """Returns the donor context used to construct the variant context.
+
+ Returns
+ -------
+ self.variant_donor_context : OverlapContext
+ Donor context associated with the variant context
+ """
return self.variant_donor_context
- # Returns a list of acceptor reads overlapping with the variant
- # context.
def get_unmapped_acceptor_mate_ids(self):
+ """Returns the identifiers of variant context acceptor reads that have an unmapped mate.
+
+ Returns
+ -------
+ self.unmapped_acceptor_mate_ids : list of str
+ Variant context acceptor read identifiers with unmapped mates
+ """
return self.unmapped_acceptor_mate_ids
- # Returns a list of donor reads overlapping with the variant
- # context.
+ # Returns a list of donor reads overlapping with the variant context.
def get_unmapped_donor_mate_ids(self):
+ """Returns the identifiers of variant context donor reads with an unmapped mate.
+
+ Returns
+ -------
+ self.unmapped_donor_mate_ids : list of str
+ Variant contexct donor read identifiers with an unmapped mate
+ """
return self.unmapped_donor_mate_ids
- # ===METHODS TO GET DATA (REQUIRING SOME CALCULATING) OF THE===============
- # ===VARIANT CONTEXT===
- # Returns the variant context length.
+ # ===METHODS TO GET DATA (REQUIRING SOME CALCULATING) OF THE VARIANT CONTEXT===============
def get_variant_context_length(self):
+ """Returns the length of the variant context.
+
+ The length if determined by subtracting the leftmost genomic (start) position from the rightmost genomic (end)
+ position.
+
+ Returns
+ -------
+ int
+ Variant context length
+ """
return abs(self.variant_context_end - self.variant_context_start)
- # Returns the distance of the variant context start position from
- # the variant context origin.
def get_start_distance_from_origin(self):
+ """Calculates and returns the variant context start from the variant position that constructed it.
+
+ Returns
+ -------
+ int
+ Distance between variant context leftmost genomic and variant position.
+ """
return abs(self.variant_context_origin - self.variant_context_start)
- # Returns the distance of the variant context end position from the
- # variant context origin.
def get_end_distance_from_origin(self):
+ """Calculates and returns the variant context end from the variant position tha constructed it.
+
+ Returns
+ -------
+ int
+ Distance between variant context rightmost genomic and variant position
+ """
return abs(self.variant_context_end - self.variant_context_origin)
# ===METHODS TO OBTAIN VARIANT CONTEXT ACCEPTOR READ DATA==================
- # Returns the number of variant context acceptor reads.
def get_number_of_acceptor_reads(self):
+ """Returns the number of variant context acceptor reads.
+
+ Returns
+ -------
+ int
+ Number of variant context acceptor reads
+ """
if self.variant_context_areads is None:
return 0
return len(self.variant_context_areads)
- # Returns the identifiers of acceptor reads overlapping with the
- # variant context.
def get_acceptor_read_ids(self):
+ """Returns the variant context acceptor read identifiers
+
+ Returns
+ -------
+ list of str or None
+ Variant context acceptor read identifiers, None if there are no acceptor reads
+ """
if self.variant_context_areads is None:
return [None]
return list(set([x.get_bam_read_id() for x in self.variant_context_areads]))
- # Returns the list of left most acceptor read positions,
def get_acceptor_read_starts(self):
+ """Returns the variant context acceptor read starting positions
+
+ Returns
+ -------
+ list of int or None
+ Variant context acceptor read leftmost genomic positions, None if there are no acceptor reads
+ """
if self.variant_context_areads is None:
return [None]
return [x.get_bam_read_ref_pos() for x in self.variant_context_areads]
- # Returns a list of the left most positions of the R1 variant
- # context accecptor BAM reads.
def get_acceptor_read_left_positions(self):
+ """Returns the leftmost genomic positions of all variant context R1 acceptor reads.
+
+ Returns
+ -------
+ list of int or None
+ Variant context R1 acceptor read leftmost genomic positions, None of there are no acceptor reads
+ """
if self.variant_context_areads is None:
return [None]
return [x.get_bam_read_ref_pos()
for x in self.variant_context_areads if x.is_read1()]
- # Returns the list of all end positions for all variant context
- # acceptor reads.
def get_acceptor_read_ends(self):
+ """Returns the variant context acceptor read rightmost positions.
+
+ Returns
+ -------
+ list of int or None
+ Variant context R2 acceptor read rightmost genomic positions, None if there are no acceptor reads
+ """
if self.variant_context_areads is None:
return [None]
return [x.get_bam_read_ref_end() for x in self.variant_context_areads]
- # Returns a list of the right most positions ofr the R2 variant
- # context acceptor BAM read.
def get_acceptor_read_right_positions(self):
+ """Returns the rightmost genomic positions for all variant context R2 acceptor reads
+
+ Returns
+ -------
+ list of int or None
+ Variant context
+ """
if self.variant_context_areads is None:
return [None]
return [x.get_bam_read_ref_end()
for x in self.variant_context_areads if x.is_read2()]
# ===METHODS TO OBTAIN VARIANT CONTEXT DONOR READ DATA=====================
- # Returns the number of variant context donor reads.
def get_number_of_donor_reads(self):
+ """Returns the number of variant context donor reads.
+
+ Returns
+ -------
+ int
+ Number of variant context donor reads
+ """
return len(self.variant_context_dreads)
- # Returns the identifiers of donor reads overlapping with the
- # variant context.
def get_donor_read_ids(self):
+ """Returns the identifiers of donor reads overlapping with the variant context.
+
+ Returns
+ -------
+ list of str
+ Variant context donor read identifiers
+ """
return list(set([x.get_bam_read_id() for x in self.variant_context_dreads]))
- # Returns the list of variant context donor read starting positions.
def get_donor_read_starts(self):
+ """Returns the list of variant context donor read starting positions.
+
+ Returns
+ -------
+ list of int
+ Variant context donor read leftmost genomic positions
+ """
return [x.get_bam_read_ref_pos() for x in self.variant_context_dreads]
- # Returns the list of variant context donor read pairs left most
- # positions (start pos of read 1).
def get_donor_read_left_positions(self):
+ """Returns the list of variant context R1 donor read leftmost positions
+
+ Returns
+ -------
+ list of int
+ Variant context R1 donor read leftmost genomic positions
+ """
return [x.get_bam_read_ref_pos()
for x in self.variant_context_dreads if (x.is_read1())]
# Returns a list of all donor read ending positions
def get_donor_read_ends(self):
+ """Returns variant context donor read rightmost positions.
+
+ Returns
+ -------
+ list of int
+ Variant context donor read rightmost genomic positions
+ """
return [x.get_bam_read_ref_end() for x in self.variant_context_dreads]
- # Returns a list of all variant context donor reads right most
- # positions (end pos of read 2).
def get_donor_read_right_positions(self):
+ """Returns the list of variant context R2 donor reads
+
+ Returns
+ -------
+ list of int
+ Variant context R2 donor reads rightmost genomic positions
+ """
return [x.get_bam_read_ref_end()
for x in self.variant_context_dreads if (x.is_read2())]
# ===METHODS TO ADD DATA TO THE VARIANT CONTEXT============================
- # Sets the acceptor context of the variant context.
def set_acceptor_context(self, acceptor_context):
+ """Sets the acceptor context of the variant context with the one provided.
+
+ Parameters
+ ----------
+ acceptor_context : OverlapContext
+ Acceptor context to add to the variant context
+ """
self.variant_acceptor_context = acceptor_context
- # Sets the donor context of the variant context.
def set_donor_context(self, donor_context):
+ """Sets the donor context of the variant context with the one provided.
+
+ Parameters
+ ----------
+ donor_context : OverlapContext
+ Donor context to add to the variant context
+ """
self.variant_donor_context = donor_context
- # Adds an acceptor context to the variant context by creating it
- # from data values.
def add_acceptor_context(self, contextid, sampleid,
contextchrom, contextorigin,
contextstart, contextend,
acceptorreads):
+ """Sets the acceptor context of the variant context by constructing one from the provided parameters.
+
+ Parameters
+ ----------
+ contextid : str
+ Acceptor context identifier
+ sampleid : str
+ Sample name/identifier
+ contextchrom : str
+ Chromosome name the context is located on
+ contextorigin : int
+ Variant position the context is constructed from
+ contextstart : int
+ Leftmost genomic position of the acceptor context
+ contextend : int
+ Rightmost genomic position of the acceptor context
+ acceptorreads : list of DonorBamRead
+ Acceptor context reads
+ """
self.variant_acceptor_context = OverlapContext(
contextid, sampleid,
contextchrom, contextorigin,
@@ -204,12 +447,29 @@ def add_acceptor_context(self, contextid, sampleid,
acceptorreads
)
- # Adds a donor context to the variant context by creating it from
- # data values.
def add_donor_context(self, contextid, sampleid,
contextchrom, contextorigin,
contextstart, contextend,
donorreads):
+ """Sets the donor context of the variant context by constructing one from the provided parameters.
+
+ Parameters
+ ----------
+ contextid : str
+ Donor context identifier
+ sampleid : str
+ Sample name/identifier
+ contextchrom : str
+ Chromosome name the context is located on
+ contextorigin : int
+ Variant genomic position the context is constructed from
+ contextstart : int
+ Leftmost genomic position of the donor context
+ contextend : int
+ Rightmost genomic position of the donor context
+ donorreads : list of DonorBamRead
+ Donor context reads
+ """
self.variant_donor_context = OverlapContext(
contextid, sampleid,
contextchrom, contextorigin,
@@ -218,82 +478,189 @@ def add_donor_context(self, contextid, sampleid,
)
# ===METHODS TO OBTAIN VARIANT CONTEXT UNMAPPED MATE READ DATA=============
- # Returns the variant context acceptor read ids that have an
- # unmapped mate.
def get_unmapped_acceptor_read_ids(self):
+ """Returns the variant context acceptor read identifiers that have an unmapped mate.
+
+ Returns
+ -------
+ self.unmapped_acceptor_mate_ids : list of str
+ Variant context acceptor read identifiers with an unmapped mate
+ """
return self.unmapped_acceptor_mate_ids
- # Returns the variant context donor read ids that have an unmapped
- # mate.
def get_unmapped_donor_read_ids(self):
+ """Returns the variant context donor read identifiers that have an unmapped mate.
+
+ Returns
+ -------
+ self.unmapped_donor_mate_ids : list of str
+ Variant context donor read identifiers with an unmapped mate
+ :return:
+ """
return self.unmapped_donor_mate_ids
- # Adds a variant context appector mate identifier.
def add_unmapped_acceptor_mate_id(self, mateid):
+ """Adds a variant context appector mate identifier.
+
+ Parameters
+ ----------
+ mateid : str
+ Variant context acceptor read identifier with an unmapped mate
+ """
self.unmapped_acceptor_mate_ids.append(mateid)
- # Adds a variant context donor mate identifier.
def add_unmapped_donor_mate_id(self, mateid):
+ """Adds a variant context donor mate identifier.
+
+ Parameters
+ ----------
+ mateid : str
+ Variant context donor read identifier with an unmapped mate
+ """
self.unmapped_donor_mate_ids.append(mateid)
- # Sets the variant context unmapped acceptor mate ids.
def set_unmapped_acceptor_mate_ids(self, mateids):
+ """Sets the variant context unmapped acceptor mate ids.
+
+ Parameters
+ ----------
+ mateids : list of str
+ Variant context acceptor read identifiers with unmapped mate
+ """
self.unmapped_acceptor_mate_ids = mateids
- # Sets the variant context unmapped donor mate ids.
def set_unmapped_donor_mate_ids(self, mateids):
+ """Sets the variant context unmapped donor mate ids.
+
+ Parameters
+ ----------
+ mateids : list of str
+ Variant context donor read identifiers with unmapped mates
+ """
self.unmapped_donor_mate_ids = mateids
- # Returns whether a specified variant context acceptor read has an
- # unmapped mate.
def acceptor_read_has_unmapped_mate(self, readid):
+ """Returns whether a specified variant context acceptor read has an unmapped mate.
+
+ Parameters
+ ----------
+ readid : str
+ Acceptor read identifier
+
+ Returns
+ -------
+ bool
+ True if acceptor read has unmapped mate, False if not
+ """
return readid in self.unmapped_acceptor_mate_ids
- # Returns whether a specified variant context donor read has an
- # unmapped mate.
def donor_read_has_unmapped_mate(self, readid):
+ """Checks and returns whether a specified variant context donor read has an unmapped mate.
+
+ Parameters
+ ----------
+ readid : str
+ Donor read identifier
+
+ Returns
+ -------
+ bool
+ True if donor read has unmapped mate, False if not
+ """
return readid in self.unmapped_donor_mate_ids
- # Returns the number of variant context acceptor reads with an
- # unmapped mate.
def get_number_of_unmapped_acceptor_mates(self):
+ """Returns the number of variant context acceptor reads with an unmapped mate.
+
+ Returns
+ -------
+ int
+ Number of variant context acceptor reads with an unmapped mate.
+ """
return len(self.unmapped_acceptor_mate_ids)
- # Returns the number of variant context donor reads with an
- # unmapped mate.
def get_number_of_unmapped_donor_mates(self):
+ """Returns the number of variant context donor reads with an unmapped mate.
+
+ Returns
+ -------
+ int
+ Number of variant context donor reads with an unmapped mate.
+ """
return len(self.unmapped_donor_mate_ids)
# ===METHODS TO ADD UNMAPPED MATES TO THE ACCEPTOR AND DONOR CONTEXT=======
- # Sets the unmapped mate ids for the acceptor context.
def set_acceptor_context_unmapped_mates(self, mateids):
+ """Sets the unmapped mate ids for the acceptor context.
+
+ Returns
+ -------
+ mateids : list of str
+ Acceptor context read identifiers with an unmapped mate
+ """
self.variant_acceptor_context.set_unmapped_mate_ids(mateids)
- # Adds an unmapped read id to the acceptor context.
def add_acceptor_context_unmapped_mate(self, ureadid):
+ """Adds an unmapped read id to the acceptor context.
+
+ Parameters
+ ----------
+ ureadid : str
+ Acceptor context read identifier with an unmapped mate
+ """
self.variant_acceptor_context.add_unmapped_mate_id(ureadid)
- # Sets the unmapped mate ids for the donor context.
def set_donor_context_unmapped_mates(self, mateids):
+ """Sets the unmapped mate ids for the donor context.
+
+ Parameters
+ ----------
+ mateids : list of str
+ Donor context read identifiers with an unmapped mate
+ """
self.variant_donor_context.set_unmapped_mate_ids(mateids)
- # Adds an unmapped read id to the donor context.
- def add_donor_context_unmapped_mate_id(self, ureadid):
- self.variant_donor_context.setUnmappedMateId(ureadid)
+ def add_donor_context_unmapped_mate(self, ureadid):
+ """Adds an unmapped read id to the donor context.
+
+ Parameters
+ ----------
+ ureadid : str
+ Donor context read identifier with an unmapped mate
+ """
+ self.variant_donor_context.add_unmapped_mate_id(ureadid)
# ===METHODS TO OBTAIN STATISTICS OF THE VARIANT CONTEXT===================
- # Returns the average and median quality of the acceptor reads
- # associated with
def get_average_and_median_acceptor_read_qual(self):
+ """Returns the average and median quality of the acceptor reads associated with.
+
+ Returns
+ -------
+
+ """
return self.get_average_and_median_read_qual(self.variant_context_areads)
- # Returns the average and median quality of the donor reads
- # associated with this variant context.
def get_average_and_median_donor_read_qual(self):
+ """Calculates and returns the mean and median variant context donor read Q-Score.
+
+ Returns
+ -------
+ list of int
+ Mean and median Q-Score
+ """
return self.get_average_and_median_read_qual(self.variant_context_dreads)
- # Returns the average and median read quality.
def get_average_and_median_read_qual(self, contextreads):
+ """Calculates and returns the mean and median read Q-Score.
+
+ Parameters
+ ----------
+ contextreads : list or reads
+ Reads to calculate mean and median Q-Score of
+
+ Returns
+ -------
+ """
if contextreads is not None:
avgmedqual = []
for contextread in contextreads:
@@ -302,18 +669,39 @@ def get_average_and_median_read_qual(self, contextreads):
statistics.median(avgmedqual)])
return [None, None]
- # Returns the average and median mapq values of the acceptor reads
- # associated with this variant context.
def get_average_and_median_acceptor_read_mapq(self):
+ """Calculates and returns the mean and median variant context acceptor read MAPQ value.
+
+ Returns
+ -------
+ list of int
+ Mean and median MAPQ, None if there are no acceptor reads
+ """
return self.get_average_and_median_read_mapq(self.variant_context_areads)
- # Returns the average and median mapq values of the donor reads
- # associated with this variant context.
def get_average_and_median_donor_read_mapq(self):
+ """Calculates and returns the mean and median variant context donor read MAPQ value.
+
+ Returns
+ -------
+ list of int
+ Mean and median variant context donor read MAPQ, None if there are no donor reads
+ """
return self.get_average_and_median_read_mapq(self.variant_context_dreads)
- # Returns the average and median read MapQ of this variant context.
def get_average_and_median_read_mapq(self, contextreads):
+ """Calculates and returns the mean and median MAPQ value of provided reads.
+
+ Parameters
+ ----------
+ contextreads : list of DonorBamRead
+ Reads to calculate mean and median MAPQ of
+
+ Returns
+ -------
+ list of int
+ Mean and median read MAPQ, None if no reads provided
+ """
if contextreads is not None:
avgmedmapq = []
for contextread in contextreads:
@@ -322,18 +710,39 @@ def get_average_and_median_read_mapq(self, contextreads):
statistics.median(avgmedmapq)])
return [None, None]
- # Returns the average and median length of the acceptor reads
- # associated with this variant context.
def get_average_and_median_acceptor_read_length(self):
+ """Calculates and returns the mean and median variant context acceptor read length.
+
+ Returns
+ -------
+ list of int
+ Mean and median variant context acceptor read length, None if there are no acceptor reads
+ """
return self.get_average_and_median_read_length(self.variant_context_areads)
- # Returns the average and median length of the donor reads
- # associated with this variant context.
def get_average_and_median_donor_read_length(self):
+ """Calculates and returns the mean and median variant context donor read length.
+
+ Returns
+ -------
+ list of int
+ Mean and median variant context donor read length, None if there are no donor reads
+ """
return self.get_average_and_median_read_length(self.variant_context_dreads)
- # Returns the average and median read length.
def get_average_and_median_read_length(self, contextreads):
+ """Calculates and returns the mean and median read length of a specified list of reads.
+
+ Parameters
+ ----------
+ contextreads : list of DonorBamReds
+ Reads to to calculate mean and median length of
+
+ Returns
+ -------
+ list of int
+ Mean and median read length, None if no reads are supplied
+ """
if contextreads is not None:
avgmedlen = []
for contextread in contextreads:
@@ -343,142 +752,335 @@ def get_average_and_median_read_length(self, contextreads):
return [None, None]
# ===METHODS TO OBTAIN ACCEPTOR CONTEXT DATA===============================
- # Returns whether the variant context has an acceptor context
def has_acceptor_context(self):
+ """Returns whether the variant context has an acceptor context
+
+ Returns
+ -------
+ bool
+ True if variant context has an acceptor context, False if not
+ """
return self.variant_acceptor_context is not None
- # Returns the acceptor context identifier (should be the same as the
- # variant context id).
def get_acceptor_context_id(self):
+ """Returns the acceptor context identifier.
+
+ Returns
+ -------
+ str
+ Acceptor context identifier
+ """
return self.variant_acceptor_context.get_context_id()
- # Returns the acceptor context sample id (should be the same as the
- # variant context sample id).
def get_acceptor_context_sample_id(self):
+ """Returns the acceptor context sample id.
+
+ Returns
+ -------
+ str
+ Sample name/identifier of the acceptor context
+ """
return self.variant_acceptor_context.get_sample_id()
- # Returns the chromosome of the acceptor context.
def get_acceptor_context_chrom(self):
+ """Returns the chromosome name of the acceptor context.
+
+ Returns
+ -------
+ str
+ Chromosome name the acceptor context is located on
+ """
return self.variant_acceptor_context.get_context_chrom()
- # Returns the origin position of the acceptor context.
def get_acceptor_context_origin(self):
+ """Returns the origin position of the acceptor context.
+
+ Returns
+ -------
+ int
+ Variant genomic position the context is based on
+ """
return self.variant_acceptor_context.get_context_origin()
- # Returns the starting position of the acceptor context.
def get_acceptor_context_start(self):
+ """Returns the starting position of the acceptor context.
+
+ Returns
+ -------
+ int
+ Acceptor context leftmost genomic position
+ """
return self.variant_acceptor_context.get_context_start()
- # Returns the ending position of the acceptor context.
def get_acceptor_context_end(self):
+ """Returns the ending position of the acceptor context.
+
+ Returns
+ -------
+ int
+ Acceptor context rightmost genomic position
+ """
return self.variant_acceptor_context.get_context_end()
- # Returns the length of the acceptor context.
def get_acceptor_context_length(self):
+ """Returns the length of the acceptor context.
+
+ Returns
+ -------
+ int
+ Acceptor context length
+ """
return self.variant_acceptor_context.get_context_length()
- # Returns the acceptor context reads.
def get_acceptor_context_reads(self):
+ """Returns the acceptor context reads.
+
+ Returns
+ -------
+ list of DonorBamRead
+ Acceptor context reads
+ """
return self.variant_acceptor_context.get_context_bam_reads()
- # Returns the lost of read ids in the acceptor context.
def get_acceptor_context_read_ids(self):
+ """Returns the acceptor context read identifiers.
+
+ Returns
+ -------
+ list of str
+ Acceptor context read identifiers
+ """
return self.variant_acceptor_context.get_context_bam_read_ids()
- # Returns a list of all acceptor context BAM read start positions.
def get_acceptor_context_read_starts(self):
+ """Returns the leftmost genomic positions of the acceptor context reads
+
+ Returns
+ -------
+ """
return self.variant_acceptor_context.get_context_bam_read_starts()
- # Returns a list of all acceptor context R1 BAM read left positons.
def get_acceptor_context_read_left_positions(self):
+ """Returns the leftmost genomic positions of all R1 acceptor context reads
+
+ Returns
+ -------
+ list of int
+ Acceptor context leftmost genomic R1 read positions
+ """
return self.variant_acceptor_context.get_context_bam_read_left_positions()
- # Returns a list of all acceptor context BAM read end positions.
def get_acceptor_context_read_ends(self):
+ """Returns the rightmost genomic positions of all acceptor context reads.
+
+ Returns
+ -------
+ list of int
+ Acceptor context rightmost genomic read positions.
+ """
return self.variant_acceptor_context.get_context_bam_read_ends()
- # Returns a list of all acceptor context R2 BAM read end positions.
def get_acceptor_context_read_right_positions(self):
+ """Returns a list of all acceptor context R2 BAM read end positions.
+
+ Returns
+ -------
+ list of int
+ Acceptor context rightmost genomic R2 read positions
+ """
return self.variant_acceptor_context.get_context_bam_read_right_positions()
- # Returns a list of all acceptor context BAM read lengths.
def get_acceptor_context_read_lengths(self):
+ """Returns the lengths of the acceptor context reads.
+
+ Returns
+ -------
+ list of int
+ Acceptor context read lengths
+ """
return self.variant_acceptor_context.get_context_bam_read_lengths()
- # Returns the list of acceptor context unmapped mate read ids
def get_acceptor_context_unmapped_mate_ids(self):
+ """Returns the acceptor context read identifiers with unmapped mates.
+
+ Returns
+ -------
+ list of str
+ Acceptor context read identifiers with unmapped mates.
+ """
return self.variant_acceptor_context.get_unmapped_read_mate_ids()
# ===METHODS TO OBTAIN DONOR CONTEXT DATA==================================
- # Returns whether the variant context has a donor context
def has_donor_context(self):
+ """Checks and returns whether the variant context has a donor context saved.
+
+ Returns
+ -------
+ bool
+ True if the variant context has a donor context, False if not
+ """
return self.variant_donor_context is not None
- # Returns the donor context identifier (should be the same as the
- # variant context id).
def get_donor_context_id(self):
+ """Returns the donor context identifier.
+
+ Returns
+ -------
+ str
+ Donor context identifier
+ """
return self.variant_donor_context.get_context_id()
- # Returns the acceptor context sample id (should be the same as the
- # variant context sample id).
def get_donor_context_sample_id(self):
+ """Returns the sample name/identifier of the donor context
+
+ Returns
+ -------
+ str
+ Donor context sample name/identifier
+ """
return self.variant_donor_context.get_sample_id()
- # Returns the chromosome of the donor context.
def get_donor_context_chrom(self):
+ """Returns the chromosome of the donor context.
+
+ Returns
+ -------
+ """
return self.variant_donor_context.get_context_chrom()
- # Returns the origin position of the donor context.
def get_donor_context_origin(self):
+ """Returns the origin position of the donor context.
+
+ Returns
+ -------
+ int
+ Variant genomic position that the context is constructed from.
+ """
return self.variant_donor_context.get_context_origin()
- # Returns the starting position of the donor context.
def get_donor_context_start(self):
+ """Returns the starting position of the donor context.
+
+ Returns
+ -------
+ int
+ Donor context leftmost genomic position
+ """
return self.variant_donor_context.get_context_start()
- # Returns the ending position of the donor context.
def get_donor_context_end(self):
+ """Returns the ending position of the donor context.
+
+ Returns
+ -------
+ int
+ Donor context rightmost genomic position
+ """
return self.variant_donor_context.get_context_end()
- # Returns the length of the acceptor context.
def get_donor_context_length(self):
+ """Returns the length of the donor context.
+
+ Returns
+ -------
+ int
+ Donor context length
+ """
return self.variant_donor_context.get_context_length()
- # Returns the donor context reads.
def get_donor_context_reads(self):
+ """Returns all donor context reads.
+
+ Returns
+ -------
+ list of DonorBamRead
+ Donor context reads
+ """
return self.variant_donor_context.get_context_bam_reads()
- # Returns the donor context read identifers.
def get_donor_context_read_ids(self):
+ """Returns the identifiers of all donor context reads.
+
+ Returns
+ -------
+ list of str
+ Donor context read identifiers
+ """
return self.variant_donor_context.get_context_bam_read_ids()
- # Returns a list of donor context read start positions.
def get_donor_context_read_starts(self):
+ """Returns the leftmost genomic read positions of all donor context reads.
+
+ Returns
+ -------
+ list of int
+ Donor context leftmost genomic read positions
+ """
return self.variant_donor_context.get_context_bam_read_starts()
- # Returns a list of all acceptor context R1 BAM read left positons.
def get_donor_context_read_left_positions(self):
+ """Returns the leftmost genomic positions of all R1 donor context reads.
+
+ Returns
+ -------
+ list of int
+ Donor context leftmost genomic R1 read positions
+ """
return self.variant_donor_context.get_context_bam_read_left_positions()
- # Returns a list of donor context read end positions.
def get_donor_context_read_ends(self):
+ """Returns the rightmost genomic positions of all donor context reads.
+
+ Returns
+ -------
+ list of int
+ Donor context rightmost genomic read positions
+ """
return self.variant_donor_context.get_context_bam_read_ends()
- # Returns a list of all acceptor context R2 BAM read right positons.
def get_donor_context_read_right_positions(self):
+ """Returns the rightmost genomic positions of all R2 donor context reads
+
+ Returns
+ -------
+ list of int
+ Donor context rightmost genomic R2 read positions
+ """
return self.variant_donor_context.get_context_bam_read_right_positions()
- # Returns a list of donor context BAM read lengths.
def get_donor_context_read_lengths(self):
+ """Returns the lengths of the donor context reads.
+
+ Returns
+ -------
+ list of int
+ Donor context read lengths
+ """
return self.variant_donor_context.get_context_bam_read_lengths()
- # Returns the list of donor context unmapped mate read ids.
def get_donor_context_unmapped_mate_ids(self):
+ """Returns the donor context read identifiers that have unmapped mates.
+
+ Returns
+ -------
+ list of str
+ Donor context read identifiers
+ """
return self.variant_donor_context.get_unmapped_read_mate_ids()
# ===METHODS TO PRODUCE SOME OUTPUT ABOUT THE VARIANT CONTEXT==============
- # Returns a varcon.txt string representation of the variant context.
def to_string(self):
+ """Creates and returns the variant context as a String representation.
+
+ The created String representation of the variant context is equal to the entry of a variant context file. Each
+ included data attribute is separated by a tab.
+
+ Returns
+ -------
+ str
+ Variant context as variant context file entry
+ """
if self.variant_context_areads is None:
ad_ratio = "N/A"
list_areads = None
@@ -509,9 +1111,14 @@ def to_string(self):
+ str(list_areads) + "\t"
+ str(list_dreads))
- # Returns a varconstats.txt string representation of the variant
- # context.
def to_statistics_string(self):
+ """Returns a String with basic variant context statistics.
+
+ Returns
+ -------
+ str
+ Basic tab separated variant context statistics
+ """
areadlen = self.get_average_and_median_acceptor_read_length()
dreadlen = self.get_average_and_median_donor_read_length()
areadqual = self.get_average_and_median_acceptor_read_qual()
diff --git a/VariantContextFile.py b/VariantContextFile.py
index 9052e8d..ea90b38 100644
--- a/VariantContextFile.py
+++ b/VariantContextFile.py
@@ -5,6 +5,21 @@
class VariantContextFile:
+ """The VariantContextFile saves variant contexts
+
+ Attributes
+ -------
+ vaselogger
+ The logger displaying information of activities
+ variant_context_file_location : str
+ THe location of a read variant context file
+ variant_contexts : dict
+ Saves variant contexts by context identifier
+ variant_context_statistics
+ varcon_fields : dict
+ Number representation of each variant context data field
+ """
+
def __init__(self, fileloc=None, samplefilter=None,
varconfilter=None, chromfilter=None):
self.vaselogger = logging.getLogger("VaSe_Logger")
@@ -32,48 +47,126 @@ def __init__(self, fileloc=None, samplefilter=None,
# ===METHODS TO GET DATA FROM THE VARIANT CONTEXT FILE=====================
def get_variant_contexts(self, asdict=False):
+ """Returns the variant contexts.
+
+ By default all variant contexts are gathered in a list and returned. If the asdict parameter is set t True, the
+ variant contexts are returned as a dictionary.
+
+ Returns
+ -------
+ list or dict
+ Variant context as list by default, as dict when parameter is set to True
+ """
if asdict:
return self.variant_contexts
return [varcon for varcon in self.variant_contexts.values()]
- # Returns the variant contexts by sample identifier
def get_variant_contexts_by_sampleid(self):
+ """Gathers variant contexts, sorts them by sample identifiers and returns them.
+
+ Returns
+ -------
+ dict
+
+ """
varcons = self.get_variant_contexts()
return {x.get_variant_context_sample(): [y for y in varcons if y.get_variant_context_sample() ==
x.get_variant_context_sample()] for x in varcons}
- # Returns the number of contexts saved
def get_number_of_contexts(self):
+ """Determines and returns the number of variant contexts.
+
+ Returns
+ -------
+ int
+ Number of variant contexts
+ """
return len(self.variant_contexts)
- # Returns a list of context ids
def get_variant_context_ids(self):
+ """Returns the variant context identifiers.
+
+ Returns
+ -------
+ list of str
+ Context identifiers of all variant contexts
+ """
return [x for x in self.variant_contexts.keys()]
- # Returns a specified variant context.
def get_variant_context(self, contextid):
+ """Checks whether a variant context exists and returns it if so.
+
+ Parameters
+ ----------
+ contextid : str
+ Identifier of context to obtain
+
+ Returns
+ -------
+ VariantContext or None
+ VariantContext if it exists, None if not
+ """
if contextid in self.variant_contexts:
return self.variant_contexts[contextid]
return None
- # Returns whether a variant context is present
def has_variant_context(self, contextid):
+ """Checks and returns whether a specified variant context has a variant context.
+
+ Parameters
+ ----------
+ contextid : str
+ Context identifier to check
+
+ Returns
+ -------
+ bool
+ True if the variant context is present, False if not
+ """
return contextid in self.variant_contexts
- # Returns the acceptor context of the specified variant context.
def get_acceptor_context(self, contextid):
+ """Fetches and returns a specified acceptor context.
+
+ Parameters
+ ----------
+ contextid : str
+ Context identifier of acceptor context to return
+
+ Returns
+ -------
+ OverlapContext or None
+ The acceptor context if it exists, None if not
+ """
if contextid in self.variant_contexts:
return self.variant_contexts[contextid].get_acceptor_context()
return None
- # Returns the donor context of the specified variant context.
def get_donor_context(self, contextid):
+ """Fetches and returns a specified donor context.
+
+ Parameters
+ ----------
+ contextid : str
+ Context identifier of donor context to return
+
+ Returns
+ -------
+ OverlapContext or None
+ The donor context if it exists, None if not
+ """
if contextid in self.variant_contexts:
return self.variant_contexts[contextid].get_donor_context()
return None
- # Returns all variant context acceptor reads.
def get_all_variant_context_acceptor_reads(self):
+ """Gathers and returns the acceptor reads of all variant contexts
+
+ Returns
+ -------
+ list of DonorBamRead
+ Acceptor reads of all variant contexts
+ """
acceptorreads = []
for varcon in self.variant_contexts.values():
acceptorreads.extend(varcon.get_acceptor_reads())
@@ -95,6 +188,13 @@ def get_all_variant_context_acceptor_reads(self):
# return uniqdonorreads
# =============================================================================
def get_all_variant_context_donor_reads(self):
+ """Collects and returns all variant context donor reads.
+
+ Returns
+ -------
+ list of DonorBamRead
+ Variant context donor reads
+ """
dbrs = []
donorreads = []
for varcon in self.variant_contexts.values():
@@ -106,55 +206,133 @@ def get_all_variant_context_donor_reads(self):
dbr.get_bam_read_qual()))
return list(set(donorreads))
- # Returns all variant context acceptor read ids.
def get_all_variant_context_acceptor_read_ids(self):
+ """Returns all variant context acceptor read identifiers.
+
+ Returns
+ -------
+ acceptorreadids : list of str
+ Variant context acceptor read identifiers
+ """
acceptorreadids = []
for varcon in self.variant_contexts.values():
acceptorreadids.extend(varcon.get_acceptor_read_ids())
return acceptorreadids
- # Returns the variant context donor read ids.
def get_all_variant_context_donor_read_ids(self):
+ """Returns all variant context donor read identifiers.
+
+ Returns
+ -------
+ donorreadids : list of str
+ Variant context donor read identifiers
+ """
donorreadids = []
for varcon in self.variant_contexts.values():
donorreadids.extend(varcon.get_donor_read_ids())
return donorreadids
- # Returns the variant context field data
def get_variant_context_fields(self):
+ """Returns the number representations of the variant context data.
+
+ Returns
+ -------
+ self.varcon_fields : dict
+ Number representation of each variant context field
+ """
return self.varcon_fields
# ===METHODS TO OBTAIN VARIANT CONTEXT DATA=============================
def get_variant_context_areads(self, contextid):
+ """Returns the variant context acceptor reads for a specified variant context.
+
+ Parameters
+ ----------
+ contextid : str
+ Variant context identifier
+
+ Returns
+ -------
+ list of DonorBamRead
+ Variant context acceptor reads, empty list of context does not exist
+ """
if contextid in self.variant_contexts:
return self.variant_contexts[contextid].get_acceptor_reads()
return []
- # Returns the donor reads of a specified variant context
def get_variant_context_dreads(self, contextid):
+ """Returns the variant context donor reads for a specified variant context.
+
+ Parameters
+ ----------
+ contextid : str
+ Variant context identifier
+
+ Returns
+ -------
+ list of DonorBamRead
+ Variant context donor reads, empty list if context does not exist
+ """
if contextid in self.variant_contexts:
return self.variant_contexts[contextid].get_donor_reads()
return []
- # Returns the acceptor context reads
def get_acceptor_context_reads(self, contextid):
+ """Returns the reads of a specified acceptor context.
+
+ Parameters
+ ----------
+ contextid : str
+ Acceptor context identifier
+
+ Returns
+ -------
+ list of DonorBamRead
+ Acceptor context reads
+ """
if contextid in self.variant_contexts:
if self.variant_contexts[contextid].has_acceptor_context():
return self.variant_contexts[contextid].get_acceptor_context_reads()
return []
- # Returns the reads of the donor context
def get_donor_context_reads(self, contextid):
+ """Returns the reads of a specified donor context.
+
+ Parameters
+ ----------
+ contextid : str
+ Donor context identifier
+
+ Returns
+ -------
+ list of DonorBamRead
+ Donor context reads, empty list if context does not exist
+ """
if contextid in self.variant_contexts:
if self.variant_contexts[contextid].has_donor_context():
return self.variant_contexts[contextid].get_donor_context_reads()
return []
# ===BASIC VARIANTCONTEXTFILE METHODS======================================
- # Reads a provided variant context file and saves data according to
- # set filters.
def read_variant_context_file(self, fileloc, samplefilter=None,
- IDfilter=None, chromfilter=None):
+ idfilter=None, chromfilter=None):
+ """Reads a provided variant context file and saves the data.
+
+ Filter can be set for reading the variant context file. The sample filter can nbe used to specify which samples
+ to save. Samples not in the samplefilter will be skipped. Similarly filter for variant contexts and chromosome
+ names can be set.
+
+ Parameters
+ ----------
+ fileloc : str
+ Path to variant context file to read
+ samplefilter : list of str
+ Sample names/identifiers to include
+ idfilter : list of str
+ Variant contexts to include
+ chromfilter : list of str
+ Chromosome names to include
+ """
try:
with open(fileloc, "r") as vcfile:
varcon_records = vcfile.readlines()
@@ -168,7 +346,7 @@ def read_variant_context_file(self, fileloc, samplefilter=None,
if record[0] in self.variant_contexts:
continue
- IDpass = self.passes_filter(record[0], IDfilter)
+ IDpass = self.passes_filter(record[0], idfilter)
samplepass = self.passes_filter(record[1], samplefilter)
chrompass = self.passes_filter(record[2], chromfilter)
if not (IDpass and samplepass and chrompass):
@@ -179,8 +357,20 @@ def read_variant_context_file(self, fileloc, samplefilter=None,
new_varcon = VariantContext(*record[:6], acceptor_reads, donor_reads)
self.variant_contexts[record[0]] = new_varcon
- # Reads an acceptor context file
def read_acceptor_context_file(self, accconfileloc, samplefilter=None, contextfilter=None, chromfilter=None):
+ """Reads a provided acceptor context file and adds the acceptor contexts to already existing variant contexts.
+
+ Parameters
+ ----------
+ accconfileloc : str
+ Path to the acceptor contexts file
+ samplefilter : list of str
+ Sample names/identifiers to include
+ contextfilter : list of str
+ Acceptor contexts to include
+ chromfilter : list of str
+ Chromosome names to include
+ """
try:
with open(accconfileloc, "r") as accconfile:
next(accconfile) # Skip the header line
@@ -205,6 +395,19 @@ def read_acceptor_context_file(self, accconfileloc, samplefilter=None, contextfi
# Reads a donor context file
def read_donor_context_file(self, donconfileloc, samplefilter=None, contextfilter=None, chromfilter=None):
+ """Reads a provided donor contexts file and adds the donor contexts to already existing variant contexts.
+
+ Parameters
+ ----------
+ donconfileloc : str
+ Path to the donr contexts file
+ samplefilter : list of str
+ Sample names/identifiers to include
+ contextfilter : list of str
+ Donor contexts to include
+ chromfilter : list of str
+ Chromosome names to include
+ """
try:
with open(donconfileloc, "r") as donconfile:
next(donconfile) # Skip the header line
@@ -227,45 +430,121 @@ def read_donor_context_file(self, donconfileloc, samplefilter=None, contextfilte
except IOError:
self.vaselogger.warning(f"Could not read donor context file: {donconfileloc}")
- # Returns whether something is in the filter or not.
def passes_filter(self, valtocheck, filterlist):
+ """Tests and returns whether a provided value is in a provided filter.
+
+ Filters are expected to be inclusion filters, denoting a list of values that should be used/included.
+
+ Parameters
+ ----------
+ valtocheck : str
+ Value to check against a filter list
+ filterlist : list of str
+ Values to filter with
+
+ Returns
+ -------
+ bool
+ True if value is in filter, False otherwise
+ """
if filterlist is not None:
return valtocheck in filterlist
return True
# ===VARIANT CONTEXT SET OPERATIONS (UNION, INTERSECT, DIFFERENCE)=========
- # Returns the list of variant context identifiers from both variant
- # context files.
def get_variant_contexts_union(self, other_varcon_file):
+ """Returns all variant context identifiers from both variant context files.
+
+ Parameters
+ ----------
+ other_varcon_file : VariantContextFile
+ A VariantContextFile object with VariantContexts
+
+ Returns
+ -------
+ list of str
+ All variant context identifiers of both VariantContextFile objects
+ """
own_varcon_ids = self.get_variant_context_ids()
other_varcon_ids = other_varcon_file.get_variant_context_ids()
return list(set(own_varcon_ids) | set(other_varcon_ids))
- # Returns the list of variant context identifiers present in both
- # variant context files.
def get_variant_contexts_intersect(self, other_varcon_file):
+ """Returns variant context identifiers present in both variant context files.
+
+ Parameters
+ ----------
+ other_varcon_file : VariantContextFile
+ VariantContextFile with VariantContext objects
+
+ Returns
+ -------
+ list of str
+ Shared variant context identifiers
+ """
own_varcon_ids = self.get_variant_context_ids()
other_varcon_ids = other_varcon_file.get_variant_context_ids()
return list(set(own_varcon_ids) & set(other_varcon_ids))
- # Returns the list of variant context identifiers in this file but
- # not present in the other variant context file.
def get_variant_contexts_difference(self, other_varcon_file):
+ """Returns the variant context identifiers not present in the other variant context file.
+
+ Parameters
+ ----------
+ other_varcon_file : VariantContextFile
+ VariantContextFile with variant contexts
+
+ Returns
+ -------
+ list of str
+ Variant context identifiers not present in the other variant context file
+ """
own_varcon_ids = self.get_variant_context_ids()
other_varcon_ids = other_varcon_file.get_variant_context_ids()
return list(set(own_varcon_ids) - set(other_varcon_ids))
- # Returns the list of variant context identifiers only present in
- # one of the variant context file but not the other.
def get_variant_contexts_symmetric_difference(self, other_varcon_file):
+ """Determines the variant context identifiers present in either one or the other variant context file.
+
+ THe normal difference only returns the context identifiers in A but not in B. The symmetric difference returns
+ the context identifiers in A but not in B as well as context identifiers in B but not in A.
+
+ Parameters
+ ----------
+ other_varcon_file : VariantContextFile
+
+ Returns
+ -------
+ list of str
+ Variant context read identifiers
+ """
own_varcon_ids = self.get_variant_context_ids()
other_varcon_ids = other_varcon_file.get_variant_context_ids()
return list(set(own_varcon_ids) ^ set(other_varcon_ids))
- # ===METHODS TO OBTAIN VARIANT CONTEXT DATA BASED ON FILTERS===============
- # Returns a list/hashmap of VariantContextObjects.
+ # ===METHODS TO OBTAIN VARIANT CONTEXT DATA BASED ON FILTERS===============.
def get_variant_contexts2(self, aslist=False, varconfilter=None,
samplefilter=None, chromfilter=None):
+ """Returns the variant contexts in the VariantContextFile.
+
+ Inclusion filters can be set to subset the returned variant contexts. FIlters include sample names/identifiers,
+ variant context identifiers and chromosome names. Variant contexts can be returned as list or dictionary.
+
+ Parameters
+ ----------
+ aslist : bool
+ Return variant context in a list
+ varconfilter : list of str
+ Contexts to include
+ samplefilter : list of str
+ Samples to include
+ chromfilter : list of str
+
+ Returns
+ -------
+ list or dict of VariantContext
+ Returns variant contexts as list if aslist set to True, dict otherwise
+ """
if aslist:
return [
x for x in self.variant_contexts.values()
@@ -286,9 +565,30 @@ def get_variant_contexts2(self, aslist=False, varconfilter=None,
}
# ====METHODS TO ASSESS WHETHER A VARIANT IS IN AN EXISTING CONTEXT========
- # Main method that returns whether a variant (SNP or indel).
def variant_is_in_context(self, varianttype, searchchrom,
searchstart, searchstop):
+ """Returns whether a variant is located in an already existing context.
+
+ If the variant type is set to 'snp' the method will check and return whether the SNP overlaps with a variant
+ context. If the variant is set to 'indel' the method will check and return wheterh the area from start to end
+ of the indel overlaps with a variant context.
+
+ Parameters
+ ----------
+ varianttype : str
+ Type of variant (snp/indel)
+ searchchrom : str
+ Chromosome name the variant is located on
+ searchstart : int
+ Leftmost genomic search window position
+ searchstop : int
+ Rightmost genomic search window position
+
+ Returns
+ -------
+ bool or None
+ True if the variant overlaps with a context, False if not
+ """
if varianttype == "snp":
return self.snp_variant_is_in_context(searchchrom, searchstart)
if varianttype == "indel":
@@ -296,9 +596,25 @@ def variant_is_in_context(self, varianttype, searchchrom,
searchstop)
return None
- # Determines whether an SNP variant is located in an already
- # existing variant context.
+ # Determines whether an SNP variant is located in an already existing variant context.
def snp_variant_is_in_context(self, varchrom, vcfvarpos):
+ """Checks and returns whether a SNP is located in an already existing variant context.
+
+ For each variant context it is first checked whether the chromosome name of the context and SNP are the same. If
+ so, it is then checked whether the SNP genomic position is equal or larger than the context start and smaller or
+ equal than the context end.
+
+ Parameters
+ ----------
+ varchrom : str
+ Chromosome name the SNP is located on
+ vcfvarpos : int
+ Genomic position of the SNP
+ Returns
+ -------
+ bool
+ True if the SNP is in a variant context, False if not
+ """
for varcon in self.variant_contexts.values():
if varchrom == varcon.get_variant_context_chrom():
if (vcfvarpos >= varcon.get_variant_context_start()
@@ -306,10 +622,26 @@ def snp_variant_is_in_context(self, varchrom, vcfvarpos):
return True
return False
- # Determines whether an indel variant is located within an existing
- # variant context (indelLeftPos and indelRightPos can be the used
- # search window).
def indel_variant_is_in_context(self, indelchrom, indelleftpos, indelrightpos):
+ """Checks and returns whether an indel is located in an already existing variant context.
+
+ For each variant context it is first checked whether the chromosome name of the context and indel are the same.
+ If so, it is than checked whether the indel range from indel start to end overlaps with the context.
+
+ :Parameters
+ -----------
+ indelchrom : str
+ The chromosome name the indel is located on
+ indelleftpos : int
+ The leftmost genomic position of the indel
+ indelrightpos : int
+ The rightmost genomic position of the indel
+
+ Returns
+ -------
+ bool
+ True if the indel overlaps with a variant context, False if not
+ """
for varcon in self.variant_contexts.values():
if (indelchrom == varcon.get_variant_context_chrom()):
if (indelleftpos <= varcon.get_variant_context_start()
@@ -323,8 +655,19 @@ def indel_variant_is_in_context(self, indelchrom, indelleftpos, indelrightpos):
return True
return False
- # Checks whether a context is located in an already existing context
def context_collision(self, context_arr):
+ """Checks and returns whether a potential context overlaps with an already existing variant context.
+
+ Parameters
+ ----------
+ context_arr: list
+ Essential context data (chrom, variant pos, start, end)
+
+ Returns
+ -------
+ bool
+ True if provided context overlaps with an already existing context, False if not
+ """
if f"{context_arr[0]}_{context_arr[1]}" in self.variant_contexts:
return True
else:
@@ -336,16 +679,49 @@ def context_collision(self, context_arr):
return False
# ===METHODS TO ADD DATA/VARIANT CONTEXTS TO THE VARIANT CONTEXT FILE======
- # Sets a variant context object to a specified context id.
def set_variant_context(self, varconid, varcontext):
+ """Sets a provided variant context to the provided context identifier. Will overwrite the previous value for
+ the provided context identifier.
+
+ Parameters
+ ----------
+ varconid : str
+ Identifier of the variant context to add
+ varcontext: VariantContext
+ The VariantContext to set
+ """
self.variant_contexts[varconid] = varcontext
- # Adds a variant context object
def add_variant_context(self, varconid, sampleid,
varconchrom, varconorigin,
varconstart, varconend,
varcon_areads, varcon_dreads,
acceptor_context=None, donor_context=None):
+ """Constructs a VariantContext from the provided parameters and adds it.
+
+ Parameters
+ ----------
+ varconid : str
+ Identifier of the variant context
+ sampleid : str
+ Identifier of the sample
+ varconchrom : str
+ Chromosome name of the variant context
+ varconorigin : int
+ The variant position the context is based on
+ varconstart : int
+ Leftmost genomic position of the variant context
+ varconend : int
+ Rightmost genomic position of the variant context
+ varcon_areads : list of DonorBamRead
+ Variant context acceptor reads
+ varcon_dreads : list of DonorBamRead
+ Variant contect donor reads
+ acceptor_context : OverlapContext
+ Acceptor context belonging to the variant context
+ donor_context : OverlapContext
+ Donor context belonging to the variant context
+ """
varcon_obj = VariantContext(varconid, sampleid,
varconchrom, varconorigin,
varconstart, varconend,
@@ -353,17 +729,58 @@ def add_variant_context(self, varconid, sampleid,
acceptor_context, donor_context)
self.variant_contexts[varconid] = varcon_obj
- # Adds an acceptor context object to a variant context.
+ def add_existing_variant_context(self, varconid, varconobj):
+ """Add an already created variant context to the variant context file.
+
+ The variant context to add should be a valid VariantContext object. It will be added to the variant context
+ file under the context identifier.
+
+ Parameters
+ ----------
+ varconid : str
+ Identifier of the context
+ varconobj : VariantContext
+ The created variant context
+ """
+ if varconobj is not None:
+ self.variant_contexts[varconid] = varconobj
+
def set_acceptor_context(self, varconid, acceptor_context):
+ """Adds an existing acceptor context to a variant context.
+
+ Parameters
+ ----------
+ varconid : str
+ Context identifier
+ acceptor_context : OverlapContext
+ The acceptor context to add
+ """
if varconid in self.variant_contexts:
self.variant_contexts[varconid].set_acceptor_context(acceptor_context)
- # Add a newly created acceptor context to an existing variant
- # context.
def add_acceptor_context(self, contextid, sampleid,
contextchrom, contextorigin,
contextstart, contextend,
acceptorreads):
+ """Constructs and adds an acceptor context to a specified variant context.
+
+ Parameters
+ ----------
+ contextid : str
+ Context identifier
+ sampleid: str
+ Sample name/identifier
+ contextchrom: str
+ Chromosome name the context is located on
+ contextorigin: int
+ Variant genomic position the context is based on
+ contextstart: int
+ Leftmost genomic position of the context
+ contextend: int
+ Rightmost genomic position of the context
+ acceptorreads: list of DonorBamRead
+ List of reads associated with the context
+ """
if contextid in self.variant_contexts:
self.variant_contexts[contextid].add_acceptor_context(
contextid, sampleid,
@@ -371,16 +788,42 @@ def add_acceptor_context(self, contextid, sampleid,
contextstart, contextend,
acceptorreads)
- # Sets the donor context object of a variant context
def set_donor_context(self, varconid, donor_context):
+ """Adds an already existing donor context to a specified variant context.
+
+ Parameters
+ ----------
+ varconid : str
+ Context identifier
+ donor_context : OverlapContext
+ Donor context to add
+ """
if varconid in self.variant_contexts:
self.variant_contexts[varconid].set_donor_context(donor_context)
- # Add a newly created donor context to an existing variant context.
def add_donor_context(self, contextid, sampleid,
contextchrom, contextorigin,
contextstart, contextend,
donorreads):
+ """Constructs a donor context from the provided parameters and adds it to a specified variant context.
+
+ Parameters
+ ----------
+ contextid : str
+ Variant context identifier
+ sampleid : str
+ Sample name/identifier
+ contextchrom : str
+ Chromosome name the context is located on
+ contextorigin : int
+ Variant position that constructed the context
+ contextstart : int
+ Donor context leftmost genomic position
+ contextend : int
+ Donor context rightmost genomic position
+ donorreads : list of DonorBamRead
+ Donor context reads
+ """
if contextid in self.variant_contexts:
self.variant_contexts[contextid].add_donor_context(
contextid, sampleid,
@@ -388,95 +831,221 @@ def add_donor_context(self, contextid, sampleid,
contextstart, contextend,
donorreads)
- # ===METHODS TO ADD UNMAPPED MATE IDS TO ACCEPTOR, DONOR AND VARIANT=======
- # ===CONTEXT===
- # Sets the unmapped mate read id for a specified acceptor context.
+ # ===METHODS TO ADD UNMAPPED MATE IDS TO ACCEPTOR, DONOR AND VARIANT CONTEXT=======
def set_acceptor_context_unmapped_mate_ids(self, contextid, mateids):
+ """Sets the read identifiers that have unmapped read mates fo a specified acceptor context.
+
+ Parameters
+ ----------
+ contextid : str
+ Acceptor context to set unmapped read identifiers for
+ mateids : lit of str
+ Read identifiers with unmapped mates
+ """
if contextid in self.variant_contexts:
self.variant_contexts[contextid].set_acceptor_context_unmapped_mates(mateids)
- # Sets the unmapped mate read id for a specified donor context.
def set_donor_context_unmapped_mate_ids(self, contextid, mateids):
+ """Sets the read identifiers that have unmapped read mates for a specified donor context.
+
+ Parameters
+ ----------
+ contextid : str
+ Donor context to set unmapped read identifiers for
+ mateids : list of str
+ Donor context read identifiers with unmapped mates
+ """
if contextid in self.variant_contexts:
self.variant_contexts[contextid].set_donor_context_unmapped_mates(mateids)
- # Sets the acceptor unmapped mate ids for a specified variant
- # context.
def set_unmapped_acceptor_mate_ids(self, contextid, mateids):
+ """Sets the acceptor read identifiers that have unmapped mates for a specified variant context.
+
+ Parameters
+ ----------
+ contextid : str
+ Variant contexct identifier to set unmapped acceptor read identifers for
+ mateids : list of str
+ Variant context acceptor read identifiers with unmapped mates
+ """
if contextid in self.variant_contexts:
self.variant_contexts[contextid].set_unmapped_acceptor_mate_ids(mateids)
- # Sets the donor unmapped mate ids for a specified variant context.
def set_unmapped_donor_mate_ids(self, contextid, mateids):
+ """Sets the donor read identifiers that have unmapped mates for a specified variant context.
+
+ Parameters
+ ----------
+ contextid : str
+ Variant context identifier to set unmapped donor read identifiers for
+ mateids: list of str
+ Variant context donor read identifiers with unmapped mates
+ """
if contextid in self.variant_contexts:
self.variant_contexts[contextid].set_unmapped_donor_mate_ids(mateids)
- # ===METHODS TO GET UNMAPPED MATE IDS OF AN ACCEPTOR, DONOR AND============
- # ===VARIANT CONTEXT===
- # Return the unmapped read mate identifiers of a specified acceptor
- # context.
+ # ===METHODS TO GET UNMAPPED MATE IDS OF AN ACCEPTOR, DONOR AND VARIANT CONTEXT============
def get_acceptor_context_unmapped_mate_ids(self, contextid):
+ """Returns the acceptor context read identifiers that have unmapped read mates.
+
+ Parameters
+ ----------
+ contextid : str
+ Acceptor context identifier
+
+ Returns
+ -------
+ list of str
+ Acceptor context read identifiers, empty list if context does not exist
+ """
if contextid in self.variant_contexts:
return self.variant_contexts[contextid].get_acceptor_context_unmapped_mate_ids()
return []
- # Returns the unmapped read mate identifiers of a specified donor
- # context.
def get_donor_context_unmapped_mate_ids(self, contextid):
+ """Returns the donor context read identifiers that have unmapped read mates.
+
+ Parameters
+ ----------
+ contextid : str
+ Donor context identifier
+
+ Returns
+ -------
+ list of str
+ Donor contexts read identifiers, empty list if context does not exist
+ """
if contextid in self.variant_contexts:
return self.variant_contexts[contextid].get_donor_context_unmapped_mate_ids()
return []
- # Returns the unmapped acceptor read mate identifiers of a specified
- # variant context.
def get_unmapped_acceptor_mate_ids(self, contextid):
+ """Returns the variant context acceptor read identifiers that have unmapped read mates.
+
+ Parameters
+ ----------
+ contextid : str
+ Variant context identifier
+
+ Returns
+ -------
+ list of str
+ Variant context acceptor read identiifers, empty list if context does not exist
+ """
if contextid in self.variant_contexts:
return self.variant_contexts[contextid].get_unmapped_acceptor_read_ids()
return []
- # Returns the unmapped donor read mate identifiers of a specified
- # variant context.
def get_unmapped_donor_mate_ids(self, contextid):
+ """Returns the variant context donor read identifiers that have unmapped read mates.
+
+ Parameters
+ ----------
+ contextid : str
+
+ Returns
+ -------
+ list of str
+ Variant context donor read identifiers, empty list if context does not exist
+ """
if contextid in self.variant_contexts:
return self.variant_contexts[contextid].get_unmapped_donor_read_ids()
return []
# ===METHODS TO OBTAIN SOME STATISTICS ABOUT ALL THE CONTEXTS==============
- # Returns the average variant context length within this variant
- # context file.
def get_average_variant_context_length(self):
+ """Calculates and returns the mean variant context length.
+
+ The mean length is calculated over all variant contexts in the variant context file. Acceptor and donor contexts
+ associated with the variant contexts are not used.
+
+ Returns
+ -------
+ float
+ Average variant context length
+ """
return statistics.mean([varcon.get_variant_context_length()
for varcon in self.variant_contexts.values()])
- # Returns the median variant context length within this variant
- # context file.
def get_median_variant_context_length(self):
+ """Calculates and returns the median variant context length.
+
+ The median length is calculated over all variant contexts in the variant context file. Acceptor and donor
+ contexts associated with the variant contexts are not used.
+
+ Returns
+ -------
+ float
+ Median variant context length
+ """
return statistics.median([varcon.get_variant_context_length()
for varcon in self.variant_contexts.values()])
- # Returns the average number of variant context acceptor reads
def get_average_variant_context_acceptor_reads(self):
+ """Calculates and returns the average number of variant context acceptor reads.
+
+ The mean number of acceptor reads is calculated over all variant contexts.
+
+ Returns
+ -------
+ float
+ Mean number of variant context acceptor reads
+ """
return statistics.mean([varcon.get_number_of_acceptor_reads() for varcon in self.variant_contexts.values()])
- # Returns the average number of variant context donor reads
def get_average_variant_context_donor_reads(self):
+ """Calculates and returns the average number of variant context donor reads.
+
+ The mean number of donor reads is calculated over all variant contexts.
+
+ Returns
+ -------
+ float
+ Mean number of variant context donor reads
+ """
return statistics.mean([varcon.get_number_of_donor_reads() for varcon in self.variant_contexts.values()])
- # Returns the median number of variant context acceptor reads
def get_median_variant_context_acceptor_reads(self):
+ """Calculates and returns the median variant context acceptor reads.
+
+ Returns
+ -------
+ float
+ Median number of variant context acceptor reads
+ """
return statistics.median([varcon.get_number_of_acceptor_reads() for varcon in self.variant_contexts.values()])
- # Returns the median number of variant context donor reads
def get_median_variant_context_donor_reads(self):
+ """Calculates and returns the median number of variant context donor reads.
+
+ Returns
+ -------
+ float
+ Median number of variant context donor reads
+ """
return statistics.median([varcon.get_number_of_donor_reads() for varcon in self.variant_contexts.values()])
# ===METHODS TO WRITE VARIANT CONTEXT DATA TO A FILE=======================
- # Writes the variant context data to an output file.
- def write_variant_context_file(self, outfileloc, samplefilter=None,
+ def write_variant_context_file(self, outfileloc, vbuuid, samplefilter=None,
varconfilter=None, chromfilter=None):
+ """Writes the data saved in the variant contexts to an output file.
+
+ Parameters
+ ----------
+ outfileloc : str
+ Path to write the variant context file to
+ vbuuid : str
+ VaSeBuilder unique identifier that build the VariantContextFile
+ samplefilter : list of str
+ Sample names/identifiers to include
+ varconfilter : list of str
+ Variant contexts to include
+ chromfilter : list of str
+ Chromosome names to include
+ """
try:
-
with open(outfileloc, "w") as varcon_outfile:
+ varcon_outfile.write(f"#VBUUID: {vbuuid}\n")
varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t"
"Start\tEnd\tAcceptorContextLength\t"
"DonorContextLength\tAcceptorReads\t"
@@ -501,13 +1070,26 @@ def write_variant_context_file(self, outfileloc, samplefilter=None,
self.vaselogger.warning("Could not write variant contexts to "
f"{ioe.filename}")
- # Writes the donor contexts used to construct the variant contexts.
- def write_acceptor_context_file(self, outfileloc, samplefilter=None,
+ def write_acceptor_context_file(self, outfileloc, vbuuid, samplefilter=None,
contextfilter=None, chromfilter=None):
+ """Writes the acceptor contexts associated with variants contexts to an output file.
+
+ Parameters
+ ----------
+ outfileloc: str
+ Path and name to write acceptor contexts to
+ samplefilter: list of str
+ Sample names/identifiers to include
+ contextfilter: list of str
+ Acceptor contexts to include
+ chromfilter: list of str
+ Chromosome names to include
+ """
try:
- with open(outfileloc, "w") as varcon_oufile:
- varcon_oufile.write("#ContextId\tDonorSample\tChrom\tOrigin\t"
- "Start\tEnd\tNumOfReads\tReadIds\n")
+ with open(outfileloc, "w") as varcon_outfile:
+ varcon_outfile.write(f"#VBUUID: {vbuuid}\n")
+ varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t"
+ "Start\tEnd\tNumOfReads\tReadIds\n")
for varcon in self.variant_contexts.values():
samplepass = self.passes_filter(
varcon.get_variant_context_sample(),
@@ -522,19 +1104,29 @@ def write_acceptor_context_file(self, outfileloc, samplefilter=None,
chromfilter
)
if samplepass and varconpass and chrompass:
- varcon_oufile.write(
- varcon.get_acceptor_context().to_string() + "\n"
- )
+ varcon_outfile.write(varcon.get_acceptor_context().to_string() + "\n")
except IOError as ioe:
self.vaselogger.warning("Could not write acceptor contexts to "
f"{ioe.filename}")
- # Writes the acceptor cotnexts used to construct the variant
- # contexts.
- def write_donor_context_file(self, outfileloc, samplefilter=None,
+ def write_donor_context_file(self, outfileloc, vbuuid, samplefilter=None,
contextfilter=None, chromfilter=None):
+ """Writes the donor contexts associated with variant context to an output file.
+
+ Parameters
+ ----------
+ outfileloc : str
+ Path and name to write the donor contexts to
+ samplefilter : list of str
+ Sample names/identifiers to include
+ contextfilter : list of str
+ Donor contexts to include
+ chromfilter : list of str
+ Chromosome names to include
+ """
try:
with open(outfileloc, "w") as varcon_outfile:
+ varcon_outfile.write(f"#VBUUID: {vbuuid}\n")
varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t"
"Start\tEnd\tNumOfReads\tReadIds\n")
for varcon in self.variant_contexts.values():
@@ -558,11 +1150,19 @@ def write_donor_context_file(self, outfileloc, samplefilter=None,
self.vaselogger.warning("Could not write donor contexts to "
f"{ioe.filename}")
- # Writes some statistics about the acceptor and donor reads
- # identified for each variant context.
- def write_variant_context_stats(self, statsoutloc):
+ def write_variant_context_stats(self, statsoutloc, vbuuid):
+ """Writes basic statistics of the variant contexts to a specified output file.
+
+ Basic statistics include means and medians of read lengths, Q-Score and mapping quality.
+
+ Parameters
+ ----------
+ statsoutloc : str
+ Path and name to write variant context statistics output file to
+ """
try:
with open(statsoutloc, "w") as varcon_statsfile:
+ varcon_statsfile.write(f"#VBUUID: {vbuuid}\n")
varcon_statsfile.write("#ContextId\tAvg_ALen\tAvg_DLen\t"
"Med_ALen\tMed_DLen\tAvg_AQual\t"
"Avg_DQual\tMed_AQual\tMed_DQual\t"
@@ -574,11 +1174,19 @@ def write_variant_context_stats(self, statsoutloc):
self.vaselogger.critical("Could not write variant context "
f"statistics to {statsoutloc}")
- # Writes some statistics about the acceptor and donor reads
- # identified for each variant context.
- def write_acceptor_context_stats(self, statsoutloc):
+ def write_acceptor_context_stats(self, statsoutloc, vbuuid):
+ """Writes basic statistics of the acceptor contexts to a specified output file.
+
+ Basic statistics include means and medians of read lengths, Q-Score and mapping quality.
+
+ Parameters
+ ----------
+ statsoutloc: str
+ Path and name to write the acceptor context statistics file to
+ """
try:
with open(statsoutloc, "w") as varcon_statsfile:
+ varcon_statsfile.write(f"#VBUUID: {vbuuid}\n")
varcon_statsfile.write("#ContextId\tAvg_ReadLen\tMed_ReadLen\t"
"Avg_ReadQual\tMed_ReadQual\t"
"Avg_ReadMapQ\tMed_ReadMapQ\n")
@@ -591,11 +1199,20 @@ def write_acceptor_context_stats(self, statsoutloc):
self.vaselogger.critical("Could not write acceptor context "
f"statistics to {statsoutloc}")
- # Writes some statistics about the acceptor and donor reads
- # identified for each variant context.
- def write_donor_context_stats(self, statsoutloc):
+ # Writes some statistics about the acceptor and donor reads identified for each variant context.
+ def write_donor_context_stats(self, statsoutloc, vbuuid):
+ """Writes basic statistics of the donor contexts to a specified output file.
+
+ Basic statistics include means and medians of read lengths, Q-Score and mapping quality.
+
+ Parameters
+ ----------
+ statsoutloc : str
+ Path and name to write the donor context statistics to
+ """
try:
with open(statsoutloc, "w") as varcon_statsfile:
+ varcon_statsfile.write(f"#VBUUID: {vbuuid}\n")
varcon_statsfile.write("#ContextId\tAvg_ReadLen\tMed_ReadLen\t"
"Avg_ReadQual\tMed_ReadQual\t"
"Avg_ReadMapQ\tMed_ReadMapQ\n")
@@ -608,11 +1225,21 @@ def write_donor_context_stats(self, statsoutloc):
self.vaselogger.critical("Coud not write donor context statistics "
f"to {statsoutloc}")
- # Writes the left and right positions to the output file. Left pos
- # for R1 and right pos for R2.
- def write_left_right_positions(self, typetowrite, outfileloc):
+ def write_left_right_positions(self, typetowrite, outfileloc, vbuuid):
+ """Writes variant context leftmost and rightmost genomic read positions to an output file.
+
+ The leftmost genomic positions of R1 reads and rightmost genomic positions for R2 reads are written to file.
+
+ Parameters
+ ----------
+ typetowrite : str
+ Type (acceptor/donor) of file to write
+ outfileloc : str
+ Path and name to write the output file to
+ """
try:
with open(outfileloc, "w") as lrpof:
+ lrpof.write(f"#VBUUID: {vbuuid}")
lrpof.write("#ContextId\tLeftPos\tRightPos\n")
for varcon in self.variant_contexts.values():
leftpositions, rightpositions = [], []
@@ -641,11 +1268,19 @@ def write_left_right_positions(self, typetowrite, outfileloc):
self.vaselogger.warning("Could not write read left positions to "
f"output file {outfileloc}")
- # Writes the acceptor context left and right positions to the output
- # file. Left pos for R1 and right pos for R2.
- def write_acceptor_left_right_positions(self, outfileloc):
+ def write_acceptor_left_right_positions(self, outfileloc, vbuuid):
+ """Writes acceptor context leftmost and rightmost genomic read positions to an output file.
+
+ The leftmost genomic position of R1 reads and rightmost genomic positions for R2 reads are written to file.
+
+ Parameters
+ ----------
+ outfileloc : str
+ Path and name to write the output file to
+ """
try:
with open(outfileloc, "w") as lrpof:
+ lrpof.write(f"#VBUUID: {vbuuid}\n")
lrpof.write("#ContextId\tLeftPos\tRightPos\n")
for varcon in self.variant_contexts.values():
leftpositions = [
@@ -663,11 +1298,19 @@ def write_acceptor_left_right_positions(self, outfileloc):
self.vaselogger.warning("Could not write read left positions to "
f"output file {outfileloc}")
- # Writes the left and right positions to the output file. Left pos
- # for R1 and right pos for R2.
- def write_donor_left_right_positions(self, outfileloc):
+ def write_donor_left_right_positions(self, outfileloc, vbuuid):
+ """Writes donor context leftmost and rightmost genomic read positions to an output file.
+
+ The leftmost genomic position of R1 reads and rightmost genomic positions for R2 reads are written to file.
+
+ Parameters
+ ----------
+ outfileloc : str
+ Path and name to write the output file to
+ """
try:
with open(outfileloc, "w") as lrpof:
+ lrpof.write(f"#VBUUID: {vbuuid}\n")
lrpof.write("#ContextId\tLeftPos\tRightPos\n")
for varcon in self.variant_contexts.values():
leftpositions = [
@@ -681,70 +1324,121 @@ def write_donor_left_right_positions(self, outfileloc):
lrpof.write(str(varcon.get_variant_context_id()) + "\t"
+ ",".join(leftpositions) + "\t"
+ ",".join(rightpositions) + "\n")
- except IOError as ioe:
+ except IOError:
self.vaselogger.warning("Could not write read left positions to "
f"output file {outfileloc}")
# Writes the identifiers of reads that have unmapped mates per
# sample to a file. Samples are all donors and the ?template?.
- def write_reads_with_unmapped_mate(self, typetowrite, umfileloc):
+ def write_reads_with_unmapped_mate(self, typetowrite, umfileloc, vbuuid):
+ """Writes variant context read identifiers with unmapped mates to a specified output file.
+
+ Parameters
+ ----------
+ typetowrite : str
+ Write variant context acceptor or donor read identifiers
+ umfileloc : str
+ Path to write the output file to
+ """
try:
- with open(umfileloc, "w") as umFile:
- umFile.write("#ContextId\tSampleId\tReadIds\n")
+ with open(umfileloc, "w") as umfile:
+ umfile.write(f"#VBUUID: {vbuuid}\n")
+ umfile.write("#ContextId\tSampleId\tReadIds\n")
for varcon in self.variant_contexts.values():
if typetowrite == "acceptor":
- umFile.write(
+ umfile.write(
varcon.get_variant_context_id() + "\t"
+ str(varcon.get_variant_context_sample()) + "\t"
+ ";".join(varcon.get_unmapped_acceptor_mate_ids())
- )
+ + "\n")
if typetowrite == "donor":
- umFile.write(
+ umfile.write(
varcon.get_variant_context_id() + "\t"
+ str(varcon.get_variant_context_sample()) + "\t"
+ ";".join(varcon.get_unmapped_donor_mate_ids())
- )
+ + "\n")
except IOError:
self.vaselogger.warning("Could not write read identifiers of "
"reads with unmapped mates to "
f"{umfileloc}")
- # Writes the unmapped mate id of the acceptor context.
- def write_acceptor_unmapped_mates(self, umfileloc):
- try:
+ def write_acceptor_unmapped_mates(self, umfileloc, vbuuid):
+ """Writes the acceptor context read identifiers that have unmapped mates to an output file.
+ Parameters
+ ----------
+ umfileloc : str
+ Path to write the output file to
+ """
+ try:
with open(umfileloc, "w") as umfile:
+ umfile.write(f"#VBUUID: {vbuuid}\n")
umfile.write("#ContextId\tSampleId\tReadIds\n")
for varcon in self.variant_contexts.values():
acccon = varcon.get_acceptor_context()
umfile.write(str(acccon.get_context_id()) + "\t"
+ str(acccon.get_sample_id()) + "\t"
- + ";".join(acccon.get_unmapped_read_mate_ids()))
+ + ";".join(acccon.get_unmapped_read_mate_ids()) + "\n")
except IOError:
self.vaselogger.warning("Could not write read identifiers of "
"reads with unmapped mates to "
f"{umfileloc}")
- # Writes the unmapped mate id of the donor context.
- def write_donor_unmapped_mates(self, umfileloc):
+ def write_donor_unmapped_mates(self, umfileloc, vbuuid):
+ """Writes the donor context read identifiers that have unmapped mates to an output file.
+
+ Parameters
+ ----------
+ umfileloc : str
+ Path to write the output file to
+ """
try:
- with open(umfileloc, "w") as umFile:
- umFile.write("#ContextId\tSampleId\tReadIds\n")
+ with open(umfileloc, "w") as umfile:
+ umfile.write(f"#VBUUID: {vbuuid}\n")
+ umfile.write("#ContextId\tSampleId\tReadIds\n")
for varcon in self.variant_contexts.values():
doncon = varcon.get_donor_context()
- umFile.write(str(doncon.get_context_id()) + "\t"
+ umfile.write(str(doncon.get_context_id()) + "\t"
+ str(doncon.get_sample_id()) + "\t"
- + ";".join(doncon.get_unmapped_read_mate_ids()))
+ + ";".join(doncon.get_unmapped_read_mate_ids())+"\n")
except IOError:
self.vaselogger.warning("Could not write read identifiers of "
"reads with unmapped mates to "
f"{umfileloc}")
- # Compares the current VariantContextFile to another variant context file
def compare(self, othervarconfile, contextfilter=None):
+ """Compares the current variant context file to another provided variant context file.
+
+ Parameters
+ ----------
+ othervarconfile : VariantContextFile
+ VariantContextFile to compare against
+ contextfilter : list of str or None
+ Contexts to compare (inclusive filter)
+
+ Returns
+ -------
+ varcondiffs : dict
+ Differences between the current and other VariantContextFile
+ """
varcondiffs = {}
for contextid in self.variant_contexts:
if self.passes_filter(contextid, contextfilter):
diffs = self.variant_contexts[contextid].compare(othervarconfile.get_variant_context(contextid))
varcondiffs[contextid] = diffs
return varcondiffs
+
+ def add_variant_context_file(self, variantcontextfile):
+ """Adds another VariantContextFile to the existing one.
+
+ Variant contexts from the second file overlapping with variant contexts from the current file will not be added.
+
+ Parameters
+ ----------
+ variantcontextfile : VariantContextFile
+ Variant context file to add to the current
+ """
+ for contextid, varcon in variantcontextfile.get_variant_contexts(True).items():
+ if contextid not in self.variant_contexts:
+ if not self.context_collision(varcon.get_context()):
+ self.variant_contexts[contextid] = varcon
diff --git a/VcfBamScanner.py b/VcfBamScanner.py
index e47bb90..e9e589e 100644
--- a/VcfBamScanner.py
+++ b/VcfBamScanner.py
@@ -6,24 +6,61 @@
class VcfBamScanner:
- # Constructor that creates two empty hashmaps (dicts).
+ """VcfBamScanner offers functionality to scan variant (VCF/BCF) and alignment (BAM/CRAM) files.
+
+ The VcfBamScanner can be used to scan folders for variant and alignments files, check whether variant and alignment
+ files have sample names and extract them. A list file containing paths to either variant or alignment files can
+ also be.scanned. Files within the list files will be a saved per sample name in an identifier.
+
+ Attributes
+ ----------
+ vcf_sample_map : dict
+ Dictionary with variant files per sample
+ bam_sample_map : dict
+ Dictionary with alignment files per sample
+ valid_file_types : list of str
+ Valid variant and alignment file types
+ """
+
def __init__(self):
+ """Obtains the vase logger and creates two empty dictionaries to save the variant and alignment files."""
self.vaselogger = logging.getLogger("VaSe_Logger")
self.vcf_sample_map = {}
self.bam_sample_map = {}
self.valid_file_types = ["VCF", "BCF", "BAM", "CRAM"]
# ===METHODS TO GET SAVED DATA FROM THE VCFBAMSCANNER======================
- # Returns the map that links VCF files and samples.
def get_vcf_sample_map(self):
+ """Returns a dictionary with VCF/BCF files saved per sample.
+
+ Returns
+ -------
+ vcf_sample_map : dict
+ Paths to variant files per sample name
+ """
return self.vcf_sample_map
- # Returns the map that links BAM files and samples.
def get_bam_sample_map(self):
+ """Returns a dictionary with BAM/CRAM files saved per sample.
+
+ Returns
+ -------
+ bam_sample_map : dict
+ Paths to alignment files per sample name
+ """
return self.bam_sample_map
- # Returns a map that links VCF files to BAM files.
def get_vcf_to_bam_map(self):
+ """Returns a dictionary with a variant file linked to an alignment file of the same sample.
+
+ For each sample, the associated variant file is linked to the associated.alignment file are linked via a
+ dictionary. The variant and alignment file are saved as key and value respectively.
+
+ Returns
+ -------
+ vcf_to_bam_map : dict
+ Variant files linked to alignment files
+ """
vcf_to_bam_map = {}
for sampleid in self.vcf_sample_map:
if sampleid in self.bam_sample_map:
@@ -34,6 +71,23 @@ def get_vcf_to_bam_map(self):
# Scans the folders containing VCF files and returns a map that
# links sample ids with vcf files.
def scan_vcf_folders(self, vcffolders):
+ """Scans a list of folders containing variant files and returns a dictionary with valid files per sample name.
+
+ Iterates over a list of paths to folders containing VCF/BCF files. A non-existing folder is skipped. For each
+ variant file in a valid folder, the sample name is extracted.and the file is saved under its sample name in a
+ dictionary.
+
+ Parameters
+ ----------
+ vcffolders : list
+ List of paths to folders contain alignment files
+
+ Returns
+ -------
+ vcf_sample_map : dict
+ String paths to variant files per sample name
+
+ """
self.vaselogger.info("Start scannig VCF files")
for vcffolder in vcffolders:
@@ -68,9 +122,23 @@ def scan_vcf_folders(self, vcffolders):
self.vaselogger.info("Finished scanning VCF files")
return self.vcf_sample_map
- # Scans the folders containing BAM files and returns a map that
- # links sample ids with bam files.
def scan_bam_folders(self, bamfolders):
+ """Scans a list of folders containing alignment files and returns a dictionary with valid files per sample name.
+
+ Iterates over a list of paths to folders containing BAM/CRAM files. Non-existing folders are skipped. For each
+ alignment file in a valid folder, the sample name is extracted.and the file is saved under its sample name in a
+ dictionary.
+
+ Parameters
+ ----------
+ bamfolders : list
+ List of String paths to folders containing alignment files
+
+ Returns
+ -------
+ bam_sample_map : dict
+ String paths to alignment files per sample name
+ """
self.vaselogger.info("Start scanning BAM files")
# Scan BAM files in all provided folders.
@@ -107,16 +175,53 @@ def scan_bam_folders(self, bamfolders):
# Scans the provided BAM/CRAM files from a provided donor list file
def scan_bamcram_files(self, bamcramlistfile):
+ """Reads a list file containing the locations of BAM/CRAM files and saves them per sample.
+
+ Parameters
+ ----------
+ bamcramlistfile : str
+ String path to list file with paths to alignment files
+
+ Returns
+ -------
+ bam_sample_map : dict
+ Paths to alignment files per sample name
+ """
self.bam_sample_map = self.read_donorlistfile(bamcramlistfile)
return self.bam_sample_map
# Scan the provided VCF files from a provided donor list file
def scan_vcf_files(self, vcflistfile):
+ """Reads a list file containing the locations of VCF/BCF files and saves them per sample name.
+
+ Parameters
+ ----------
+ vcflistfile : str
+ String path to list file with paths to variant files
+
+ Returns
+ -------
+ vcf_sample_map : dict
+ Variant files per sample name
+ """
self.vcf_sample_map = self.read_donorlistfile(vcflistfile, "v")
return self.vcf_sample_map
- # Read a specific donor list file
def read_donorlistfile(self, listfileloc, listtype="a"):
+ """Reads a list file containing paths to donor files and returns the donor files per sample name.
+
+ Parameters
+ ----------
+ listfileloc : str
+ Path to the list file containing paths to donor files
+ listtype : str
+ Type of variant files in the list file (a=alignment ; v=variant)
+
+ Returns
+ -------
+ donor_sample_files : dict
+ Paths to donor variant files per sample name
+ """
donor_sample_files = {}
try:
with open(listfileloc, "r") as listfile:
@@ -135,29 +240,74 @@ def read_donorlistfile(self, listfileloc, listtype="a"):
exit()
return donor_sample_files
- # Checks whether the BAM file contains a sample name.
def bam_has_sample_name(self, bamfile):
+ """Checks and returns whether a BAM/CRAM file has a sample name/identifier.
+
+ Parameters
+ ----------
+ bamfile : pysam AlignmentFile
+ Already opened pysam AlignmentFile
+
+ Returns
+ -------
+ bool
+ True if file has a sample name, otherwise False
+ """
if "RG" in bamfile.header:
if len(bamfile.header["RG"]) > 0:
if "SM" in bamfile.header["RG"][0]:
return True
return False
- # Checks whether the VCF file has a sample name
def vcf_has_sample_name(self, vcffile):
+ """Checks and returns whether the VCF/BCF file has a sample name/identifier.
+
+ Returns
+ -------
+ bool
+ True if the variant file has sample names, otherwise False
+ """
if vcffile.header.samples[0] != "":
return True
return False
# Returns the first sample identifier of the VCF file (as for now we only work with one sample per file
def get_vcf_sample_name(self, vcffileloc):
+ """Extracts and returns the list of sample names from a specified VCF file.
+
+ Parameters
+ ----------
+ vcffileloc : str
+ Path to a VCF file
+
+ Returns
+ -------
+ list or None
+ VCF Sample name if present, otherwise None
+ """
vcffile = pysam.VariantFile(vcffileloc, "r")
if self.vcf_has_sample_name(vcffile):
return list(vcffile.header.samples)
return None
- # Returns the BAM sample
def get_bam_sample_name(self, bamfileloc):
+ """Extracts and returns the sample name/identifier from a BAM file.
+
+ The sample name is extracted by first checking whether the BAM file contains a sample name via the
+ VcfBamScanner method bam_has_sample_name(). If so, the sample name is extracted from the BAM file by returning
+ the value of the 'SM' field from the header line starting with '@RG'. None is returned if the BAM file has no
+ sample name.
+
+ Parameters
+ ----------
+ bamfileloc : str
+ Path to a BAM file
+
+ Returns
+ -------
+ str or None
+ Sample name if present, otherwise None
+ """
try:
bamfile = pysam.AlignmentFile(bamfileloc, "rb")
if self.bam_has_sample_name(bamfile):
@@ -168,8 +318,24 @@ def get_bam_sample_name(self, bamfileloc):
except IOError:
self.vaselogger.warning(f"Could not open {bamfileloc} to extract sample identifier")
- # Returns the CRAM sample name
def get_cram_sample_name(self, cramfileloc):
+ """Extracts and returns the sample name/identifier from a specified CRAM file.
+
+ The sample name is extracted by first checking whether the CRAM file contains a sample name via the
+ VcfBamScanner method cram_has_sample_name(). If so, the sample name is extracted from the CRAM file by
+ returning the value of the 'SM' field from the header line starting with '@RG'. None is returned if the CRAM
+ file has no sample name.
+
+ Parameters
+ ----------
+ cramfileloc : str
+ Path to a CRAM file
+
+ Returns
+ -------
+ str or None
+ Sample name if present, otherwise None
+ """
try:
cramfile = pysam.AlignmentFile(cramfileloc, "rc")
if self.bam_has_sample_name(cramfile):
@@ -181,6 +347,20 @@ def get_cram_sample_name(self, cramfileloc):
# General methods that returns one or more sample ids (in case of a VCF)
def get_sample_id(self, donorfileloc, donorlisttype):
+ """Returns the sample name/identifier for a specified alignment or variant file.
+
+ Parameters
+ ----------
+ donorfileloc : str
+ Path to a donor variant/alignment file
+ donorlisttype : str
+ Type of donor file (a=alignment ; v=variant)
+
+ Returns
+ -------
+ str
+ File type of provided donor file
+ """
donor_file_type = self.get_donor_file_type(donorfileloc)
if donorlisttype == "a":
if donor_file_type[1] == "BAM":
@@ -193,24 +373,71 @@ def get_sample_id(self, donorfileloc, donorlisttype):
# Returns the filetype of a donor file
def get_donor_file_type(self, donorfileloc):
+ """Determines and returns the file type of a specified donor file.
+
+ To determine the file type, the linux command 'file' is used which returns a String containing the file type
+ info. The received String is split on spaces and the resulting list is returned.
+
+ Returns
+ -------
+ list of str
+ List with file type data
+ """
filetype_proc = subprocess.Popen(["file", "-b", "-z", donorfileloc], stdout=subprocess.PIPE)
filetype_data, filetype_err = filetype_proc.communicate()
return filetype_data.decode().split(" ")
# Returns a list of sample identifiers that have both a VCF and a BAM/CRAM
def get_complete_sample_ids(self):
+ """Determines which samples have a variant and alignment file and returns the list of identifiers.
+
+ Returns
+ -------
+ set
+ Set of sample names with a variant and alignment file
+ """
return set(self.vcf_sample_map.keys()) & set(self.bam_sample_map.keys())
# Extracts the sequence names from an already opened BAM/CRAM file
def get_alignment_sequence_names(self, alignment_file):
+ """Extracts and returns the chromosome names from an alignment file.
+
+ Chromosome names are extracted from a specified BAM or CRAM file by checking the headers for the 'SQ' tag. If
+ present the SQ entry is checked for the 'SN' field which contains the specific chromosome names.
+
+ Parameters
+ ----------
+ alignment_file : pysam.AlignmentFile
+ Already opened pysam AlignmentFile
+
+ Returns
+ -------
+ sequence_names : list
+ List with extracted chromosome names
+ """
sequence_names = set()
if "SQ" in alignment_file.header:
for sn_entry in alignment_file.header["SQ"]:
sequence_names.add(sn_entry["SN"])
return sequence_names
- # Extracts the sequence names from an already opened VCF/BCF file
def get_variant_sequence_names(self, variant_file):
+ """Extracts and returns the chromosome names from a variant file.
+
+ Chromosome names are extracted from a specified VCF or BCF file by obtaining the header contig names via the
+ pysam method .header.contigs(). The provided variant file is expected to be an already opened pysam
+ VariantFile.
+
+ Parameters
+ ----------
+ variant_file : pysam.VariantFile
+ Already opened pysam VariantFile
+
+ Returns
+ -------
+ sequence_names : list of str
+ List with extracted chromosome names
+ """
sequence_names = set()
if len(variant_file.header.contigs) > 0:
for seqname in list(variant_file.header.contigs):
diff --git a/VcfVariant.py b/VcfVariant.py
index f705a37..5a0c213 100644
--- a/VcfVariant.py
+++ b/VcfVariant.py
@@ -2,6 +2,23 @@
class VcfVariant:
+ """Saves necessary data of a genomic variant from a VCF/BCF file.
+
+ Attributes
+ ----------
+ vcf_variant_chrom : str
+ Chromosome name the variant is located on
+ vcf_variant_start : int
+ The leftmost genomic position of the variant
+ vcf_variant_ref : str
+ The reference allele of the variant
+ vcf_variant_alts : tuple
+ The alternative allele(s) of the variant
+ vcf_variant_filter : str
+ Filter applied to the variant (i.e. PASS)
+ vcf_variant_type : str
+ Type of the variant (SNP/Indel/etc)
+ """
def __init__(self, varchrom, varstart, varref, varalts, varfilter, vartype):
self.vcf_variant_chrom = varchrom
self.vcf_variant_start = varstart
@@ -10,59 +27,147 @@ def __init__(self, varchrom, varstart, varref, varalts, varfilter, vartype):
self.vcf_variant_filter = varfilter
self.vcf_variant_type = vartype
- # Returns the chromosome name of the VCF variant
def get_variant_chrom(self):
+ """Returns the chromosome name the variant is located on
+
+ Returns
+ -------
+ self.vcf_variant_chrom : str
+ Chromosome name the variant is located on
+ """
return self.vcf_variant_chrom
- # Sets the VCF variant chromosome
def set_variant_chrom(self, varchrom):
+ """Sets the chromosome name the variant is located on. Overwrites an already set chromosome name.
+
+ Parameters
+ ----------
+ varchrom : str
+ Chromosome name the variant is located on
+ :return:
+ """
self.vcf_variant_chrom = varchrom
- # Returns the VCF variant start position
def get_variant_pos(self):
+ """Returns the genomic position of the variant.
+
+ Returns
+ -------
+ self.vcf_variant_pos : nt
+ Genomic position of the variant
+ """
return self.vcf_variant_start
- # Sets the VCF variant starting position
def set_variant_pos(self, varpos):
+ """Sets the variant genomic position. Overwrites an already set genomic position.
+
+ Parameters
+ ----------
+ varpos : int
+ Genomic position of the variant.
+ """
self.vcf_variant_start = varpos
- # Returns the VCF variant reference allele
def get_variant_ref_allele(self):
+ """Returns the reference allele of the variant.
+
+ Returns
+ -------
+ self.vcf_variant_ref : str
+ Reference allele of the variant
+ :return:
+ """
return self.vcf_variant_ref
- # Sets the VCF variant reference allele
def set_variant_ref_allele(self, varref):
+ """Sets the reference allele of the variant. Overwrites an already set reference allele.
+
+ Parameters
+ ----------
+ varref : str
+ Reference allele of the variant
+ """
self.vcf_variant_ref = varref
- # Returns the VCF variant alternative alleles
def get_variant_alt_alleles(self):
+ """Returns the alternative allele(s) of the variant.
+
+ Returns
+ -------
+ self.vcf_variant_alts : tuple
+ Alternative allele(s) of the variant
+ """
return self.vcf_variant_alts
- # Sets the VCF variant alternative alleles
def set_variant_alt_alleles(self, varalts):
+ """Sets the alternative allele(s) of the variants. Overwrites an already set alternative allele(s).
+
+ Parameters
+ ----------
+ varalts : tuple
+ Alternative allele(s) of the variant
+ """
self.vcf_variant_alts = varalts
- # Returns the VCF variant filter
def get_variant_filter(self):
+ """Returns the filter (i.e. PASS) that was applied to the variant
+
+ Returns
+ -------
+ self.vcf_variant_filter : str
+
+ """
return self.vcf_variant_filter
- # Sets the VCF variant filter
def set_variant_filter(self, varfilter):
+ """Sets the filter applied to the variant. Overwrites any already set variant filter.
+
+ Parameters
+ ----------
+ varfilter : str
+
+ """
self.vcf_variant_filter = varfilter
- # Returns the VCF variant type (SNP/Indel, etc)
def get_variant_type(self):
+ """Returns the variant type.
+
+ Returns
+ -------
+ self.vcf_variant_type : str
+ Variant type
+ """
return self.vcf_variant_type
- # Sets the VCF variant type
def set_variant_type(self, vartype):
+ """Sets the type of the variant. Overwrites any already set variant type.
+
+ Parameters
+ ----------
+ vartype : str
+ """
self.vcf_variant_type = vartype
- # Returns a variant identifier in the form CHROM_POS
def get_variant_id(self):
+ """Constructs and returns a variant identifier.
+
+ The identifier is formed by combining the chromosome name and variant position, separated by an '_'
+ (i.e. CHROM_POS). This will be used as the identifier for contexts.
+
+ Returns
+ -------
+ str
+ Constructed variant identifier
+ """
return f"{self.vcf_variant_chrom}_{self.vcf_variant_start}"
- # ToString method
def to_string(self):
+ """Returns a String representation of the variant.
+
+ Returns
+ -------
+ str
+ String representation of the variant
+ """
return f"{self.vcf_variant_chrom}\t{self.vcf_variant_start}\t{self.vcf_variant_type}\t{self.vcf_variant_ref}" \
f"\t{self.vcf_variant_alts}\t{self.vcf_variant_filter}"
diff --git a/combine_varcons.py b/combine_varcons.py
new file mode 100644
index 0000000..368f58c
--- /dev/null
+++ b/combine_varcons.py
@@ -0,0 +1,79 @@
+import os
+import argparse
+
+
+# Returns the command line parameters
+def get_parameters():
+ combine_varcon_params = argparse.ArgumentParser()
+ combine_varcon_params.add_argument("-l", "--listfile", dest="listfile",
+ help="Listfile containing locations to variant context files to combine")
+ combine_varcon_params.add_argument("-i", "--infiles", dest="infiles", nargs="+",
+ help="Variant context files to combine.")
+ combine_varcon_params.add_argument("-o", "--out", dest="outfile")
+ combine_varcon_args = vars(combine_varcon_params.parse_args())
+ return combine_varcon_args
+
+
+# Reads the variant context list file
+def read_varcon_list_file(varcon_listfile):
+ varcon_files = []
+ try:
+ with open(varcon_listfile, 'r') as vclf:
+ for fileline in vclf:
+ if os.path.isfile(fileline.strip()):
+ varcon_files.append(fileline.strip())
+ except IOError:
+ print(f"Could not read variant context list file: {varcon_listfile}")
+ finally:
+ return varcon_files
+
+
+def combine_varcons(varconfiles):
+ varcondata = {}
+ for varconfileloc in varconfiles:
+ varcondata = process_varconfile(varconfileloc, varcondata)
+ return varcondata
+
+
+def process_varconfile(varconfileloc, varcondata):
+ with open(varconfileloc, 'r') as vrcfile:
+ for fileline in vrcfile:
+ if not fileline.startswith("#"):
+ varconline = fileline.strip()
+ filelinedata = fileline.strip().split("\t")
+
+ if not detect_collision(filelinedata, varcondata):
+ varcondata[filelinedata[0]] = varconline
+ return varcondata
+
+
+def detect_collision(varconentry, varcondatamap):
+ if varconentry[0] in varcondatamap:
+ return True
+ else:
+ for varcon in varcondatamap.values():
+ if varcon[2] == varconentry[2]:
+ if int(varcon[4]) <= int(varconentry[5]) and int(varconentry[4]) <= int(varcon[5]):
+ return True
+ return False
+
+
+def write_new_variantcontextfile(outfileloc, varcondatamap):
+ with open(outfileloc, 'w') as vrcoutfile:
+ vrcoutfile.write("#ContextId\tDonorSample\tChrom\tOrigin\tStart\tEnd\tAcceptorContextLength\t"
+ "DonorContextLength\tAcceptorReads\tDonorReads\tADratio\tAcceptorReadsIds\tDonorReadIds\n")
+ for varcondataentry in varcondatamap.values():
+ vrcoutfile.write(f"{varcondataentry}\n")
+
+
+# Do the actual work
+cli_params = get_parameters()
+varcons_to_combine = []
+if cli_params['listfile'] is not None:
+ varcons_to_combine = read_varcon_list_file(cli_params['listfile'])
+else:
+ varcons_to_combine = [infile for infile in cli_params['infiles'] if os.path.isfile(infile)]
+
+varcondata = combine_varcons(varcons_to_combine)
+write_new_variantcontextfile(cli_params['outfile'], varcondata)
+
diff --git a/config/examples/example_amode.cfg b/config/examples/example_amode.cfg
index fbc71c0..90fed3e 100644
--- a/config/examples/example_amode.cfg
+++ b/config/examples/example_amode.cfg
@@ -1,8 +1,8 @@
-#Example config file for running VaSeBuilder in A-mode
-RUNMODE=A
+#Example config file for running VaSeBuilder in AC-mode
+RUNMODE=AC
TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq
TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq
VARCONIN=/testdata/variantcontext/varcon.txt
DONORFASTQS=/testdata/donorfastq/donorfastq_list.txt
OUT=/testdata/outdir
-LOGFILE=/testdata/outdir/vb.log
+LOG=/testdata/outdir/vb.log
diff --git a/config/examples/example_dcmode.cfg b/config/examples/example_dcmode.cfg
index c50f543..f9b1724 100644
--- a/config/examples/example_dcmode.cfg
+++ b/config/examples/example_dcmode.cfg
@@ -6,3 +6,4 @@ ACCEPTORBAM=/testdata/acceptor/acceptor.bam
REFERENCE=/testdata/reference/genome_ref.fasta
VARCONIN=/testdata/variantcontext/varcon.txt
OUT=/testdata/outdir
+LOG=/testdata/outdir/vb.log
diff --git a/config/examples/example_dmode.cfg b/config/examples/example_dmode.cfg
index 41c201d..92a332d 100644
--- a/config/examples/example_dmode.cfg
+++ b/config/examples/example_dmode.cfg
@@ -1,7 +1,8 @@
#Example config file for running VaSeBuilder in D-mode
RUNMODE=D
DONORVCF=/testdata/donordata/vcffiles_list.txt
-DONORBAM=
+DONORBAM=/testdata/donordata/bamfiles_list.txt
ACCEPTORBAM=/testdata/acceptor/acceptor.bam
REFERENCE=/testdata/reference/genome_ref.fasta
OUT=/testdata/outdir
+LOG=/testdata/outdir/vb.log
diff --git a/config/examples/example_fcmode.cfg b/config/examples/example_fcmode.cfg
index 244766b..532c21b 100644
--- a/config/examples/example_fcmode.cfg
+++ b/config/examples/example_fcmode.cfg
@@ -3,8 +3,9 @@ RUNMODE=FC
DONORVCF=/testdata/donordata/vcffile_list.txt
DONORBAM=/testdata/donordata/bamfile_list.txt
ACCEPTORBAM=/testdata/acceptor/acceptor.bam
-TEMPLATEFQ1=/testdata/templatefastq/acceptor_R2.fq
-TEMPLATEFQ2=/testdata/templatefastq/acceptor_R1.fq
+TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1_L1.fq , /testdata/templatefastq/acceptor_R1_L2.fq
+TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2_L1.fq , /testdata/templatefastq/acceptor_R2_L2.fq
VARCONIN=/testdata/variantcontext/varcon.txt
REFERENCE=/testdata/reference/genome_ref.fasta
OUT=/testdata/outdir
+LOG=/testdata/outdir/vb.log
diff --git a/config/examples/example_fmode.cfg b/config/examples/example_fmode.cfg
index 5c7268c..65015c4 100644
--- a/config/examples/example_fmode.cfg
+++ b/config/examples/example_fmode.cfg
@@ -3,7 +3,8 @@ RUNMODE=F
DONORVCF=/testdata/donordata/vcffile_list.txt
DONORBAM=/testdata/donordata/bamfile_list.txt
ACCEPTORBAM=/testdata/acceptor/acceptor.bam
-TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq
-TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq
+TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1_L1.fq , /testdata/templatefastq/acceptor_R1_L2.fq
+TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2_L1.fq , /testdata/templatefastq/acceptor_R2_L2.fq
REFERENCE=/testdata/reference/genome_ref.fasta
OUT=/testdata/outdir
+LOG=/testdata/outdir/vb.log
\ No newline at end of file
diff --git a/config/examples/example_fmode_debug.cfg b/config/examples/example_fmode_debug.cfg
new file mode 100644
index 0000000..0a3371f
--- /dev/null
+++ b/config/examples/example_fmode_debug.cfg
@@ -0,0 +1,11 @@
+#Example config file for running VaSeBuilder in F-mode with debug set on
+RUNMODE=F
+DONORVCF=/testdata/donordata/vcffile_list.txt
+DONORBAM=/testdata/donordata/bamfile_list.txt
+ACCEPTORBAM=/testdata/acceptor/acceptor.bam
+TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq
+TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq
+REFERENCE=/testdata/reference/genome_ref.fasta
+OUT=/testdata/outdir
+LOG=/testdata/outdir/vb.log
+DEBUG=T
\ No newline at end of file
diff --git a/config/examples/example_fmode_variantlist.cfg b/config/examples/example_fmode_variantlist.cfg
new file mode 100644
index 0000000..f7b133d
--- /dev/null
+++ b/config/examples/example_fmode_variantlist.cfg
@@ -0,0 +1,12 @@
+#Example config file for running VaSeBuilder in F-mode
+#This example has a provided list of variants to use for the run
+RUNMODE=F
+DONORVCF=/testdata/donordata/vcffile_list.txt
+DONORBAM=/testdata/donordata/bamfile_list.txt
+ACCEPTORBAM=/testdata/acceptor/acceptor.bam
+TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq
+TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq
+REFERENCE=/testdata/reference/genome_ref.fasta
+OUT=/testdata/outdir
+LOG=/testdata/outdir/vb.log
+VARIANTLIST=/testdata/variants/variantlist.txt
diff --git a/config/examples/example_pcmode.cfg b/config/examples/example_pcmode.cfg
index 828d906..668ab65 100644
--- a/config/examples/example_pcmode.cfg
+++ b/config/examples/example_pcmode.cfg
@@ -6,4 +6,4 @@ ACCEPTORBAM=/testdata/acceptor/acceptor.bam
REFERENCE=/testdata/reference/genome_ref.fasta
VARCONIN=/testdata/variantcontext/varcon.txt
OUT=/testdata/outdir
-LOGFILE=/testdata/outdir/vb.log
+LOG=/testdata/outdir/vb.log
diff --git a/config/examples/example_pmode.cfg b/config/examples/example_pmode.cfg
index 81c1651..39377dd 100644
--- a/config/examples/example_pmode.cfg
+++ b/config/examples/example_pmode.cfg
@@ -5,4 +5,4 @@ DONORBAM=/testdata/donordata/bamfile_list.txt
ACCEPTORBAM=/testdata/acceptor/acceptor.bam
REFERENCE=/testdata/reference/genome_ref.fa
OUT=/testdata/outdir
-LOGFILE=/testdata/outdir/vb.log
+LOG=/testdata/outdir/vb.log
diff --git a/config/examples/example_xcmode.cfg b/config/examples/example_xcmode.cfg
index aa5854b..3bcdb66 100644
--- a/config/examples/example_xcmode.cfg
+++ b/config/examples/example_xcmode.cfg
@@ -6,4 +6,4 @@ ACCEPTORBAM=/testdata/acceptor/acceptor.bam
REFERENCE=/testdata/reference/genome_ref.fa
VARCONIN=/testdata/variantcontext/varcon.txt
OUT=/testdata/outdir
-LOGFILE=/testdata/outdir/vb.log
+LOG=/testdata/outdir/vb.log
diff --git a/config/examples/example_xmode.cfg b/config/examples/example_xmode.cfg
index 207199d..8ae6adc 100644
--- a/config/examples/example_xmode.cfg
+++ b/config/examples/example_xmode.cfg
@@ -5,4 +5,4 @@ DONORBAM=/testdata/donordata/bamfile_list.txt
ACCEPTORBAM=/testdata/acceptor/acceptor.bam
REFERENCE=/testdata/reference/genome_ref.fa
OUT=/testdata/outdir
-LOGFILE=/testdata/outdir/vb.log
+LOG=/testdata/outdir/vb.log
diff --git a/docs/input_formats.md b/docs/input_formats.md
new file mode 100644
index 0000000..5c6f83f
--- /dev/null
+++ b/docs/input_formats.md
@@ -0,0 +1,74 @@
+# VaSeBuilder input file formats
+
+## About input files
+To run VaSeBuilder, specific VaSeBuilder input files might need to be provided depending on the selected run mode.
+
+
+
+### Donor variant and donor alignment list files
+The donor variant list file should contain a list of paths to a variant, VCF or BCF, file with one path per file line.
+The donor alignment list file should be similar and contain one path to a donor alignment file per file line.
+The files listed will be used as donor variant and alignments files respectively when constructing variant contexts.
+Only paths to the variant/alignment files should be provided, not the index files. These are assumed to be in the same
+directory with the same name, i.e.: sample.bam & sample.bam.bai
+
+
+
+### Donor fastq list file
+A donor fastq list file needs to be provided when running VaSeBuilder AC-mode. Currently (September 23rd, 2019) this
+file requires two columns without a header. The first column should contain the paths to R1 files, the second column
+ the paths to R2 files:
+
+_/path/to/sample1_R1.fq /path/to/sample1_R2.fq
+/path/to/sample2_R1.fq /path/to/sample2_R2.fq
+/path/to/sample3_R1.fq /path/to/sample3_R2.fq_
+
+Each row in this file therefore represents a single sample.
+
+
+
+### Variant list file
+A variant list file can be provided as a filter to indicate which donor variants to use in the VaSeBuilder run. This is
+most useful when only certain variants from the donors are required but the donor variant files contain many variants.
+Variants not specified in the variant list will therefore be skipped.
+The variant list file needs at least three columns: sample name/identifier, chromosome name, variant position:
+
+_Sample1 1 101
+Sample2 2 202
_
+
+(In the future two more columns might be required: variant reference allele, variant alternative allele)
+
+
+
+### Variant context file
+A VaSeBuilder variant context file can serve as an input file when running AC, DC, FC or PC mode. Please see the output
+files description for the variant context file for it's structure.
+
+
+
+### Configuration file
+Configuration files allow users to save the VaSeBuilder run parameters and values in a text file. This makes rerunning
+analyses on the same data, as well as sharing the run parameters much easier. Parameters and values can be specified as
+PARAMETERNAME=value. Parameter names do not need to be capitalized, entries such as parametername = value are also
+valid. Parameters templatefq1 and templatefq2 can have multiple values separated by a comma:
+templatefq1 = testdata/fastqs/acceptor1_1.fq , testdata/fastqs/acceptor1_2.fq. (The space around the comma is optional)
+Comments explaining or describing the use of the configuration file can be added by starting a line with #.
+
+
Valid configuration file parameter names:
+* __RUNMODE:__ The run mode of VaSeBuilder to use
+* __DONORVCF:__ Path to file with list of donor variant files to use
+* __DONORBAM:__ Path to file with list of donor alignment files to use
+* __ACCEPTORBAM:__ Path the alignment file to use as acceptor
+* __TEMPLATEFQ1:__ Path(s) to R1 fastq file(s) to use as template for building the validation set
+* __TEMPLATEFQ2:__ Path(s) to R2 fastq file(s) to use as template for building the validation set
+* __REFERENCE:__ Path to the fasta genome reference
+* __OUT:__ Path to directory to write output files to
+* __LOG:__ Path to write log file to
+* __DEBUG:__ Run debug mode (valid values: T, True, true, TRUE, 1).
+* __VARIANTLIST:__ Path to file containing variants to use
+* __VARCONIN:__ Path to a variant context file
+* __FASTQOUT:__ Name prefix for validation fastq files
+* __VARCON:__ Name prefix for variant context output file
+
+To run VaSeBuilder with a configuration file use: python vase.py –c path/to/config/file.cfg
+Please see the example config files in VaSeBuilder/config/examples for examples per run mode
diff --git a/docs/output_formats.md b/docs/output_formats.md
new file mode 100644
index 0000000..8903cdf
--- /dev/null
+++ b/docs/output_formats.md
@@ -0,0 +1,178 @@
+# VaSeBuilder output file formats
+
+## Standard output files
+VaSeBuilder runs output several output files containing the essential data about the run. Some files are only outputted
+when running specific modes.
+
+
+
+### Donor alignment files
+A list of paths to donor alignment (BAM/CRAM) files used in building the variant contexts. A list could be:
+
+_/path/to/donor1.bam
+/path/to/donor2.cram
+/path/to/donor3.bam_
+
+
+
+### Donor insert positions
+VaSeBuilder add donor reads at semi random positions in the resulting validation set. The recorded position is the
+position of a full read (in a fastq a donor read with insert position 1 would be written to lines 5,6,7 and 8).
+
+
+_\#VBUUID {vbuuid}
+ReadId R1_InsertPos FastqR1Out R2_InsertPos FastqR2Out
+dReadId1 15 /testdata/vase_R1.fastq 15 /testdata/vase_R2.fastq_
+
+
+
+### Donor variant files
+A list of paths to donor variant (VCF/BCF) files used in building the variant contexts. A list could be:
+_/path/to/donor1.vcf
+/path/to/donor2.bcf
+/path/to/donor3.vcf_
+
+
+
+### Log file
+Contains a log of steps and activities performed by VaSeBuilder. More info is added to the log when debug is activated.
+The log file contains four fields separated by a tab: Date and time, name of the logger (VaSe_Logger), log level,
+message. The first two lines of the log always display the issued command to run VaSeBuilder and the VaSeBuilder uuid
+and creation date and time A VaSeBuilder log file therefore follows the format:
+
+_YYYY-MM-DD HH:MM:SS,SSS INFO python vase.py -c test.cfg
+YYYY-MM-DD HH:MM:SS,SSS INFO VaSeBuilder: {uuid} ; YYYY-MM-DD HH:MM:SS,SSS
+YYYY-MM-DD HH:MM:SS,SSS INFO Running VaSeBuilder in F-mode_
+
+
+
+### P-mode link file
+When VaSeBuilder is run in P-mode, donor fastq files are outputted for each variant context. To link the variant
+context file and the outputted donor fastq files, an extra file is created consisting of three columns:
+
+_contextid_1 /path/to/fastq_R1.fq /path/to/fastq_R1.fq
+contextid_2 /path/to/fastq_R2.fq /path/to/fastq_R2.fq_
+
+
+Each line represents a variant context. The first line in the P-mode link file is the uuid of the VaSeBuilder run. This
+is used to the variant context file and the fastq files for a certain P-mode run.
+
+
+
+### Validation fastq files
+Validation fastq files are valid fastq files based on the provided template fastq files with acceptor and donor reads
+exchanged based on the variant contexts. Currently (September 24th, 2019) donor reads are added at the end of the
+validation fastq files.
+
+
+
+### Variant context file
+The variant context file is a tab separated file containing the essential data of the variant contexts in 13 columns.
+This file is one of the important output files as it contains the windows created to search reads and which acceptor
+reads are exchanged for which donor reads. The context data is preceded by a header starting with a #. The default
+output name for the variant context file is _varcon.txt_.
+
+Variant context file columns:
+* __ContextId:__ The identifier of the context (consists of the chromosome name and variant positions connected with
+an '_')
+* __DonorSample:__ The sample name/identifier of the variant from which the context is constructed.
+* __Chrom:__ The name of the chromosome the context is located on.
+* __Origin:__ This is the genomic position of the donor variant the acceptor, donor and variant context
+* __Start:__ The starting, or leftmost genomic, position of the variant context.
+* __End:__ The ending, or rightmost genomic, position of the variant context.
+* __AcceptorContextLength:__ The length of the acceptor context that constructed the variant context.
+* __DonorContextLength:__ The length of the donor context that constructed the variant context.
+* __AcceptorReads:__ The number of acceptor reads and mates overlapping with the variant context.
+* __DonorReads:__ The number of donor reads and mates overlapping with the variant context.
+* __ADratio:__ The ratio of acceptor reads over donor reads that will be exchanged.
+* __AcceptorReadIds:__ The read identifiers (one per mate pair) overlapping with the variant context.
+* __DonorReadsIds:__ The read identifiers (one per mate pair) overlapping with the variant context.
+
+The first line in each variant context file is the uuid of the VaSeBuilder run that constructed the variant contexts. A
+variant context file looks like:
+
+_#VBUUID: {uuid}
+\#ContextId DonorSample Chrom Origin Start End AcceptorContextLength
+ DonorContextLength AcceptorReads DonorReads ADratio AcceptorReadsIds
+ DonorReadIds
+1_100 Sample1 1 100 50 150 75 75 2 2 1.0
+ aReadId1,aReadId2 dReadId1,dReadId2_
+
+
+
+### Variant context statistics file
+The variant context statistics file contains some statistics about the reads and mates overlapping with the variant
+contexts. Per variant context the read statistics consists, in order, of the average and median read length, q-score
+and mapq values for acceptor and donor reads overlapping with the variant context.
+
+_#ContextId Avg_ALen Avg_DLen Med_ALen Med_DLen Avg_AQual Avg_DQual Med_AQual
+ Med_DQual Avg_AMapQ Avg_DMapQ Med_AMapQ Med_DMapQ
+1_100 151 151 151.0 151.0 36.5 35.6 37.0 38.0 54.6 55.7
+ 60.0 60.0_
+
+
+
+### Variant context bed file
+A BED file with the variant context entries in four columns: Chromosome name, context start, context end, context
+identifier. The output name for the BED file is _variantcontexts.bed_. The resulting file looks like:
+
+_1 100 1000 1_500
+2 200 2000 2_1000_
+
+
+
+## Debug output files
+When debug is set to True multiple extra files are outputted as well providing more in depth information.
+
+
+
+### Variant context acceptor/donor read positions file
+Much like the acceptor/donor read positions the variant context acceptor positions and variant context donor positions
+files save the leftmost R1 and rightmost R2 acceptor and donor read positions respectively.
+
+
+
+### Variant context unmapped acceptor/donor read mates file
+Records the identifiers of reads that have an unmapped read mate in a tab separated manner. The file contains three
+columns: Context identifier, sample name/identifier, read identifiers.
+
+_#ContextId SampleId ReadIds
+1_100 Sample1 uReadId1,uReadId2,uReadId3_
+
+
+### Acceptor/Donor contexts file
+The acceptor/donor context file is essentially a variant context file but for acceptor or donor contexts respectively.
+The acceptor/donor context file differs in the number of columns and only has two additional columns after ContextId -
+End, namely NumOfReads and ReadIds.
+
+_VBUUID: {uuid}
+\#ContextId DonorSample Chrom Origin Start End AcceptorContextLength
+ DonorContextLength NumOfReads ReadIds
+1_100 Sample1 1 100 50 150 2 dReadId1,dReadId2_
+
+
+
+### Acceptor/Donor context statistics file
+Similar to the variant context statistics file, the acceptor/donor context statistics file contains basic statistics of
+the reads overlapping with acceptor or donor contexts.
+
+_#ContextId &empsp;Avg_ReadLen MedReadLen AvgReadQual Med_ReadQual AvgReadMapQ
+ MedReadMapQ
+1_100 151 151.0 37.9 39.0 57.1 60.0_
+
+
+
+### Acceptor/Donor context read positions file
+The read positions contains the leftmost genomic positions of all R1 reads and rightmost genomic positions of all R2
+reads in all acceptor or donor contexts. These positions can be plotted, if needed, to gain more insight into the outer
+ranges of the reads associated with a variant context. The file contains three columns: context identifier, left
+positions, right positions. Each leftmost and rightmost position is separated by a comma.
+_#ContextId LeftPos RightPos
+1_100 25,50,75 75,50,25_
+
+
+
+### Acceptor/Donor context unmapped read mates file
+Reads overlapping with the acceptor/donor contexts with unmapped read mates encountered during a VaSeBuilder run are
+written the the acceptor_unmapped.txt or donor_unmapped.txt
+
diff --git a/tests/TestDonorBamRead.py b/tests/TestDonorBamRead.py
index 0c8a672..3a1f3f5 100644
--- a/tests/TestDonorBamRead.py
+++ b/tests/TestDonorBamRead.py
@@ -6,17 +6,25 @@ class TestDonorBamRead(unittest.TestCase):
# Creates the answer variables and the DonorBamRead used for testing
def setUp(self):
self.read_id_answer = "HHKY2CCXX160108:1:2122:24160:2522"
+ self.read_flag_answer = "143"
self.read_pn_answer = "1"
self.read_chrom_answer = "21"
self.read_pos_answer = 9411193
self.read_len_answer = 151
+ self.read_end_answer = 9411331
+ self.read_cigarstring_answer = "13S138M"
+ self.read_rnext_answer = "21"
+ self.read_pnext_answer = 9411300
+ self.read_tlen_answer = 4398
self.read_seq_answer = "AGAAAAAGTCTTTAGATGGGATCTTCCTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGTCCAATCAGA" \
"CTGAAATGCCTTGAGGCTAGATTTCAGTCTTTGTGGCAGCTGGTGAATTTCTAGTTTGCCTTTTCA"
self.read_quals_answer = "><=???>==<=====<====<=<==<==<=====<============<==<========<=====<=<==<==>>==>>>>=>" \
">==>>=>>>>>>>>>=>>>>>>=>>>=>>>=>>>>>?????????>=>>???>??????@@@?><:8>"
self.read_map_q_answer = 40
- self.dbam_read = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer,
- self.read_pos_answer, self.read_len_answer, self.read_seq_answer,
+ self.dbam_read = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer,
+ self.read_chrom_answer, self.read_pos_answer, self.read_len_answer,
+ self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer,
+ self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer,
self.read_quals_answer, self.read_map_q_answer)
# ====================PERFORM THE TESTS FOR THE GETTER METHODS====================
@@ -24,12 +32,16 @@ def test_get_bam_read_id(self):
self.assertEqual(self.dbam_read.get_bam_read_id(), self.read_id_answer, "Both BAM read identifiers should have "
f"been {self.read_id_answer}")
+ def tet_get_bam_read_flag(self):
+ self.assertEqual(self.dbam_read.get_bam_read_flag(), self.read_flag_answer,
+ f"Both read flag should have been {self.read_flag_answer}")
+
def test_get_bam_read_pair_number(self):
- self.assertEquals(self.dbam_read.get_bam_read_pair_number(), self.read_pn_answer, "Both read pair numbers "
+ self.assertEqual(self.dbam_read.get_bam_read_pair_number(), self.read_pn_answer, "Both read pair numbers "
f"should have been {self.read_pn_answer}")
def test_get_bam_read_chrom(self):
- self.assertEquals(self.dbam_read.get_bam_read_chrom(), self.read_chrom_answer, "Both read chromosomes should "
+ self.assertEqual(self.dbam_read.get_bam_read_chrom(), self.read_chrom_answer, "Both read chromosomes should "
f"have been {self.read_chrom_answer}")
def test_get_bam_read_ref_pos(self):
@@ -41,9 +53,28 @@ def test_get_bam_read_length(self):
f"been {self.read_len_answer}")
def test_get_bam_read_ref_end(self):
- read_end_answer = 9411344
- self.assertEqual(self.dbam_read.get_bam_read_ref_end(), read_end_answer, "Both read end positions should have "
- f"been {read_end_answer}")
+ self.assertEqual(self.dbam_read.get_bam_read_ref_end(), self.read_end_answer, "Both read end positions should have "
+ f"been {self.read_end_answer}")
+
+ def test_get_bam_read_cigar(self):
+ self.assertEqual(self.dbam_read.get_bam_read_cigar(), self.read_cigarstring_answer,
+ f"Both read CIGAR strings should have been {self.read_cigarstring_answer}")
+
+ def test_get_bam_read_rnext(self):
+ self.assertEqual(self.dbam_read.get_bam_read_rnext(), self.read_rnext_answer,
+ f"Both read RNEXT values should have been {self.read_rnext_answer}")
+
+ def test_get_bam_read_pnext(self):
+ self.assertEqual(self.dbam_read.get_bam_read_pnext(), self.read_pnext_answer,
+ f"Both read PNEXT values should have been {self.read_pnext_answer}")
+
+ def test_get_bam_read_tlen(self):
+ self.assertEqual(self.dbam_read.get_bam_read_tlen(), self.read_tlen_answer,
+ f"Both read TLEN values should have been {self.read_tlen_answer}")
+
+ def test_get_bam_read_sequence(self):
+ self.assertEqual(self.dbam_read.get_bam_read_sequence(), self.read_seq_answer,
+ f"Both read sequences should have been {self.read_seq_answer}")
def test_get_bam_read_qual(self):
self.assertEqual(self.dbam_read.get_bam_read_qual(), self.read_quals_answer, "Both read qualities should have "
@@ -64,6 +95,10 @@ def test_get_bam_read_mapq(self):
self.assertEqual(self.dbam_read.get_mapping_qual(), self.read_map_q_answer, "Both MapQ values should have been "
f"{self.read_map_q_answer}")
+ def test_read_has_hardclipped_bases(self):
+ self.assertFalse(self.dbam_read.read_has_hardclipped_bases(),
+ f"Read with {self.read_cigarstring_answer} should not have hard clipped bases.")
+
# ====================PERFORM THE TESTS FOR THE STATISTICS METHODS====================
def test_get_average_q_score(self):
avg_qscore_answer = 28.490066225165563
@@ -100,3 +135,11 @@ def test_get_as_fast_q_seq_nopairnum(self):
fastq_answer = f"@{self.read_id_answer}\n{self.read_seq_answer}\n+\n{self.read_quals_answer}\n"
self.assertEqual(self.dbam_read.get_as_fastq_seq(), fastq_answer, "Both answers should have been "
f"{fastq_answer}")
+
+ def test_get_as_bam_read(self):
+ bamcram_entry = f"{self.read_id_answer}\t{self.read_flag_answer}\t{self.read_chrom_answer}\t" \
+ f"{self.read_pos_answer}\t{self.read_map_q_answer}\t{self.read_cigarstring_answer}\t" \
+ f"{self.read_rnext_answer}\t{self.read_pnext_answer}\t{self.read_tlen_answer}\t{self.read_seq_answer}" \
+ f"{self.read_quals_answer}"
+ self.assertEqual(self.dbam_read.get_as_bam_read(), bamcram_entry,
+ f"The returned entry shoud have been {bamcram_entry}")
diff --git a/tests/TestOverlapContext.py b/tests/TestOverlapContext.py
index 6f12d6f..371ca54 100644
--- a/tests/TestOverlapContext.py
+++ b/tests/TestOverlapContext.py
@@ -8,23 +8,35 @@ class TestOverlapContext(unittest.TestCase):
def setUp(self):
# Create three DonorBamRead objects to add to the overlap context
self.read_id_answer = "HHKY2CCXX160108:1:2122:24160:2522"
+ self.read_flag_answer = "143"
self.read_pn_answer = "1"
self.read_chrom_answer = "21"
self.read_pos_answer = 9411193
self.read_len_answer = 151
+ self.read_end_answer = 9411331
+ self.read_cigarstring_answer = "13S138M"
+ self.read_rnext_answer = "21"
+ self.read_pnext_answer = 9411300
+ self.read_tlen_answer = 4398
self.read_seq_answer = "AGAAAAAGTCTTTAGATGGGATCTTCCTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGTCCAATCAGA" \
"CTGAAATGCCTTGAGGCTAGATTTCAGTCTTTGTGGCAGCTGGTGAATTTCTAGTTTGCCTTTTCA"
self.read_quals_answer = "><=???>==<=====<====<=<==<==<=====<============<==<========<=====<=<==<==>>==>>>>=>" \
">==>>=>>>>>>>>>=>>>>>>=>>>=>>>=>>>>>?????????>=>>???>??????@@@?><:8>"
self.read_mapq_answer = 40
- self.con_read1 = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer,
- self.read_pos_answer, self.read_len_answer, self.read_seq_answer,
+ self.con_read1 = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer,
+ self.read_chrom_answer, self.read_pos_answer, self.read_len_answer,
+ self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer,
+ self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer,
self.read_quals_answer, self.read_mapq_answer)
- self.con_read2 = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer,
- self.read_pos_answer, self.read_len_answer, self.read_seq_answer,
+ self.con_read2 = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer,
+ self.read_chrom_answer, self.read_pos_answer, self.read_len_answer,
+ self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer,
+ self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer,
self.read_quals_answer, self.read_mapq_answer)
- self.con_read3 = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer,
- self.read_pos_answer, self.read_len_answer, self.read_seq_answer,
+ self.con_read3 = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer,
+ self.read_chrom_answer, self.read_pos_answer, self.read_len_answer,
+ self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer,
+ self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer,
self.read_quals_answer, self.read_mapq_answer)
# Create the overlap context to test
@@ -36,7 +48,7 @@ def setUp(self):
self.context_end_answer = 9411344
self.context_len_answer = 151
self.context_bam_reads_answer = [self.con_read1, self.con_read2, self.con_read3]
- self.unmapped_answer = []
+ self.unmapped_answer = ["uread_1", "uread_2", "uread_3"]
self.overlap_context = OverlapContext(self.context_id_answer, self.context_sample_answer,
self.context_chrom_answer, self.context_origin_answer,
self.context_start_answer, self.context_end_answer,
@@ -55,6 +67,12 @@ def setUp(self):
self.avg_med_read_mapq_answer = [40, 40]
# ====================PERFORM THE TESTS FOR THE GETTER METHODS====================
+ def test_get_context(self):
+ context_answer = [self.context_chrom_answer, self.context_origin_answer, self.context_start_answer,
+ self.context_end_answer]
+ self.assertListEqual(self.overlap_context.get_context(), context_answer,
+ f"The returned context array should have been {context_answer}")
+
def test_get_context_id(self):
self.assertEqual(self.overlap_context.get_context_id(), self.context_id_answer, "The context id should have "
f"been {self.context_id_answer}")
@@ -148,10 +166,10 @@ def test_get_context_bam_read_qualities(self):
self.assertEqual(self.overlap_context.get_context_bam_read_qualities(), context_read_quals_answer, "The read "
f"qualites should have been {context_read_quals_answer}")
- #def test_getContextBamReadQScores(self):
- #contextReadQScoresAnswer = self.qscoreAnswer + self.qscoreAnswer + self.qscoreAnswer
- #self.assertListEqual(self.overlapContext.getContextBamReadQScores(), contextReadQScoresAnswer,
- # f"The Q-scores should have been {contextReadQScoresAnswer}")
+ def test_get_context_bam_read_q_scores(self):
+ context_read_q_scores_answer = [self.qscore_answer] + [self.qscore_answer] + [self.qscore_answer]
+ self.assertListEqual(self.overlap_context.get_context_bam_read_q_scores(), context_read_q_scores_answer,
+ f"The Q-scores should have been {context_read_q_scores_answer}")
def test_get_context_bam_read_map_qs(self):
context_read_map_qs_answer = [40, 40, 40]
@@ -167,6 +185,29 @@ def test_read_is_in_context_neg(self):
self.assertFalse(self.overlap_context.read_is_in_context(read_in_context_answer_neg), "Read id "
f"{read_in_context_answer_neg} should not have been in the context")
+ # ====================PERFORM THE TESTS FOR UNMAPPED MATE IDS====================
+ def test_add_unmapped_mate_id(self):
+ read_id_to_add = "uReadId1"
+ self.overlap_context.add_unmapped_mate_id(read_id_to_add)
+ self.assertTrue(read_id_to_add in self.overlap_context.unmapped_read_mate_ids,
+ f"Read id {read_id_to_add} should have been in the list of reads with unmapped mates")
+
+ def test_set_unmapped_mate_ids(self):
+ unmapped_read_ids = ["uReadId1", "uReadId2", "uReadId3"]
+ self.overlap_context.set_unmapped_mate_ids(unmapped_read_ids)
+ self.assertListEqual(self.overlap_context.unmapped_read_mate_ids, unmapped_read_ids,
+ f"The list of unmapped read mate ids should have been {unmapped_read_ids}")
+
+ def test_read_has_unmapped_mate_true(self):
+ read_id_to_search = "uread_1"
+ self.assertTrue(self.overlap_context.read_has_unmapped_mate(read_id_to_search),
+ f"Read {read_id_to_search} should have an unmapped mate")
+
+ def test_read_has_unmapped_mate_false(self):
+ read_id_to_search = "fReadId1"
+ self.assertFalse(self.overlap_context.read_has_unmapped_mate(read_id_to_search),
+ f"Read {read_id_to_search} should not have an unmapped mate")
+
# ====================PERFORM THE TESTS FOR THE OVERLAP CONTEXT STATISTICS====================
def test_get_average_and_median_read_length(self):
self.assertListEqual(self.overlap_context.get_average_and_median_read_length(), self.avg_med_read_len_answer,
diff --git a/tests/TestParamChecker.py b/tests/TestParamChecker.py
index e4c42a8..bb520d9 100644
--- a/tests/TestParamChecker.py
+++ b/tests/TestParamChecker.py
@@ -355,3 +355,55 @@ def test_required_parameters_set_invalidxcmodeparam(self):
set_parameters = {}
self.assertFalse(self.param_check.required_parameters_set("XC", set_parameters),
"The required parameters for XC mode should not have been ok and therefore return False")
+
+ # =====TEST SOME OTHER METHODS=====
+ def test_get_required_runmode_parameters_acmode(self):
+ ac_params_answer = ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin", "out"]
+ self.assertListEqual(self.param_check.get_required_runmode_parameters("AC"), ac_params_answer,
+ f"The list of required AC-mode parameters should have been {ac_params_answer}")
+
+ def test_get_required_runmode_parameters_dmode(self):
+ d_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"]
+ self.assertListEqual(self.param_check.get_required_runmode_parameters("D"), d_params_answer,
+ f"The list of required D-mode parameters should have been {d_params_answer}")
+
+ def test_get_required_runmode_parameters_dcmode(self):
+ dc_params_answer = ["runmode", "donorvcf", "donorbam", "out", "reference", "varconin"]
+ self.assertListEqual(self.param_check.get_required_runmode_parameters(""), dc_params_answer,
+ f"The list of required DC-mode parameters should have been {dc_params_answer}")
+
+ def test_get_required_runmode_parameters_fmode(self):
+ f_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "templatefq1", "templatefq2", "out",
+ "reference"]
+ self.assertListEqual(self.param_check.get_required_runmode_parameters("F"), f_params_answer,
+ f"The list of required F-mode parameters should have been {f_params_answer}")
+
+ def test_get_required_runmode_parameters_fcmode(self):
+ fc_params_answer = ["runmode", "donorvcf", "donorbam", "templatefq1", "templatefq2", "out", "reference",
+ "varconin"]
+ self.assertListEqual(self.param_check.get_required_runmode_parameters("FC"), fc_params_answer,
+ f"the list of required FC-mode parameters should have been {fc_params_answer}")
+
+ def test_get_required_runmode_parameters_pmode(self):
+ p_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"]
+ self.assertListEqual(self.param_check.get_required_runmode_parameters("P"), p_params_answer,
+ f"The list of required parameters should have been {p_params_answer}")
+
+ def test_get_required_runmode_parameters_pcmode(self):
+ pc_params_answer = ["runmode", "donorvcf", "donorbam", "out", "reference", "varconin"]
+ self.assertListEqual(self.param_check.get_required_runmode_parameters("PC"), pc_params_answer,
+ f"The list of required parameters should have been {pc_params_answer}")
+
+ def test_get_required_runmode_parameters_xmode(self):
+ x_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"]
+ self.assertListEqual(self.param_check.get_required_runmode_parameters("X"), x_params_answer,
+ f"The list of required parameters should have been {x_params_answer}")
+
+ def test_get_required_runmode_parameters_nonexistent(self):
+ self.assertListEqual(self.param_check.get_required_runmode_parameters("T"), [],
+ f"The list of required parameters for a non existent mode should have been empty.")
+
+ def test_get_optional_parameters(self):
+ opt_params_answer = ["fastqout", "varcon", "variantlist", "seed"]
+ self.assertListEqual(self.param_check.get_optional_parameters(), opt_params_answer,
+ f"The list of optional parameters should have been {opt_params_answer}")
diff --git a/tests/TestVaSeBuilder.py b/tests/TestVaSeBuilder.py
index e2c977f..b30edcd 100644
--- a/tests/TestVaSeBuilder.py
+++ b/tests/TestVaSeBuilder.py
@@ -6,6 +6,7 @@
from unittest.mock import patch, mock_open, call
import os
import sys
+import random
import pysam
# Import required class
@@ -26,6 +27,7 @@ def setUp(self):
self.filter_to_use = ["aap", "noot", "mies"]
self.vcfvarlist = []
+ # ====================PERFORM THE TESTS FOR THE FILTER METHOD====================
# Tests that a value is indeed in the inclusion filter
def test_passes_filter_posincl(self):
val_to_use = "noot"
@@ -58,7 +60,7 @@ def test_get_sample_vcf_variants_pos(self):
variant_list_answer.sort()
obtained_variant_list = self.vs_builder.get_sample_vcf_variants(vcffile)
- obtained_list = [x.to_strng() for x in obtained_variant_list]
+ obtained_list = [x.to_string() for x in obtained_variant_list]
obtained_list.sort()
self.assertListEqual(obtained_list, variant_list_answer, "The obtained list of reads should have been "
@@ -83,7 +85,7 @@ def test_determine_variant_type_indel(self):
# Tests that an indel is determined correctly
def test_determine_variant_type_indel2(self):
variant_ref = "C,CGATC"
- variant_alts = ("C")
+ variant_alts = ("C",)
variant_type_answer = "indel"
self.assertEqual(self.vs_builder.determine_variant_type(variant_ref, variant_alts), variant_type_answer,
"The determined variant type should have been an indel")
@@ -177,24 +179,27 @@ def test_is_required_read_neg(self):
"Should have been false for forward read")
# Tests setting the fastq output path for the F (R1) file
- def test_setFastqOutPath(self):
- output_path = "/aap/noot/mies"
- fastq_out_name = ""
+ def test_set_fastq_out_path_r1(self):
+ outprefix = "/test/outdata/VaSe"
+ lanenum = 3
+ outpath_answer = f"{outprefix}_{datetime.now().date()}_L{lanenum}_R1.fastq"
+ self.assertEqual(self.vs_builder.set_fastq_out_path(outprefix, '1', lanenum), outpath_answer,
+ f"The returned output path for R1 should have been {outpath_answer}")
+
+ def test_set_fastq_out_path_r2(self):
+ outprefix = "/test/outdata/VaSe"
+ lanenum = 3
+ outpath_answer = f"{outprefix}_{datetime.now().date()}_L{lanenum}_R2.fastq"
+ self.assertEqual(self.vs_builder.set_fastq_out_path(outprefix, '2', lanenum), outpath_answer,
+ f"The returned output path for R2 should have been {outpath_answer}")
# Tests that the creation id of the VaSeBuilder object is set correctly
- def test_getCreationId(self):
+ def test_get_creation_id(self):
creationid_answer = "piet"
vasebuilder_obj = VaSeBuilder(creationid_answer)
self.assertEqual(vasebuilder_obj.get_creation_id(), creationid_answer,
f"The returned VaSeBuilder identifier should have been: {creationid_answer}")
- # Tests that the creation date of the VaSeBuilder is set correctly.
- def test_get_creation_date(self):
- creation_data_answer = datetime.now().date()
- vasebuilder_obj = VaSeBuilder("aap")
- self.assertEqual(vasebuilder_obj.get_creation_date(), creation_data_answer,
- f"The returned VaSeBuilder creation data should have been: {creation_data_answer}")
-
# Tests that a saved context for a specified variant is indeed returned.
def test_get_variant_context_pos(self):
self.vs_builder.variantContextMap = self.var_con_map
@@ -211,22 +216,109 @@ def test_get_variant_context_neg(self):
# =
# Tests that a list of donor is divided without a remainder list
- def test_divide_donorfastqs_over_acceptors_noremainder(self):
+ def test_divide_donorfastqs_over_acceptors_equal(self):
donorfq_list = ['aR1.fq', 'bR1.fq', 'cR1.fq', 'dR1.fq', 'eR1.fq', 'fR1.fq']
- divlist_answer = [['aR1.fq', 'bR1.fq'], ['cR1.fq', 'dR1.fq'], ['eR1.fq', 'fR1.fq']]
+ divlist_answer = [['fR1.fq', 'cR1.fq'], ['eR1.fq', 'bR1.fq'], ['dR1.fq', 'aR1.fq']]
self.assertListEqual(self.vs_builder.divide_donorfastqs_over_acceptors(donorfq_list, 3), divlist_answer,
f"The returned divided donor fastq list should have been {divlist_answer}")
# Tests that a list of donors is divided with a remainder list
- def test_divide_donorfastqs_over_acceptors_withremainder(self):
+ def test_divide_donorfastqs_over_acceptors_unequal(self):
donorfq_list = ['aR1.fq', 'bR1.fq', 'cR1.fq', 'dR1.fq', 'eR1.fq']
- divlist_answer = [['aR1.fq', 'bR1.fq'], ['cR1.fq', 'dR1.fq'], ['eR1.fq']]
+ divlist_answer = [['eR1.fq', 'cR1.fq', 'aR1.fq'], ['dR1.fq', 'bR1.fq']]
self.assertListEqual(self.vs_builder.divide_donorfastqs_over_acceptors(donorfq_list, 2), divlist_answer,
f"The returned divided donor fastq list should have been {divlist_answer}")
- # Tests that the remainders are added to the proper lists.
- def test_divide_remaining_donors(self):
- divdonors = [['aR1.fq', 'bR1.fq'], ['cR1.fq', 'dR1.fq'], ['eR1.fq']]
- cordivdonors = [['aR1.fq', 'bR1.fq', 'eR1.fq'], ['cR1.fq', 'dR1.fq']]
- self.assertListEqual(self.vs_builder.divide_remaining_donors(divdonors), cordivdonors,
- f"The correct divided donor fastq list should have been {cordivdonors}")
+ # ====================PERFORM TESTS FOR THE SHUFFLING METHODS====================
+ # Tests that the shuffling of donor read works as expected
+ def test_shuffle_donor_read_identifiers(self):
+ read_ids = ["Read4", "Read3", "Read5", "Read1", "Read2"]
+
+ shuffled_answer = ["Read3", "Read2", "Read4", "Read5", "Read1"]
+ shuffled_answer = read_ids.copy()
+ shuffled_answer.sort()
+ random.seed(2)
+ random.shuffle(shuffled_answer)
+
+ self.assertListEqual(self.vs_builder.shuffle_donor_read_identifiers(read_ids, 2), shuffled_answer,
+ f"The correct shuffled donor read ids should have been {shuffled_answer}")
+
+ def test_shuffle_donor_add_positions(self):
+ add_positions = [6, 8, 2, 0, 1]
+ shuffled_answer = add_positions.copy()
+
+ def test_get_saved_insert_position_r1(self):
+ read_data = ('1', 100, '2', 100)
+ r1pos_answer = 100
+ self.assertEqual(self.vs_builder.get_saved_insert_position('1', read_data), r1pos_answer,
+ f"The R1 read insert position should have been {r1pos_answer}")
+
+ def test_get_saved_insert_position_r2(self):
+ read_data = ('1', 100, '2', 150)
+ r2pos_answer = 150
+ self.assertEqual(self.vs_builder.get_saved_insert_position('2', read_data), r2pos_answer,
+ f"The R2 read insert position should have been {r2pos_answer}")
+
+ def test_get_saved_insert_position_nor1(self):
+ read_data = ('2', 150)
+ r1pos_answer = 'NA'
+ self.assertEqual(self.vs_builder.get_saved_insert_position('1', read_data), r1pos_answer,
+ "The R1 read insert position should have been NA")
+
+ def test_get_saved_insert_position_nor2(self):
+ read_data = ('1', 100)
+ r2pos_answer = 'NA'
+ self.assertEqual(self.vs_builder.get_saved_insert_position('2', read_data), r2pos_answer,
+ "The R2 read insert position should have been NA")
+
+ # Adds a donor insert data position for a new fastq
+ def test_add_donor_insert_data_addnewfq(self):
+ donor_add_data = {}
+ fq_name = "ifq.fastq"
+ read_id = "iReadId1"
+ read_pairnum = "1"
+ insert_pos = 100
+ donor_add_data_answer = {"ifq.fastq": [("1", 100)]}
+ self.vs_builder.add_donor_insert_data(fq_name, read_id, read_pairnum, insert_pos, donor_add_data)
+ self.assertDictEqual(donor_add_data, donor_add_data_answer,
+ f"Both donor read add data should have been {donor_add_data_answer}")
+
+ # Tests adding a new donor insert position data for an existing fastq
+ def test_add_donor_insert_data_addnewread(self):
+ donor_add_data = {"ifq.fastq": []}
+ fq_name = "ifq.fastq"
+ read_id = "iReadId1"
+ read_pairnum = "1"
+ insert_pos = 100
+ donor_add_data_answer = {"ifq.fastq": [("1", 100)]}
+ self.vs_builder.add_donor_insert_data(fq_name, read_id, read_pairnum, insert_pos, donor_add_data)
+ self.assertDictEqual(donor_add_data, donor_add_data_answer,
+ f"Both donor read add data should have been {donor_add_data_answer}")
+
+ def test_add_donor_insert_data_addadditional(self):
+ donor_add_data = {"ifq.fastq": [("1", 100)]}
+ fq_name = "ifq.fastq"
+ read_id = "iReadId1"
+ read_pairnum = "2"
+ insert_pos = 100
+ donor_add_data_answer = {"ifq.fastq": [("1", 100, "2", 100)]}
+ self.vs_builder.add_donor_insert_data(fq_name, read_id, read_pairnum, insert_pos, donor_add_data)
+ self.assertDictEqual(donor_add_data, donor_add_data_answer,
+ f"Both donor read add data should have been {donor_add_data_answer}")
+
+ def test_link_donor_addpos_reads_v2(self):
+ addposlist = [6, 8]
+ readidlist = ['dRead1', 'dRead2']
+ donor_reads = [('dRead1', '1', 'ACAT', '<<>>'), ('dRead1', '2', 'TACA', '>><<'),
+ ('dRead2', '1', 'CAAT', '<><>'), ('dRead2', '2', 'TAAC', '><><')]
+ donor_addpos_link_answer = {6: [('dRead1', '1', 'ACAT', '<<>>'), ('dRead1', '2', 'TACA', '>><<')],
+ 8: [('dRead2', '1', 'CAAT', '<><>'), ('dRead2', '2', 'TAAC', '><><')]}
+ self.assertDictEqual(self.vs_builder.link_donor_addpos_reads_v2(addposlist, readidlist, donor_reads),
+ donor_addpos_link_answer,
+ f"Both donor read/add position link maps should have been {donor_addpos_link_answer}")
+
+ def test_check_template_size(self):
+ num_of_template_reads = 56732
+ obtained_template_size = self.vs_builder.check_template_size("testdata/fqDir/SRR1039513_1.fastq.gz")
+ self.assertEqual(obtained_template_size, num_of_template_reads,
+ f"The number of template reads was expected to be {num_of_template_reads}")
diff --git a/tests/TestVariantContext.py b/tests/TestVariantContext.py
index c24d87f..265e332 100644
--- a/tests/TestVariantContext.py
+++ b/tests/TestVariantContext.py
@@ -9,22 +9,33 @@ def setUp(self):
# Create the variables saving the context BAM reads
self.read_id_answer = "HHKY2CCXX160108:1:2122:24160:2522"
self.read_donor_id_answer = "HHKY2CCXX160108:1:2122:24160:2555"
+ self.read_flag_answer = "143"
self.read_pn_answer = "1"
self.read_donor_pn_answer = "2"
self.read_chrom_answer = "21"
self.read_pos_answer = 9411193
self.read_len_answer = 151
self.read_donor_len_answer = 108
+ self.read_end_answer = 9411334
+ self.read_cigarstring_answer = "13S138M"
+ self.read_donor_cigarstring_answer = "43H108M"
+ self.read_rnext_answer = "21"
+ self.read_pnext_answer = 9511300
+ self.read_tlen_answer = 400
self.read_seq_answer = "AGAAAAAGTCTTTAGATGGGATCTTCCTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGTCCAATCAGA" \
"CTGAAATGCCTTGAGGCTAGATTTCAGTCTTTGTGGCAGCTGGTGAATTTCTAGTTTGCCTTTTCA"
self.read_quals_answer = "><=???>==<=====<====<=<==<==<=====<============<==<========<=====<=<==<==>>==>>>>=>" \
">==>>=>>>>>>>>>=>>>>>>=>>>=>>>=>>>>>?????????>=>>???>??????@@@?><:8>"
self.read_map_q_answer = 40
- self.acceptor_read = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer,
- self.read_pos_answer, self.read_len_answer, self.read_seq_answer,
+ self.acceptor_read = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer,
+ self.read_chrom_answer, self.read_pos_answer, self.read_len_answer,
+ self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer,
+ self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer,
self.read_quals_answer, self.read_map_q_answer)
- self.donor_read = DonorBamRead(self.read_donor_id_answer, self.read_donor_pn_answer, self.read_chrom_answer,
- self.read_pos_answer, self.read_donor_len_answer, self.read_seq_answer,
+ self.donor_read = DonorBamRead(self.read_donor_id_answer, self.read_flag_answer, self.read_donor_pn_answer,
+ self.read_chrom_answer, self.read_pos_answer, self.read_donor_len_answer,
+ self.read_end_answer, self.read_donor_cigarstring_answer, self.read_rnext_answer,
+ self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer,
self.read_quals_answer, self.read_map_q_answer)
self.acceptor_read_ids_answer = ["HHKY2CCXX160108:1:2122:24160:2522"]
diff --git a/tests/TestVcfVariant.py b/tests/TestVcfVariant.py
index 911ea80..768f5e2 100644
--- a/tests/TestVcfVariant.py
+++ b/tests/TestVcfVariant.py
@@ -8,7 +8,7 @@ def setUp(self):
self.variant_pos_answer = 100
self.variant_ref_answer = "C"
self.variant_alts_answer = ("A", "G")
- self.variant_filter_answer = ["PASS"]
+ self.variant_filter_answer = "PASS"
self.variant_type_answer = "snp"
self.variant_id_answer = f"{self.variant_chrom_answer}_{self.variant_pos_answer}"
self.vcfvariant_obj_answer = VcfVariant(self.variant_chrom_answer, self.variant_pos_answer,
@@ -58,14 +58,14 @@ def test_set_variant_alt_alleles(self):
f"alleles should have been: {varalts_toset}")
def test_get_variant_filter(self):
- self.assertListEqual(self.vcfvariant_obj_answer.get_variant_filter(), self.variant_filter_answer, "Both variant"
- f" filters should have been: {self.variant_filter_answer}")
+ self.assertEqual(self.vcfvariant_obj_answer.get_variant_filter(), self.variant_filter_answer,
+ f"Both variant filters should have been: {self.variant_filter_answer}")
def test_set_variant_filter(self):
- varfilter_touse = ['q10']
+ varfilter_touse = "q10"
self.vcfvariant_obj_answer.set_variant_filter(varfilter_touse)
- self.assertListEqual(self.vcfvariant_obj_answer.get_variant_filter(), varfilter_touse, "The set variant filter "
- f"should have been: {varfilter_touse}")
+ self.assertEqual(self.vcfvariant_obj_answer.get_variant_filter(), varfilter_touse, "The set variant filter "
+ f"should have been: {varfilter_touse}")
def test_get_variant_type(self):
self.assertEqual(self.vcfvariant_obj_answer.get_variant_type(), self.variant_type_answer, "Both variant types "
diff --git a/vase.py b/vase.py
index 05d328a..95d70e0 100644
--- a/vase.py
+++ b/vase.py
@@ -12,36 +12,53 @@
from ParamChecker import ParamChecker
from VcfBamScanner import VcfBamScanner
from VaSeBuilder import VaSeBuilder
+from VariantContextFile import VariantContextFile
class VaSe:
- # Performs the check that VaSe is run with Python 3.x
def __init__(self):
+ """Checks both the python and pysam version.
+
+ If the python version is not at least 3.6 or higher the program will not run as f-strings, that are used in this
+ program were not available in older versions. The program also requires pysam 0.15 or higher as some pysma
+ functions used in VaseBuilder are only available since pysam 0.15.
+
+ Attributes
+ ----------
+ valid_runmodes : list of str
+ Valid VaSeBuilder run modes
+ """
assert (sys.version_info[0] >= 3 and sys.version_info[1] >= 6), "Please run this program in Python 3.6 or " \
"higher"
assert (int(pysam.version.__version__.split(".")[0]) >= 0 and int(pysam.version.__version__.split(".")[1]) >=
15), "Please run this program with Pysam 0.15 or higher"
- self.valid_runmodes = ["A", "AC", "D", "DC", "F", "FC", "P", "PC", "X", "XC"]
+ self.valid_runmodes = ["AC", "D", "DC", "F", "FC", "P", "PC", "X"]
# Runs the program.
def main(self):
+ """Runs VaSeBuilder and performs all the work.
+ """
# Parse the command line parameters and check their validity.
vase_arg_list = self.get_vase_parameters()
pmc = ParamChecker()
- # Start the logger and initialize this run with an ID number.
- self.vaselogger = self.start_logger(pmc, vase_arg_list["log"], vase_arg_list["debug"])
-
- vase_called_command = " ".join(sys.argv)
- self.vaselogger.info(f"python {vase_called_command}")
- vase_b = VaSeBuilder(uuid.uuid4().hex)
# Check whether a configuration file was supplied than all required command line parameters.
if vase_arg_list["configfile"] is not None:
configfileloc = vase_arg_list["configfile"]
vase_arg_list = self.read_config_file(configfileloc) # Set the read config file as the parameter list
+ # Check whether the DEBUG parameters has been set or not
+ if "debug" not in vase_arg_list:
+ vase_arg_list["debug"] = False
# Check optional parameters and set those missing to None
vase_arg_list = pmc.optional_parameters_set(vase_arg_list)
+ # Start the logger and initialize this run with an ID number.
+ self.vaselogger = self.start_logger(pmc, vase_arg_list["log"], vase_arg_list["debug"])
+
+ vase_called_command = " ".join(sys.argv)
+ self.vaselogger.info(f"python {vase_called_command}")
+ vase_b = VaSeBuilder(uuid.uuid4().hex)
+
# Exit if not all of the required parameters have been set
if not pmc.required_parameters_set(vase_arg_list["runmode"], vase_arg_list):
self.vaselogger.critical("Not all required parameters have been set")
@@ -52,48 +69,13 @@ def main(self):
self.vaselogger.critical("Not all required parameters are correct. Please check log for more info.")
exit()
- if "A" in pmc.runmode:
- donor_fastq_files = self.read_donor_fastq_list_file(pmc.get_donorfqlist())
- r1_dfqs = [dfq[0] for dfq in donor_fastq_files]
- r2_dfqs = [dfq[1] for dfq in donor_fastq_files]
- vase_b.build_validation_from_donor_fastqs(pmc.get_first_fastq_in_location(),
- pmc.get_second_fastq_in_location(),
- r1_dfqs, r2_dfqs, pmc.varconin, pmc.get_fastq_out_location())
- else:
- # Scan the variant and alignment files in the provided lists.
- vbscan = VcfBamScanner()
- vcf_file_map = vbscan.scan_vcf_files(vase_arg_list["donorvcf"])
- bam_file_map = vbscan.scan_bamcram_files(vase_arg_list["donorbam"])
- sample_id_list = vbscan.get_complete_sample_ids()
+ # Check if a variantfilter has been set.
+ variantfilter = None
+ if pmc.get_variant_list_location() != "":
+ variantfilter = self.read_variant_list(pmc.get_variant_list_location())
- # Exit if no valid pairs of variant/alignment files are found.
- if not sample_id_list:
- self.vaselogger.critical("No valid samples available to "
- "create new validation set")
- exit()
-
- variantfilter = None
- if pmc.get_variant_list_location() != "":
- variantfilter = self.read_variant_list(pmc.get_variant_list_location())
-
- if "C" not in pmc.runmode:
- vase_b.build_varcon_set(sample_id_list,
- vcf_file_map, bam_file_map,
- pmc.get_acceptor_bam(),
- pmc.get_out_dir_location(),
- pmc.get_reference_file_location(),
- pmc.get_variant_context_out_location(),
- variantfilter)
- elif "C" in pmc.runmode:
- vase_b.build_donor_from_varcon(pmc.varconin,
- bam_file_map,
- pmc.get_reference_file_location())
- if "X" not in pmc.runmode:
- vase_b.build_validation_set(pmc.runmode,
- pmc.get_acceptor_bam(),
- pmc.get_first_fastq_in_location(),
- pmc.get_second_fastq_in_location(),
- pmc.get_fastq_out_location())
+ # Run the selected mode.
+ self.run_selected_mode(pmc.get_runmode(), vase_b, pmc, variantfilter, pmc.get_random_seed_value())
self.vaselogger.info("VaSeBuilder run completed succesfully.")
elapsed = time.strftime(
@@ -102,9 +84,16 @@ def main(self):
)
self.vaselogger.info(f"Elapsed time: {elapsed}.")
- # Method that creates the logger that will write the log to stdout
- # and a log file.
def start_logger(self, paramcheck, logloc, debug_mode=False):
+ """Starts and returns the logger VaSe_Logger.
+
+ The logger writes both to stdout and the specified logfile.
+
+ Returns
+ -------
+ vaselogger : Logger
+ Logging utility to log VaSeBuilder activity
+ """
vaselogger = logging.getLogger("VaSe_Logger")
if debug_mode:
vaselogger.setLevel(logging.DEBUG)
@@ -135,31 +124,68 @@ def start_logger(self, paramcheck, logloc, debug_mode=False):
vaselogger.addHandler(vase_file_handler)
return vaselogger
- # Returns the vase Parameters.
def get_vase_parameters(self):
+ """Creates a command line argument parser and returns the parameter values.
+
+ Returns
+ -------
+ vase_args : dict
+ Command line parameters and set values
+ """
# Set the VaSe parameters for the program.
vase_argpars = argparse.ArgumentParser()
- vase_argpars.add_argument("-m", "--runmode", dest="runmode", default="F", choices=self.valid_runmodes, help="RUNMODE HELP")
- vase_argpars.add_argument("-v", "--donorvcf", dest="donorvcf", help="File containing a list of VCF/VCF.GZ/BCF files.")
+ vase_argpars.add_argument("-m", "--runmode", dest="runmode", default="F", choices=self.valid_runmodes,
+ help="RUNMODE HELP")
+ vase_argpars.add_argument("-v", "--donorvcf", dest="donorvcf",
+ help="File containing a list of VCF/VCF.GZ/BCF files.")
vase_argpars.add_argument("-b", "--donorbam", dest="donorbam", help="File containing a list of BAM/CRAM files.")
- vase_argpars.add_argument("-a", "--acceptorbam", dest="acceptorbam", help="BAM file for identifying acceptor reads to exclude.")
- vase_argpars.add_argument("-1", "--templatefq1", dest="templatefq1", nargs="*", help="Location and name of the first fastq in file.")
- vase_argpars.add_argument("-2", "--templatefq2", dest="templatefq2", nargs="*", help="Location and name of the second fastq in file.")
+ vase_argpars.add_argument("-a", "--acceptorbam", dest="acceptorbam",
+ help="BAM file for identifying acceptor reads to exclude.")
+ vase_argpars.add_argument("-1", "--templatefq1", dest="templatefq1", nargs="*",
+ help="Location and name of the first fastq in file.")
+ vase_argpars.add_argument("-2", "--templatefq2", dest="templatefq2", nargs="*",
+ help="Location and name of the second fastq in file.")
vase_argpars.add_argument("-o", "--out", dest="out", help="Directory to write output files to.")
- vase_argpars.add_argument("-r", "--reference", dest="reference", help="Location of the reference genome. This reference genome should be used by all VCF/BCF and BAM/CRAM files.")
- vase_argpars.add_argument("-of", "--fastqout", dest="fastqout", help="Name for the two FastQ files to be produced.")
- vase_argpars.add_argument("-ov", "--varcon", dest="varcon", help="File name to write variants and their contexts to.")
- vase_argpars.add_argument("-l", "--log", dest="log", help="Location to write log files to (will write to working directory if not used).")
- vase_argpars.add_argument("-!", "--debug", dest="debug", action="store_true", help="Run the program in debug mode")
- vase_argpars.add_argument("-vl", "--variantlist", dest="variantlist", help="File containing a list of variants to use. Will only use these variants if provided. Will use all variants if no list is provided.")
- vase_argpars.add_argument("-iv", "--varconin", dest="varconin", help="Provide a Vasebuilder output variant context file to build a validation set.")
- vase_argpars.add_argument("-dq", "--donorfastqs", dest="donorfastqs", help="Location to donor fastq list file")
+ vase_argpars.add_argument("-r", "--reference", dest="reference",
+ help="Location of the reference genome. This reference genome should be used by all"
+ "VCF/BCF and BAM/CRAM files.")
+ vase_argpars.add_argument("-of", "--fastqout", dest="fastqout",
+ help="Name for the two FastQ files to be produced.")
+ vase_argpars.add_argument("-ov", "--varcon", dest="varcon",
+ help="File name to write variants and their contexts to.")
+ vase_argpars.add_argument("-l", "--log", dest="log",
+ help="Location to write log files to (will write to working directory if not used).")
+ vase_argpars.add_argument("-!", "--debug", dest="debug", action="store_true",
+ help="Run the program in debug mode")
+ vase_argpars.add_argument("-vl", "--variantlist", dest="variantlist",
+ help="File containing a list of variants to use. Will only use these variants if "
+ "provided. Will use all variants if no list is provided.")
+ vase_argpars.add_argument("-iv", "--varconin", dest="varconin",
+ help="Provide a Vasebuilder output variant context file to build a validation set.")
+ vase_argpars.add_argument("-dq", "--donorfastqs", dest="donorfastqs",
+ help="Location to donor fastq list file")
vase_argpars.add_argument("-c", "--config", dest="configfile", help="Supply a config file")
+ vase_argpars.add_argument("-s", "--seed", dest="seed", default=2, type=int,
+ help="Set seed for semi randomly distributing donor reads")
vase_args = vars(vase_argpars.parse_args())
return vase_args
- # Reads the variant list. Assumes that sampleId, chromosome and startpos are columns 1,2 and 3
def read_variant_list(self, variantlistloc):
+ """Reads a file containing genomic variants and returns them in a dictionary.
+
+ The file containing the variant is expected to have at least three columns separated by tabs. These should be,
+ in order: sample name, chromosome name, chromosomal position.
+
+ Parameters
+ ----------
+ variantlistloc : str
+ The location of the file containing variants
+
+ Returns
+ -------
+ dict
+ Read variants per sample name
+ """
variant_filter_list = {}
try:
with open(variantlistloc) as variantlistfile:
@@ -174,8 +200,20 @@ def read_variant_list(self, variantlistloc):
finally:
return variant_filter_list
- # Reads the config file with the settings
def read_config_file(self, configfileloc):
+ """Reads a VaSeBuilder configuration file and returns the parameter values.
+
+ Parameters
+ ----------
+ configfileloc : str
+ Path to the VaSeBuilder configuration file
+
+ Returns
+ -------
+ configdata : dict
+ Read parameters and values
+ """
+ debug_param_vals = ["True", "1", "T"]
configdata = {}
try:
with open(configfileloc, "r") as configfile:
@@ -191,6 +229,11 @@ def read_config_file(self, configfileloc):
if parameter_name == "templatefq1" or parameter_name == "templatefq2":
template_files = parameter_value.split(",")
configdata[parameter_name] = [tmplfile.strip() for tmplfile in template_files]
+ # Check if the parameter is the debug parameter
+ elif parameter_name == "debug":
+ configdata[parameter_name] = parameter_value.title() in debug_param_vals
+ elif parameter_name == "seed":
+ configdata[parameter_name] = int(parameter_value)
else:
configdata[parameter_name] = parameter_value.strip()
except IOError:
@@ -199,6 +242,19 @@ def read_config_file(self, configfileloc):
# Reads a list of donor fastq files in the format (R1.fq\tR2.fq)
def read_donor_fastq_list_file(self, donorfq_listfileloc):
+ """Reads a file with a list of donor fastq files
+
+ The donor fastq list file is expected to have two columns. The first column should contain the paths to R1 files
+ and the second column paths to R2 files. For each sample two fastq files are expected,
+
+ Parameters
+ ----------
+ donorfq_listfileloc : str
+ Path to the donor fastq list file
+
+ Returns
+ -------
+ """
donor_fastqs = []
try:
with open(donorfq_listfileloc) as donorfqlistfile:
@@ -209,6 +265,72 @@ def read_donor_fastq_list_file(self, donorfq_listfileloc):
finally:
return donor_fastqs
+ def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter, randomseed):
+ """Selects and runs the selected run mode.
+
+ Depending on the specified run mode either a full validation set of fastq files or fastq files containing only
+ donor data are produced. If the run mode contains a 'C' an existing variant context file will be read,
+ otherwise it will be build.
+
+ Parameters
+ ----------
+ runmode : str
+
+ vaseb : VaSeBuilder
+ VaSeBuilder object that will perform the actions
+ paramcheck : ParamChecker
+ Utility tha checks whether the parameters are ok
+ variantfilter
+ """
+ vbscan = VcfBamScanner()
+ varconfile = None # Declare the varconfile variable so we can use it in C and no-C.
+
+ if "X" in runmode:
+ vcf_file_map = vbscan.scan_vcf_files(paramcheck.get_valid_vcf_filelist())
+ bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist())
+ sample_id_list = vbscan.get_complete_sample_ids()
+ vaseb.run_x_mode(sample_id_list, vcf_file_map, bam_file_map, paramcheck.get_acceptor_bam(),
+ paramcheck.get_reference_file_location(), paramcheck.get_out_dir_location(),
+ paramcheck.get_variant_context_out_location(), variantfilter)
+ else:
+ if "C" in runmode:
+ # Read an existing variant context file
+ varconfile = VariantContextFile(paramcheck.get_variantcontext_infile())
+ if "A" in runmode:
+ donor_fastq_files = self.read_donor_fastq_list_file(paramcheck.get_donorfqlist())
+ vaseb.run_ac_mode_v2(paramcheck.get_first_fastq_in_location(),
+ paramcheck.get_second_fastq_in_location(),
+ donor_fastq_files, varconfile, randomseed, paramcheck.get_fastq_out_location())
+ return
+ # Refetch the donor reads required when runmode (D,F,P) contains a 'C'
+ bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist())
+ vaseb.refetch_donor_reads(varconfile, bam_file_map, paramcheck.get_reference_file_location())
+ else:
+ # Scan the variant and alignment files in the provided lists.
+ vcf_file_map = vbscan.scan_vcf_files(paramcheck.get_valid_vcf_filelist())
+ bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist())
+ sample_id_list = vbscan.get_complete_sample_ids()
+ # varconfile = vaseb.build_varcon_set(sample_id_list, vcf_file_map, bam_file_map,
+ # paramcheck.get_acceptor_bam(), paramcheck.get_out_dir_location(),
+ # paramcheck.get_reference_file_location(),
+ # paramcheck.get_variant_context_out_location(), variantfilter)
+ varconfile = vaseb.bvcs(sample_id_list, vcf_file_map, bam_file_map, paramcheck.get_acceptor_bam(),
+ paramcheck.get_out_dir_location(), paramcheck.get_reference_file_location(),
+ paramcheck.get_variant_context_out_location(), variantfilter)
+
+ # Check whether contexts were created
+ if varconfile is None:
+ return
+
+ # Check for modes D,F,P
+ if "D" in runmode:
+ vaseb.run_d_mode(varconfile, paramcheck.get_fastq_out_location())
+ if "F" in runmode:
+ vaseb.run_f_mode(varconfile, paramcheck.get_first_fastq_in_location(),
+ paramcheck.get_second_fastq_in_location(), paramcheck.get_fastq_out_location())
+ if "P" in runmode:
+ vaseb.run_p_mode(varconfile, paramcheck.get_out_dir_location(), paramcheck.get_fastq_out_location())
+
# Run the program.
vaseb = VaSe()