diff --git a/DonorBamRead.py b/DonorBamRead.py index 5a9d38d..b501ca1 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -2,88 +2,312 @@ class DonorBamRead: - # Saves the required BAM read data. - def __init__(self, readid, readpn, readchrom, readstart, - readlen, readseq, readquals, mapqual): + """The DonorBamRead is used to save data from an aligned read extracted from a BAM or CRAM file. + + DonorBamRead can be used to save the data from aligned reads that were extracted from a BAM or CRAM file. + + Attributes + ---------- + bam_read_id : str + Read identifier + bam_read_flag : int + The read bitwise flag value + bam_read_pairnum : str + Read pair number (1 or 2) + bam_read_chrom : str + Chromosome name the read is located on + bam_read_ref_pos : int + Leftmost genomic position of the read + bam_read_length : int + Read length + bam_read_end_pos : int + Read rightmost position (determined by alignment) + bam_read_cigar : str + Read CIGAR string + bam_read_rnext : str + Chromosome name of the read mate + bam_read_pnext : int + Leftmost genomic position of the read mate + bam_read_tlen : int + Read TLEN value + bam_read_seq : str + Read sequence + bam_read_qual : str + Read ASCII quality + bam_read_map_qual : int + Read MAPQ value + """ + + def __init__(self, readid, readflag, readpn, readchrom, readstart, readlen, readend, readcigar, readrnext, + readpnext, readtlen, readseq, readquals, readmapq): + """Saves all required data of an aligned read from a BAM or CRAM file. + + Parameters + ---------- + readid : str + The read identifier + readflag : str + Aligned read bitwise flag + readpn : str + The read pair number + readchrom : str + The chromosome name the read is located on + readstart : int + The leftmost genomic position of the mapped read + readlen : int + The total length of the read + readcigar : str + Aligned read CIGAR string + readrnext : str + Chromsome name the read mate is located on + readpnext : int + Leftmost genomic position of the read mate + readtlen : int + Aligned read TLEN value + readseq : str + The read sequence + reqdquals : str + The qualities of the read in ascii + mapqual : int + The MAPQ mapping quality + """ self.bam_read_id = readid + self.bam_read_flag = readflag self.bam_read_pairnum = readpn self.bam_read_chrom = readchrom self.bam_read_ref_pos = readstart self.bam_read_length = readlen + self.bam_read_end_pos = readend + self.bam_read_cigar = readcigar + self.bam_read_rnext = readrnext + self.bam_read_pnext = readpnext + self.bam_read_tlen = readtlen self.bam_read_seq = readseq self.bam_read_qual = readquals - self.bam_read_map_qual = mapqual + self.bam_read_map_qual = readmapq # ===METHODS TO GET SAVED DATA FROM THE DONORBAMREAD======================= - # Returns the BAM read identifier. def get_bam_read_id(self): + """Returns the identifier of the read. + + Returns + ------- + self.bam_read_id : str + The read identifier + """ return self.bam_read_id - # Returns the BAM read pair number (1 or 2). + def get_bam_read_flag(self): + """Returns the read alignment flag. + + Returns + ------- + self.bam_read_flag : str + Read alignment flag + """ + def get_bam_read_pair_number(self): + """Returns the pair number of the read. + This is 1 for forward, or R1, reads and 2 for reverse, or R2, reads. + + Returns + ------- + self.bam_read_pairnum : str + The read pair number + """ return self.bam_read_pairnum - # Returns the BAM read chromosome. def get_bam_read_chrom(self): + """Returns the name of the chromosome where the read has been mapped. + + Returns + ------- + self.bam_read_chrom : str + The chromosome name where the read is mapped + """ return self.bam_read_chrom - # Returns the BAM read starting position on the reference sequence. def get_bam_read_ref_pos(self): + """Returns the genomic mapping position (the left most position) of the read. + + Returns + ------- + self.bam_read_ref_pos : int + The read leftmost genomic position + """ return self.bam_read_ref_pos - # Returns the BAM read length. def get_bam_read_length(self): + """Returns the length of the read. In VaSeBuilder this length is calculated by means of the CIGAR string by + using the pysam method infer_read_length() + + Returns + ------- + self.bam_read_length : int + The length of the read + """ return self.bam_read_length - # Returns the BAM read ending position on the reference (calculated - # as starting position + the length of the read). def get_bam_read_ref_end(self): - if self.bam_read_length is not None: - return self.bam_read_ref_pos + self.bam_read_length - return -1 + """Returns the rightmost genomic position of the read. + + Returns + ------- + self.bam_read_end_pos : int + The rightmost position of the read + """ + return self.bam_read_end_pos + + def get_bam_read_cigar(self): + """Returns the read alignment CIGAR string + + Returns + ------- + self.bam_read_cigar : str + Read alignment CIGAR string + """ + return self.bam_read_cigar + + def get_bam_read_rnext(self): + """Returns the chromosome name the read mate is located on. + + Returns + ------- + self.bam_read_rnext : str + Chromosome name the read mate is located on + """ + return self.bam_read_rnext + + def get_bam_read_pnext(self): + """Returns the leftmost genomic position of the read mate. + + Returns + ------- + self.bam_read_pnext : ibnt + Leftmost genomic position of the read mate + """ + return self.bam_read_pnext + + def get_bam_read_tlen(self): + """Returns the read alignment TLEN value. + + Returns + ------- + self.bam_read_tlen : int + Read alignment TLEN value + """ - # Returns the BAM read sequence. def get_bam_read_sequence(self): + """Returns the read sequence as a String. + + Returns + ------- + self.bam_read_seq : str + The sequence of the read + """ return self.bam_read_seq - # Returns the BAM read quality scores. def get_bam_read_qual(self): + """Returns the read quality. + + Returns + ------- + self.bam_read_qual : str + Read quality + """ return self.bam_read_qual - # Returns the BAM read quality as an array of Q-Scores. def get_bam_read_q_scores(self): + """Converts the String of ASCII quality scores to a list Q-Scores. The Q-Score for each ASCII quality character + are calculated by obtaining the unicode code point and subtracting 33. + + Returns + ------- + qscores : list of int + Q-Score of the read + """ qscores = [] for qualSymbol in self.bam_read_qual: qscores.append(ord(qualSymbol)-33) return qscores - # Returns the maping quality of the BAM read. def get_mapping_qual(self): + """Returns the mapping quality that was assigned to the read. + + Returns + ------- + self.bam_read_map_qual : int + The mapping quality (MAPQ) + """ return self.bam_read_map_qual + def read_has_hardclipped_bases(self): + """Returns whether the read has hardclipped bases. + + The CIGAR string of the read is checked whether it contains an 'H' (Hardclipped bases). + + Returns + ------- + bool + True if read CIGAR contains 'H', False if not + """ + if self.bam_read_cigar is not None: + return "H" in self.bam_read_cigar + return False + # ===METHOD TO GET STATISTICS DATA FROM THE DONORBAMREAD=================== - # Returns the average Q-Score. def get_average_qscore(self): + """Calculates and returns the mean Q-Score of the read. + + Returns + ------- + float + The mean Q-Score + """ qscores = self.get_bam_read_q_scores() return statistics.mean(qscores) - # Returns the median Q-Score. def get_median_qscore(self): + """Calculates and returns the median Q-Score of the read. + + Returns + ------- + int + The median Q-Score of the read + """ qscores = self.get_bam_read_q_scores() return statistics.median(qscores) # ===METHODS TO CHECK WHETHER THE BAM READ IS R1 OR R2===================== - # Returns if the BAM read is the first (forward) read. def is_read1(self): + """Returns whether the read is the forward/R1 read by checking if the pair number is set to '1'. + + Returns + ------- + bool + True if the read has pair number 1, otherwise False + """ return self.bam_read_pairnum == "1" - # Returns if the BAM read is the second (reverse) read. def is_read2(self): + """Returns whether the read is the reverse/R2 read by checking if the pair number is set to '2'. + + Returns + ------- + bool + True if the read has pair number 2, otherwise False + """ return self.bam_read_pairnum == "2" # ===METHODS TO RETURN A STRING REPRESENTATION OF THE DONORBAMREAD OBJECT== - # Returns a String representation. def to_string(self): + """Returns a String with all the saved data, separated by tabs, of the read. + + Returns + ------- + str + String representation of the read. + """ return (str(self.bam_read_id) + "\t" + str(self.bam_read_pairnum) + "\t" + str(self.bam_read_chrom) + "\t" @@ -93,8 +317,17 @@ def to_string(self): + str(self.bam_read_qual) + "\t" + str(self.bam_read_map_qual)) - # Returns the BAM read as a fastq sequence. def get_as_fastq_seq(self, addpairnum=False): + """Returns the read as a FastQ entry. + + If the 'addpairnum' argument is set to True the read pair number will be added at the end of the read + identifier as '/1' or '/2', depending on whether the read is the forward/R1 or reverse/R2 read. + + Returns + ------- + str + The read as FastQ entry + """ if addpairnum: return ("@" + str(self.bam_read_id) + "/" + str(self.bam_read_pairnum) + "\n" + str(self.bam_read_seq) + "\n" @@ -105,3 +338,16 @@ def get_as_fastq_seq(self, addpairnum=False): + str(self.bam_read_seq) + "\n" + "+\n" + str(self.bam_read_qual) + "\n") + + def get_as_bam_read(self): + """Returns the read as a SAM/BAM/CRAM file entry. + + Returns + ------- + bamcram_entry : str + Read as SAM/BAM/CRAM entry + """ + bamcram_entry = f"{self.bam_read_id}\t{self.bam_read_flag}\t{self.bam_read_chrom}\t{self.bam_read_ref_pos}" \ + f"\t{self.bam_read_map_qual}t{self.bam_read_cigar}\t{self.bam_read_rnext}\t{self.bam_read_pnext}" \ + f"\t{self.bam_read_tlen}\t{self.bam_read_seq}\t{self.bam_read_qual}" + return bamcram_entry diff --git a/OverlapContext.py b/OverlapContext.py index a65e106..e0863ca 100644 --- a/OverlapContext.py +++ b/OverlapContext.py @@ -2,10 +2,52 @@ class OverlapContext: - # Saves the data associated with the context overlapping with the - # variant. + """The OverlapContext saves an acceptor or donor context. + + The OverlapContext can be used to save an acceptor or donor context with reads that overlap with the variant + directly. + + Attributes + ---------- + context_id : str + Identifier of the context + sample_id : str + Sample name/identifier the context was derived from + context_chrom : str + The chromosome name that the context is located on + context_origin : int + The variant genomic position that constructed the context + context_start : int + The leftmost genomic position of the context + context_end : int + The rightmost genomic position of the context + context_bam_reads : list of DonorBamRead + List of aligned reads + unmapped_read_mate_ids : list of str + List of read identifiers that have unmapped mates + """ + def __init__(self, variantid, sampleid, ovconchrom, ovconorigin, ovconstart, ovconend, bamreads): + """Saves the required data for an acceptor or donor context. + + Parameters + ---------- + variantid : str + Variant context identifier + sampleid : str + Sample name/identifier + ovconchrom : str + Chromosome name the context is located on + ovconorigin : int + Variant genomic position the context is constructed from + ovconstart : int + Leftmost genomic position of the context + ovconend : int + Rightmost genomic position of the context + bamreads : list of DonorBamRead + Reads and read mates overlapping with the context + """ self.context_id = variantid self.sample_id = sampleid self.context_chrom = ovconchrom @@ -16,135 +58,307 @@ def __init__(self, variantid, sampleid, ovconchrom, ovconorigin, self.unmapped_read_mate_ids = [] # ===METHODS TO GET DATA OF THE OVERLAP CONTEXT============================ - # Returns the context identifier. + def get_context(self): + """Returns the essential data of the context. + + This data consists of the chromosome name the context is located on, the variant position the context is based + on and the leftmost (start) and rightmost (end) genomic position of the context. + + Returns + ------- + list + List of essential context data + """ + return [self.context_chrom, self.context_origin, self.context_start, self.context_end] + def get_context_id(self): + """Returns the context identifier. + + Returns + ------- + self.context_id : str + The context identifier + """ return self.context_id - # Returns the sample identifier the context was based on. def get_sample_id(self): + """Returns the identifier of the sample the context is + + Returns + ------- + self.sample_id : str + The sample name/identifier + """ return self.sample_id - # Returns the context chromosome name the context is located on. def get_context_chrom(self): + """Returns the chromosome name that the context is located on. + + Returns + ------- + self.context_chrom : str + The context chromosome name + """ return self.context_chrom - # Returns the origin position of the context. def get_context_origin(self): + """Returns the the position of the variant that was used to construct the context. + + Returns + ------- + self.context_origin : int + Variant position the context is based on + """ return self.context_origin - # Returns the context start position. def get_context_start(self): + """Returns the left most genomic position of the context. + + Returns + ------- + self.context_start : int + Context starting position + """ return self.context_start - # Returns the end position of the context. def get_context_end(self): + """Returns the right most genomic position of the context. + + Returns + ------- + self.context_end : int + Context ending position + """ return self.context_end - # Returns the bam reads associated with the context as a list of - # BamRead objects. def get_context_bam_reads(self): + """Returns the list of reads and their mates overlapping with the context. + + Mates of reads overlapping with the context might not overlap with the context themselves. Each read is returned + as a DonorBamRead object.""" return self.context_bam_reads - # Returns the list of BAM read ids that have an unmapped mate. def get_unmapped_read_mate_ids(self): + """Returns a list with identifiers of reads that have an unmapped mate. + + Returns + ------- + self.unmapped_read_mate_ids : list of str + List with read identifiers that have unmapped mates + """ return self.unmapped_read_mate_ids - # ===METHODS TO GET DATA (REQUIRING SOME CALCULATION) FROM THE============= - # ===OVERLAP CONTEXT=== - # Returns the lengt of the context. + # ===METHODS TO GET DATA (REQUIRING SOME CALCULATION) FROM THE OVERLAP CONTEXT============= def get_context_length(self): + """Subtracts the left most genomic position of the context from the right most and returns the difference. + + Returns + ------- + int + Context length + """ return abs(self.context_end - self.context_start) - # Returns the distance of the context start from the context origin. def get_start_distance_from_origin(self): + """Subtracts the left most position of the context from the variant position and returns the difference. + + Returns + ------- + int + Context start distance from variant position + """ return abs(self.context_origin - self.context_start) - # Returns the distance of the context end from the context origin. def get_end_distance_from_origin(self): + """Subtracts the variant position from the rightmost position of the context and returns the difference. + + Returns + ------- + int + Context end distance form variant position + """ return abs(self.context_end - self.context_origin) # ===METHODS TO OBTAIN CONTEXT READ INFORMATION============================ - # Returns the number of saved context reads. def get_number_of_context_reads(self): + """Returns the number of reads associated with the context. + + Returns + ------- + int + Number of context reads + """ if self.context_bam_reads is None: return 0 return len(self.context_bam_reads) - # Returns a list of BAM read identifiers in the current context. def get_context_bam_read_ids(self): + """Returns a list of the identifiers of reads associated with the context. + + Returns + ------- + list of str + Identifiers of all reads + """ if self.context_bam_reads is None: return [None] return list(set([x.get_bam_read_id() for x in self.context_bam_reads])) - # Returns a list of all left positions for all BAM reads. def get_context_bam_read_starts(self): + """Returns a list of all let most position of all reads associated with the context. + + Returns + ------- + list of int or None + Leftmost positions of all reads + """ if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_pos() for x in self.context_bam_reads] - # Returns a list of all left positions for all R1 BAM reads. def get_context_bam_read_left_positions(self): + """Returns a list with the left most positions for all forward/R1 reads associated with the context. + + Returns + ------- + list of int or None + Leftmost genomic positions of all R1 reads + """ if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_pos() for x in self.context_bam_reads if (x.is_read1())] - # Returns a list of BAM read ending positions for all BAM reads. def get_context_bam_read_ends(self): + """Returns a list with the rightmost genomic positions of all reads associated with the context. + + Returns + ------- + list of int or None + Rightmost genomic positions of all reads + """ if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_end() for x in self.context_bam_reads] - # Returns a list of all right positions for all R2 BAM reads. def get_context_bam_read_right_positions(self): + """Returns a list with the rightmost genomic positions for all reverse/R2 reads associated with the context. + + Returns + ------- + list of int + Rightmost genomic positions of all reverse/R2 reads + """ if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_end() for x in self.context_bam_reads if (x.is_read2())] - # Returns a list of all lengths for all BAM reads. def get_context_bam_read_lengths(self): + """Returns a list with the lengths of all reads associated with the context. + + Returns + ------- + list of int + Lengths of all reads + """ return [x.get_bam_read_length() for x in self.context_bam_reads] - # Returns a list of BAM read sequences in the current context. def get_context_bam_read_seqs(self): + """Returns a list with the sequences of all reads associated with the context. + + Returns + ------- + list of str + Sequences of all reads + """ return [x.get_bam_read_sequence() for x in self.context_bam_reads] - # Returns a list of qualities of all BAM reads. def get_context_bam_read_qualities(self): + """Returns a list with the qualities lines of all reads associated with the context. + + Return + ------ + list of str + Quality line for all reads + """ return [x.get_bam_read_qual() for x in self.context_bam_reads] - # Returns a list of Q-scores of all BAM reads. def get_context_bam_read_q_scores(self): + """Returns a list with the Q-Scores of all reads associated with the context. + + The Q-Scores for each read is an array of integers. The returned list is therefore a list with lists. + + + Returns + ------- + list + Lists of Q-Scores for all reads + """ return [x.get_bam_read_q_scores() for x in self.context_bam_reads] - # Returns a list of all BAM read MapQ values. def get_context_bam_read_map_qs(self): + """Collects and returns the list of MAPQ values of all reads. + + Returns + ------- + list of int + List with MAPQ values + """ return [x.get_mapping_qual() for x in self.context_bam_reads] - # Returns whether a BAM read is in the context based on the provided - # read identifier. def read_is_in_context(self, readid): + """Checks whether a read specified by an id is associated with the context. + + Returns + ------- + bool + True if the read in the context, False if not + """ if self.context_bam_reads is None: return False return readid in self.get_context_bam_read_ids() # ===METHODS TO ADD/SET CONTEXT DATA======================================= - # Adds the read id of a BAM read with an unmapped mate. def add_unmapped_mate_id(self, ureadid): + """Adds the identifier of a read with an unmapped mate to the context unmapped mates list. + + Parameters + ------- + ureadid : str + Identifier of the read with an unmapped mate + """ self.unmapped_read_mate_ids.append(ureadid) - # Sets the list of unmapped read mate ids. def set_unmapped_mate_ids(self, mateids): + """Sets the provided read id list as the unmapped mate id list associated with the context. + + Parameters + ---------- + mateids : list of str + Read identifiers with unmapped mates + """ self.unmapped_read_mate_ids = mateids - # Returns whether a BAM read in the context has an unmapped mate. def read_has_unmapped_mate(self, readid): + """Checks if a read specified by a read identifier has an unmapped mate and returns True or False. + + Returns + ------- + bool + True if read has unmapped mate, False if not + """ return readid in self.unmapped_read_mate_ids # ===STATISTICS METHODS FOR A VARIANT CONTEXT============================== - # Returns the average and median read length. def get_average_and_median_read_length(self): + """Calculates and return the mean and median red length of all reads associated with the context. + + Returns + ------- + list of int + Mean and median read length + """ if self.context_bam_reads is None: return [None, None] avgmedlen = [] @@ -153,8 +367,14 @@ def get_average_and_median_read_length(self): avgmedlen.append(contextread.get_bam_read_length()) return [statistics.mean(avgmedlen), statistics.median(avgmedlen)] - # Returns the average and median read quality. def get_average_and_median_read_qual(self): + """Calculates and returns the mean and median Q-Score of all reads associated with the context. + + Returns + ------- + list of int + Mean and median read Q-Score + """ if self.context_bam_reads is None: return [None, None] avgmedqual = [] @@ -162,8 +382,14 @@ def get_average_and_median_read_qual(self): avgmedqual.append(contextread.get_average_qscore()) return [statistics.mean(avgmedqual), statistics.median(avgmedqual)] - # Returns the average and median read MapQ of this variant context. def get_average_and_median_read_map_q(self): + """Calculates the mean and median MAPQ value of all reads associated with the context. + + Returns + ------- + list of int + Mean and median read MAPQ + """ if self.context_bam_reads is None: return [None, None] avgmedmapq = [] @@ -172,8 +398,14 @@ def get_average_and_median_read_map_q(self): return [statistics.mean(avgmedmapq), statistics.median(avgmedmapq)] # ===SOME OTHER METHODS==================================================== - # Returns a string representation of the overlap context. def to_string(self): + """Assembles and returns a String representation of the context. + + Returns + ------- + str + String representation of the context + """ if self.context_bam_reads is None: bamids = None countbamreads = 0 @@ -189,8 +421,14 @@ def to_string(self): + str(countbamreads) + "\t" + str(bamids)) - # Returns a statistics string representation of the overlap context. def to_statistics_string(self): + """Calculates some basic statistics of the context in a tab separated String. + + Returns + ------- + str + Line with statistics to write to file + """ avgmedlens = self.get_average_and_median_read_length() avgmedquals = self.get_average_and_median_read_qual() avgmedmapq = self.get_average_and_median_read_map_q() @@ -198,9 +436,23 @@ def to_statistics_string(self): f"{avgmedquals[0]}\t{avgmedquals[1]}\t{avgmedmapq[0]}\t" f"{avgmedmapq[1]}") - # Compares the current OverlapContext to another OverlapContext and - # returns the differences. def compare(self, other_overlap_context): + """Compares the current context to a provided context and returns the differences. + + The context is compared to another context on each aspect. If they differ, the difference is saved in a + dictionary. The key is a numeric value, the difference an array. The first entry is the value of the current + context, the second entry is the value of the other context. + + Parameters + ---------- + other_overlap_context : OverlapContext + Acceptor/Donor context to compare this acceptor/donor context with + + Returns + ------- + differences : dict + Set of differences between the compared acceptor/donor contexts. + """ differences = {} if self.context_id != other_overlap_context.get_context_id(): differences[1] = [self.context_id, diff --git a/ParamChecker.py b/ParamChecker.py index 3b2d568..e272c92 100644 --- a/ParamChecker.py +++ b/ParamChecker.py @@ -4,11 +4,46 @@ class ParamChecker: - # Constructor that creates two empty arrays that + """Checks and saves the values of provided parameters by the user. + + Attributes + ---------- + vaselogger : Logger + Logs VaSeBuilder activity + vcf_filelist : str + Path to the variant list file + bam_filelist : str + Path to the alignment list file + acceptorbam : str + Path to the alignment file to use as acceptor + fastq_in1 : list of str + Paths to R1 fastq files to us as template + fastq_in2 : list of str + Paths to R2 fastq files to use as template + outdir : str + Path to directory to write output files to + reference_file : str + Path to the genome reference in fasta format + fastq_out_location : str + Path to write the validation fastq files to + varcon_out_location : str + Output name for the variant context file + log_location : str + Output path to write the log file to + variantlist_location : str + runmode : str + The mode to run VaSeBuilder in + varconin : str + Path to a variant context input file + donorfqlist : str + required_mode_parameters : dict + Contains the required parameters for each runmode + """ + def __init__(self): self.vaselogger = logging.getLogger("VaSe_Logger") - self.vcf_filelist = [] - self.bam_filelist = [] + self.vcf_filelist = "" + self.bam_filelist = "" self.acceptorbam = "" self.fastq_in1 = "" self.fastq_in2 = "" @@ -21,9 +56,8 @@ def __init__(self): self.runmode = "" self.varconin = "" self.donorfqlist = "" - self.required_mode_parameters = {"A": ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin", - "out"], - "AC": ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin", + self.random_seed = 2 + self.required_mode_parameters = {"AC": ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin", "out"], "D": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"], "DC": ["runmode", "donorvcf", "donorbam", "out", "reference", @@ -35,13 +69,24 @@ def __init__(self): "P": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"], "PC": ["runmode", "donorvcf", "donorbam", "out", "reference", "varconin"], - "X": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"], - "XC": ["runmode", "donorvcf", "donorbam", "out", "reference", - "varconin"]} - self.optional_parameters = ["fastqout", "varcon", "variantlist"] + "X": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"]} + self.optional_parameters = ["fastqout", "varcon", "variantlist", "seed"] - # Checks whether the required parameters for a run mode have been set. def required_parameters_set(self, runmode, vase_arg_vals): + """Checks and returns whether all required run mode parameters have been set. + + Parameters + ---------- + runmode : str + Runmode to check parameters for + vase_arg_vals : dict + Dictionary with parameters values + + Returns + ------- + bool + True if all parameters are correct, False if not + """ if runmode in self.required_mode_parameters: for reqparam in self.required_mode_parameters[runmode]: if vase_arg_vals[reqparam] is None: @@ -53,15 +98,37 @@ def required_parameters_set(self, runmode, vase_arg_vals): self.vaselogger.debug("Invalid run mode selected.") return False - # Checks whether the optional parameters have been set or not. If not, they will be set to None def optional_parameters_set(self, vase_arg_vals): + """Checks and sets optional parameters if they have not been set. + + Parameters + ---------- + vase_arg_vals: + Dictionary with parameter values + + Returns + ------- + vase_arg_vals : dict + Updated dictionary with parameter values + """ for optparam in self.optional_parameters: if optparam not in vase_arg_vals: vase_arg_vals[optparam] = None return vase_arg_vals - # Check the logging parameter to determine where to write the logfile to. def check_log(self, logparam): + """Checks the log parameter value and returns a proper output location to write the log file to. + + Parameters + ---------- + logparam : str + The parameter value for the log file + + Returns + ------- + self.log_location : str + Output path to write the log file to + """ logloc = "VaSeBuilder.log" if logparam is not None: @@ -77,8 +144,24 @@ def check_log(self, logparam): self.log_location = logloc return self.log_location - # Checks whether provided folders exist. def check_folders_exist(self, paramvals, file_exts): + """Checks and returns whether folder containing specified file types exist. + + For each provided folder, it checks whether the folder contains at least one file with the specified file type. + If so, the folder is added to a list. + + Parameters + ---------- + paramvals : list of str + Folders to check for containing specified file type + file_exts : str + Extension of file types to check for + + Returns + ------- + existing_folders : list of str + Folders containing files with the specified extension + """ existing_folders = [] # Check whether the provided folders exist. @@ -103,8 +186,20 @@ def check_folders_exist(self, paramvals, file_exts): existing_folders.append(foldername) return existing_folders - # Checks whether at least one file with a provided extension (.vcf or .bam) is present. def check_folder_contents(self, folder_to_check, file_exts): + """Counts and returns the number of files with a specified extension in a specified folder. + + Parameters + ---------- + folder_to_check: + Folder to check for files with + file_exts: + File extension to check files on + Returns + ------- + vb_count : int + Number of specified files in folder + """ vb_count = 0 for vbfile in os.listdir(folder_to_check): if vbfile.endswith(file_exts): @@ -113,8 +208,19 @@ def check_folder_contents(self, folder_to_check, file_exts): f"{file_exts[0][1:4].upper()} files") return vb_count - # Checks whether a provided file exists. def check_file_exists(self, fileloc): + """Checks and returns whether one or more files exist. + + Parameters + ---------- + fileloc : str or list of str + Path to file or list of paths to files + + Returns + ------- + bool + True if file(s) exists. + """ if type(fileloc) == str: fileloc = [fileloc] for this_file in fileloc: @@ -124,14 +230,36 @@ def check_file_exists(self, fileloc): self.vaselogger.debug("Files all exist.") return True - # Return the directory name of an output location. def is_valid_output_location(self, outfilename): + """Checks and returns whether the output location is valid. + + Parameters + ---------- + outfilename : str + Output file path to check + + Returns + ------- + """ return os.path.isdir(os.path.dirname(outfilename)) - # Checks whether the values of the parameters are correct (do files/folders exist for example). - # [Function should perhaps be split into smaller functions] def check_parameters(self, vase_arg_vals): - + """Checks and returns whether the set parameters are ok. + + For parameters with paths to input files, it is checked whether the file exists. For the output directory it is + checked whether the folder exists. Optional parameters, if in the parameter value set, are set to a default + value if none is given. + + Parameters + ---------- + vase_arg_vals : dict + Parameter values + + Returns + ------- + bool + True if set parameters are correct, False if not + """ # Loop over the provided parameters. for param in vase_arg_vals: self.vaselogger.debug(f"Checking parameter {param}") @@ -217,7 +345,7 @@ def check_parameters(self, vase_arg_vals): self.vaselogger.warning("Variant list parameter used but supplied variant list file " f"{vase_arg_vals[param]} does not exist") - # Checks if the provided donor fastq list file. + # Checks if the provided donor fastq list file exists. if param == "donorfastqs": if vase_arg_vals[param] is not None: if os.path.isfile(vase_arg_vals[param]): @@ -225,10 +353,36 @@ def check_parameters(self, vase_arg_vals): else: self.vaselogger.warning("Donor fastqs parameter used but supplied donorfastq list file" f"{vase_arg_vals[param]} does not exist") + + # Checks if the provided seed value is an integer or float + if param == "seed": + if vase_arg_vals[param] is not None: + if type(vase_arg_vals[param]) is int or type(vase_arg_vals[param]) is float: + self.random_seed = vase_arg_vals[param] + else: + self.random_seed = 2 return True # Only checks whether the required runmode parameters are ok using 'check_parameters()' def check_required_runmode_parameters(self, runmode, vaseargvals): + """Checks and returns whether all required run modes parameters are set and correct. + + The parameter value set is first subsetted to only include all required and optional parameters. For run mode + 'X', the 'templatefq1' parameter is not required and would therefore be filtered out by this method even if it + has been set. + + Parameters + ---------- + runmode : str + Selected run mode + vaseargvals : dict + Parameter values + + Returns + ------- + bool + True if all required parameters are ok, False if not + """ if runmode in self.required_mode_parameters: required_params = self.required_mode_parameters[runmode] @@ -239,14 +393,46 @@ def check_required_runmode_parameters(self, runmode, vaseargvals): return self.check_parameters(filtered_vaseargvals) return False - # Returns the name of the folder name of a parameter value (if the parameter value is ). def get_folder_name(self, foldername): + """Returns the name of the folder for a given path. + + If the provided path is a folder the the provided value is returned. If the provided path is a file, the path to + the parent folder is returned. + + Parameters + ---------- + foldername : str + Path to a valid folder + + Returns + ------- + str + Foldername parameter if foldername is a folder, the path to the parent folder if foldername is a file + """ if os.path.isfile(foldername) or (not os.path.isdir(foldername)): return os.path.dirname(foldername) return foldername # Returns the name of an output file (is used for parameters fastqout, varcon, donorbread and acceptorbread). def get_output_name(self, outfilename, defaultoutname): + """Checks and returns the name of an output file. + + The output name is first checked whether it is a path. If so, only the last part (the filename) is returned. If + no filename or path has been provided, the provided default output name is returned. Only the name, and not + path, is returned as the output folder to write to is determined by the 'out' parameter. + + Parameters + ---------- + outfilename : str + The output file name + defaultoutname : str + The default output name to provide if non has been set/given + + Returns + ------- + str + Returns the output name + """ if outfilename is not None: if "/" in outfilename: return outfilename.split("/")[-1] @@ -255,60 +441,221 @@ def get_output_name(self, outfilename, defaultoutname): return outfilename return defaultoutname - # Returns the list of valid VCF folders. def get_valid_vcf_filelist(self): + """Returns the location of the text file containing a list of donor variant files to use. + + Returns + ------- + self.vcf_filelist : str + Path to the variant list file + """ return self.vcf_filelist - # Returns the list of valid BAM folders. def get_valid_bam_filelist(self): + """Returns the location of the text file containing a list of donor alignment files to use. + + Returns + ------- + self.bamfile_list : str + Path to the alignment list file + """ return self.bam_filelist - # Returns the location of the acceptor BAM file location. def get_acceptor_bam(self): + """Returns the location of the BAM/CRAM file that will be used as the acceptor. + + Returns + ------- + self.acceptorbam : str + Path to the alignment file to use as acceptor + """ return self.acceptorbam - # Returns the location and name of the first (R1) fastq input file location. def get_first_fastq_in_location(self): + """Returns the paths to R1 fastq files that serve as the template for the validation set. + + Returns + ------- + self.fastq_in1 : list of str + Paths to R1 template fastq files + """ return self.fastq_in1 - # Returns the location and name of the second (R2) fastq input file location. def get_second_fastq_in_location(self): + """Returns the paths to R2 fastq files that serve as the template for the validation set. + + Returns + ------- + self.fastq_in2 : list of str + Paths to R2 template fastq files + """ return self.fastq_in2 - # Returns the location(s) and names of the two (R1 and R2) fastq input files. def get_fastq_in_locations(self): + """ + + Returns + ------- + list + """ return [self.fastq_in1, self.fastq_in2] - # Returns the location to write the output to. def get_out_dir_location(self): + """Returns the directory to write the output files. Adds '/' if not present in the ouput directory location. + + Returns + ------- + str + Path to the output directory + """ if self.outdir.endswith("/"): return self.outdir return self.outdir + "/" - # Returns the location of the reference fasta file def get_reference_file_location(self): + """Returns the location of the genomic fasta reference file. + + Returns + ------- + self.reference_file : str + Path to the genomic reference fasta file + """ return self.reference_file # Returns the location of the FastQ file that will be produced by VaSeBuilder. def get_fastq_out_location(self): + """Returns the fastq out location and suffix. + + Before returning the path and output suffix, it is checked whether the path to the outdir location ends with a + '/'. If not this is added. The path and suffix is not a full output path. The full output path is constructed + when writing the new validation fastq files. + + Returns + ------- + str + Path and suffix for the fastq out files + """ if self.outdir.endswith("/"): return self.outdir + self.fastq_out_location return self.outdir + "/" + self.fastq_out_location # Returns the location of file that will contain the variants and their context start and stops. def get_variant_context_out_location(self): + """Constructs and returns the output path for the variant context file. + + Returns + ------- + str + Path to the variant context output location + """ if self.outdir.endswith("/"): return self.outdir + self.varcon_out_location return self.outdir + "/" + self.varcon_out_location - # Retuns the location to write the log file(s) to. def get_log_file_location(self): + """Returns the location the log file will be written to. + + Returns + ------- + self.log_location : str + Path to write the log file to + """ return self.log_location - # Returns the variant list location def get_variant_list_location(self): + """Returns the path to the file containing variants to use. + + Returns + ------- + self.variantlist_location : str + Path to the variant list file + """ return self.variantlist_location - # Returns the location of the list file containing donor fastq files def get_donorfqlist(self): + """Returns the location of the list file containing donor fastq files. + + Returns + ------- + self.donorfqlist : str + Path to the donor fastq list file + """ return self.donorfqlist + + def get_runmode(self): + """Returns the value of the run mode parameter. + + Returns + ------- + self.runmode : str + The set runmode + """ + return self.runmode + + def get_random_seed_value(self): + """Returns the random seed value. + + Returns + ------- + self.random_seed : int or float + Value to use as seed for semi random distribution of donor reads of validation fastq files + """ + return self.random_seed + + def get_variantcontext_infile(self): + """Returns path to the variant context input file. + + Returns + ------- + self.varconin : str + Path to the variant context input file + """ + return self.varconin + + def get_required_runmode_parameters(self, runmode): + """Returns the required parameters for a specified run mode. + + Parameters + ---------- + runmode : str + The specified run mode + + Returns + ------- + list of str + List with the names of the required parameters + """ + if runmode.upper() in self.required_mode_parameters: + return self.required_mode_parameters[runmode.upper()] + return [] + + def get_optional_parameters(self): + """Returns the optional parameters + + Returns + ------- + optional_parameters : list of str + The list of optional program parameters + """ + return self.optional_parameters + + def filter_provided_parameters(self, runmode, vase_parameters): + """Filters and returns a parameter set with only the required and optional parameters + + Parameters + ---------- + runmode : str + The specified run mode + vase_parameters : str + Dictionary with parameter values (not used) + + Returns + ------- + filtered_parameter_set : dict + Parameter setf + """ + filtered_parameter_set = {} + runmode_reqparams = self.get_optional_parameters() + if runmode in self.required_mode_parameters: + runmode_reqparams.extend(self.required_mode_parameters[runmode]) + return filtered_parameter_set diff --git a/ReadIdObject.py b/ReadIdObject.py index b057958..58f2742 100644 --- a/ReadIdObject.py +++ b/ReadIdObject.py @@ -1,10 +1,76 @@ class ReadIdObject: + """The ReadIdObject is used to save the read identifiers. + + This class is used when a VariantContextFile is read. DonorBamRead can't be used as most data is missing. + + Attributes + ---------- + read_id : str + Identifier of the read + pair_number : str + Pair number of the read + """ def __init__(self, readid): + """Constructor saves the provided read identifier and sets the pairnumber to a default empty value.""" self.read_id = readid + self.pair_number = "" - # Returns the name of the def get_bam_read_id(self): + """Returns the identifier of the read. + + Returns + ------- + self.read_id : str + Identifier of the read + """ return self.read_id def set_bam_read_id(self, readid): + """Sets the identifier of the read. Overwrites any already existing value. + + Parameters + ---------- + readid : str + The read identifier to set + """ self.read_id = readid + + def get_bam_read_pair_number(self): + """Returns the read pair number of the read. + + Returns + ------- + self.pair_number : str + The read pair number + """ + return self.pair_number + + def set_bam_read_pair_number(self, pairnum): + """Sets the read pair number of the read. Overwrites any already existing value. + + Parameters + ---------- + pairnum : str + The pair number to set + """ + self.pair_number = pairnum + + def is_read1(self): + """Returns whether the read is the forward/R1 read. + + Returns + ------- + bool + True if read pair number is 1., False if not + """ + return self.pair_number == "1" + + def is_read2(self): + """Returns whether the read is the reverse/R2 read. + + Returns + ------- + bool + True if read pair number is 2, Fale if not + """ + return self.pair_number == "2" diff --git a/VaSeBuilder.py b/VaSeBuilder.py index f788fc7..c5489e6 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -6,15 +6,31 @@ import numpy as np import pysam import time +import random # Import VaSe specific classes. from VcfBamScanner import VcfBamScanner from DonorBamRead import DonorBamRead from VariantContextFile import VariantContextFile from VcfVariant import VcfVariant +from VariantContext import VariantContext +from OverlapContext import OverlapContext class VaSeBuilder: + """Creates the variant contexts, builds validation sets and writes the output files. + + Attributes + ---------- + vb_scanner : VcfBamScanner + Scans and extracts info from variant and alignment files + creation_id : str + Unique (uuid) identifier to identify VaSeBuilder runs + creation_time : str + The date and time of creation to identify VaSeBuilder runs + vaselogger : Logger + VaSeBuilder logger to log VaSeBuilder activity + """ # Constructor that saves the identifier, date, and time of the current run. def __init__(self, vaseid): self.vaselogger = logging.getLogger("VaSe_Logger") @@ -22,11 +38,10 @@ def __init__(self, vaseid): self.creation_id = str(vaseid) self.creation_time = datetime.now() self.vaselogger.info( - f"VaSeBuilder: {self.creation_id} ; {self.creation_time}" - ) + f"VaSeBuilder: {self.creation_id} ; {self.creation_time}" + ) - # VariantContextFile that saves the acceptor, donor, - # and variant contexts with their associated data. + # VariantContextFile that saves the acceptor, donor, and variant contexts with their associated data. self.contexts = VariantContextFile() # Dictionary used for debug messages. @@ -40,6 +55,17 @@ def __init__(self, vaseid): "car": "Gathering combined context acceptor reads", "done": "Variant complete. Processing"} + self.cigar_tuple_table = {"M": 0, + "I": 1, + "D": 2, + "N": 3, + "S": 4, + "H": 5, + "P": 6, + "-": 7, + "X": 8, + "B": 9} + # Method to print debug messages. def debug_msg(self, step, variant_id, t0=False): process = self.debug_dict[step] @@ -68,6 +94,30 @@ def build_varcon_set(self, sampleidlist, variant locus, and read IDs per variant in an external file, along with other relevant information. Optionally outputs additional statistics files. + + Parameters + ---------- + sampleidlist : list of str + Sample names/identifiers to process + vcfsamplemap : dict + Variant files per sample + bamsamplemap : dict + Alignment files per sample + acceptorbamloc : str + Path to the alignment file to use as acceptor + outpath : str + Path to folderto write outut files to + reference_loc : str + Path to genome reference fasta file + varcon_outpath : str + Path and suffix to write variant context output file to + variant_list : str + Path to file containing variants to process + + Returns + ------- + self.contexts : dict + Variant contexts per variant context identifier """ self.vaselogger.info("Begin building the variant context file.") donor_vcfs_used = [] @@ -76,9 +126,9 @@ def build_varcon_set(self, sampleidlist, # Open the acceptor bam, or exit if this fails. try: acceptorbamfile = pysam.AlignmentFile( - acceptorbamloc, - reference_filename=reference_loc - ) + acceptorbamloc, + reference_filename=reference_loc + ) except IOError: self.vaselogger.critical("Could not open acceptor BAM file. " "Exitting.") @@ -95,9 +145,9 @@ def build_varcon_set(self, sampleidlist, if sampleid in variant_list: sample_variant_filter = variant_list[sampleid] samplevariants = self.get_sample_vcf_variants( - vcfsamplemap[sampleid], - sample_variant_filter - ) + vcfsamplemap[sampleid], + sample_variant_filter + ) # Skip this sample if all of its variants got filtered out. if not samplevariants: @@ -109,9 +159,9 @@ def build_varcon_set(self, sampleidlist, # for this sample, or skip this samples if this fails. try: bamfile = pysam.AlignmentFile( - bamsamplemap[sampleid], - reference_filename=reference_loc - ) + bamsamplemap[sampleid], + reference_filename=reference_loc + ) self.vaselogger.debug("Opened BAM file " f"{bamsamplemap[sampleid]}") except IOError: @@ -135,44 +185,35 @@ def build_varcon_set(self, sampleidlist, vchr = vcfvar.get_variant_chrom() vpos = vcfvar.get_variant_pos() - # Determine whether the variant is SNP or indel, - # based on the length of its alleles. + # Determine whether the variant is SNP or indel, based on the length of its alleles. self.debug_msg("vw", variantid) vtype = vcfvar.get_variant_type() - - self.vaselogger.debug(f"Variant {variantid} " - f"determined to be a {vtype}.") + self.vaselogger.debug(f"Variant {variantid} determined to be a {vtype}.") # Determine the length of the search window # based on the length of the variant. searchwindow = self.determine_read_search_window(vtype, vcfvar) - self.vaselogger.debug( - "Search window determined to be " - f"{vchr}:{searchwindow[0]+1}-{searchwindow[1]}." - ) + self.vaselogger.debug(f"Search window determined to be {vchr}:{searchwindow[0]+1}-{searchwindow[1]}.") - # Check if this window overlaps any already in-use contexts; - # If so, skip this variant. + # Check if this window overlaps any already in-use contexts; If so, skip this variant. if self.contexts.variant_is_in_context(vtype, vchr, *searchwindow): - self.vaselogger.debug(f"Variant {variantid} is located in " - "an existing variant context.") + self.vaselogger.debug(f"Variant {variantid} is located in an existing variant context.") continue # Begin fetching reads and determining contexts. try: # === DONOR ===================================== - # Obtain all donor BAM reads at the variant - # position, as well as their mates. + # Obtain all donor BAM reads at the variant position, as well as their mates. self.debug_msg("dr", variantid) t0 = time.time() donor_context_reads = ( - self.get_variant_reads(variantid, - vchr, - *searchwindow, - bamfile, - True, - donor_unmapped) - ) + self.get_variant_reads(variantid, + vchr, + *searchwindow, + bamfile, + True, + donor_unmapped) + ) self.debug_msg("dr", variantid, t0) # If no donor reads were found at this position, @@ -181,7 +222,7 @@ def build_varcon_set(self, sampleidlist, self.vaselogger.info( f"No reads found for variant {variantid}" f" in donor {sampleid}; Skipping." - ) + ) continue # Determine the donor context based on @@ -190,7 +231,7 @@ def build_varcon_set(self, sampleidlist, t0 = time.time() donor_context = ( self.determine_context(donor_context_reads, vpos, vchr) - ) + ) self.debug_msg("dc", variantid, t0) # === ACCEPTOR ============================================ @@ -199,24 +240,24 @@ def build_varcon_set(self, sampleidlist, self.debug_msg("ar", variantid) t0 = time.time() acceptor_context_reads = ( - self.get_variant_reads(variantid, - vchr, - *searchwindow, - acceptorbamfile, - True, - acceptor_unmapped) - ) + self.get_variant_reads(variantid, + vchr, + *searchwindow, + acceptorbamfile, + True, + acceptor_unmapped) + ) self.debug_msg("ar", variantid, t0) # Only perform the following if no reads were # found in the acceptor at the variant location. if not acceptor_context_reads: self.vaselogger.warning( - f"No reads found for variant {variantid} in " - "acceptor. Acceptor and donor sequencing may " - "have been performed with different methods. " - "Proceeding anyway." - ) + f"No reads found for variant {variantid} in " + "acceptor. Acceptor and donor sequencing may " + "have been performed with different methods. " + "Proceeding anyway." + ) # Create a dummy list of reads. acceptor_context_reads = None # Temporarily set acceptor context equal to donor. @@ -233,7 +274,7 @@ def build_varcon_set(self, sampleidlist, self.determine_context(acceptor_context_reads, vpos, vchr) - ) + ) self.debug_msg("ac", variantid, t0) # === COMBINED CONTEXT ==================================== @@ -243,20 +284,20 @@ def build_varcon_set(self, sampleidlist, self.debug_msg("cc", variantid) t0 = time.time() variant_context = ( - self.determine_largest_context(vpos, - acceptor_context, - donor_context) - ) + self.determine_largest_context(vpos, + acceptor_context, + donor_context) + ) self.debug_msg("cc", variantid, t0) # If this widest context overlaps an # existing variant context, skip it. if self.contexts.context_collision(variant_context): self.vaselogger.debug( - f"Variant context {variantid} overlaps " - "with an already existing variant context; " - "Skipping." - ) + f"Variant context {variantid} overlaps " + "with an already existing variant context; " + "Skipping." + ) continue # Obtain all donor reads overlapping the @@ -264,14 +305,14 @@ def build_varcon_set(self, sampleidlist, self.debug_msg("cdr", variantid) t0 = time.time() variant_context_donor_reads = ( - self.get_variant_reads(variantid, - variant_context[0], - variant_context[2], - variant_context[3], - bamfile, - True, - varcon_unmapped_d) - ) + self.get_variant_reads(variantid, + variant_context[0], + variant_context[2], + variant_context[3], + bamfile, + True, + varcon_unmapped_d) + ) self.debug_msg("cdr", variantid, t0) # Obtain all acceptor reads overlapping the @@ -279,14 +320,14 @@ def build_varcon_set(self, sampleidlist, self.debug_msg("car", variantid) t0 = time.time() variant_context_acceptor_reads = ( - self.get_variant_reads(variantid, - variant_context[0], - variant_context[2], - variant_context[3], - acceptorbamfile, - True, - varcon_unmapped_a) - ) + self.get_variant_reads(variantid, + variant_context[0], + variant_context[2], + variant_context[3], + acceptorbamfile, + True, + varcon_unmapped_a) + ) self.debug_msg("car", variantid, t0) # If still no acceptor reads were found in the combined @@ -297,45 +338,45 @@ def build_varcon_set(self, sampleidlist, # Add the combined, donor, and acceptor contexts along # with their reads to the current list of contexts. self.contexts.add_variant_context( - variantid, - sampleid, - *variant_context, - variant_context_acceptor_reads, - variant_context_donor_reads - ) + variantid, + sampleid, + *variant_context, + variant_context_acceptor_reads, + variant_context_donor_reads + ) self.contexts.add_donor_context( - variantid, - sampleid, - *donor_context, - donor_context_reads - ) + variantid, + sampleid, + *donor_context, + donor_context_reads + ) self.contexts.add_acceptor_context( - variantid, - sampleid, - *acceptor_context, - acceptor_context_reads - ) + variantid, + sampleid, + *acceptor_context, + acceptor_context_reads + ) # Add the read identifiers of reads with # unmapped mates. self.contexts.set_unmapped_acceptor_mate_ids( - variantid, - varcon_unmapped_a - ) + variantid, + varcon_unmapped_a + ) self.contexts.set_unmapped_donor_mate_ids( - variantid, - varcon_unmapped_d - ) + variantid, + varcon_unmapped_d + ) self.contexts.set_acceptor_context_unmapped_mate_ids( - variantid, - acceptor_unmapped - ) + variantid, + acceptor_unmapped + ) self.contexts.set_donor_context_unmapped_mate_ids( - variantid, - donor_unmapped - ) + variantid, + donor_unmapped + ) except IOError: self.vaselogger.warning("Could not obtain BAM reads from " @@ -357,7 +398,7 @@ def build_varcon_set(self, sampleidlist, # Write the variant contexts to file. self.vaselogger.info("Writing variant contexts to " f"{varcon_outpath}") - self.contexts.write_variant_context_file(varcon_outpath) + self.contexts.write_variant_context_file(varcon_outpath, self.creation_id) # Write the variant context statistics file. self.vaselogger.info("Writing variant context statistics to " @@ -388,7 +429,7 @@ def build_varcon_set(self, sampleidlist, if self.vaselogger.getEffectiveLevel() == 10: self.write_optional_output_files(outpath, self.contexts) acceptorbamfile.close() - return + return self.contexts def build_donor_from_varcon(self, varc_file, bamsamplemap, reference_loc): """Reads an existing variant context file and fetches donor reads from @@ -398,6 +439,12 @@ def build_donor_from_varcon(self, varc_file, bamsamplemap, reference_loc): set from a pre-built variant context file. Does not output a new variant context file, and does not store any read information for the acceptor except for read IDs per variant. + + Parameters + ---------- + varc_file + bamsamplemap + reference_loc """ # Reads in an existing variant context file. self.contexts = VariantContextFile(varc_file) @@ -433,7 +480,7 @@ def build_donor_from_varcon(self, varc_file, bamsamplemap, reference_loc): context.variant_context_start, context.variant_context_end, bamfile) - ) + ) # Close the sample's BAM file when done. bamfile.close() return @@ -450,7 +497,6 @@ def build_validation_set(self, run_mode, acceptor_bam, Depending on mode, will either output combined acceptor/donor FastQs, or donor-only FastQs. """ - # Per-variant Fq output functionality. if "P" in run_mode: self.vaselogger.info(f"Begin writing variant FastQ files.") @@ -470,8 +516,8 @@ def build_validation_set(self, run_mode, acceptor_bam, if "F" in run_mode: # Set up a set of all acceptor fastq reads to skip. skip_list = set( - self.contexts.get_all_variant_context_acceptor_read_ids() - ) + self.contexts.get_all_variant_context_acceptor_read_ids() + ) for i, fq_i in zip(["1", "2"], [fq1_in, fq2_in]): # Write the fastq files. @@ -488,8 +534,26 @@ def build_validation_set(self, run_mode, acceptor_bam, self.vaselogger.info("Finished writing FastQ files.") return - # Checks whether a value is in a filter list (array or set) def passes_filter(self, val_to_check, filter_to_use, is_exclude_filter=False): + """Checks whether a value is in a filter list. + + A value can be checked against either an inclusion or exclusion filter. An inclusion filter should be the values + that should be included and used, an exclusion filter for values to be excluded and not used. + + Parameters + ---------- + val_to_check : str + Value to check against filter + filter_to_use : list of str + Values to use as filter + is_exclude_filter : bool + Is the filter an exclusion filter (false for inclusion filter) + + Returns + ------- + bool + True if value in an inclusive filter or not in an exclusive filter, False otherwise + """ if filter_to_use is not None: if is_exclude_filter: if val_to_check in filter_to_use: @@ -503,6 +567,20 @@ def passes_filter(self, val_to_check, filter_to_use, is_exclude_filter=False): # Returns a list of VcfVariant objects from a VCF file. A filter can be used to select specific variants. def get_sample_vcf_variants(self, vcf_fileloc, filterlist=None): + """Reads and returns read variants from a variant file. + + Parameters + ---------- + vcf_fileloc : str + Path to variant file to read + filterlist : list of tuple + Variants to include + + Returns + ------- + sample_variant_list : list of VcfVariant + Read variants fro the variant file + """ sample_variant_list = [] try: vcf_file = pysam.VariantFile(vcf_fileloc, "r") @@ -517,8 +595,24 @@ def get_sample_vcf_variants(self, vcf_fileloc, filterlist=None): finally: return sample_variant_list - # Returns whether a variant is a SNP or indel. def determine_variant_type(self, vcfvariantref, vcfvariantalts): + """Determines and returns the variant type. + + The type of variant is determined based on the lengths of the reference and alternative allele(s). Currently + only SNP or indel is returned aas variant type. + + Parameters + ---------- + vcfvariantref : str + Variant reference allele(s) + vcfvariantalts : tuple of str + Variant alternative allele(s) + + Returns + ------- + str + Variant type + """ # Determine the maximum reference allele length. maxreflength = max([len(x) for x in vcfvariantref.split(",")]) # Determine the maximum alternative allele length. @@ -532,9 +626,25 @@ def determine_variant_type(self, vcfvariantref, vcfvariantalts): return "indel" return "?" - # Determines the start and end positions to use for searching reads - # overlapping with the variant. def determine_read_search_window(self, varianttype, vcfvariant): + """Determines and returns the search window for fetching reads. + + The determined search window depends on the provided variant type. SNPs return a search window of SNP position + -1 and +1. Indels return a search window based indel lefmost position and length. If the variant type is + neither SNP or indel, the search window returned will be [-1, -1]. + + Parameters + ---------- + varianttype : str + Variant type to determine search window for + vcfvariant : VcfVariant + Variant to determine search window with + + Returns + ------- + list of int + Search window start and stop, -1 and -1 if variant type is invalid + """ if varianttype == "snp": return [vcfvariant.get_variant_pos() - 1, vcfvariant.get_variant_pos() + 1] elif varianttype == "indel": @@ -543,80 +653,232 @@ def determine_read_search_window(self, varianttype, vcfvariant): vcfvariant.get_variant_alt_alleles()) return [-1, -1] - # Returns the search start and stop to use for searching BAM reads - # overlapping with the range of the indel. + # Returns the search start and stop to use for searching BAM reads overlapping with the range of the indel. def determine_indel_read_range(self, variantpos, variantref, variantalts): + """Determines and returns the search start and stop to use for an indel. + + Parameters + ---------- + variantpos : int + Leftmost genomic position of the variant + variantref : str + Variant reference allele(s) + variantalts : tuple + Variant alternative allele(s) + + Returns + ------- + list of int + Search window start and stop + """ searchstart = variantpos searchstop = variantpos + max( - max([len(x) for x in variantref.split(",")]), - max([len(x) for x in variantalts]) - ) + max([len(x) for x in variantref.split(",")]), + max([len(x) for x in variantalts]) + ) return [searchstart, searchstop] - # Returns the BAM reads containing the specific vcf variant as well - # as their read mate. def get_variant_reads(self, contextid, variantchrom, variantstart, variantend, bamfile, write_unm=False, umatelist=None): - # Obtain all the variant reads overlapping with the variant and - # their mate reads. + """Fetches and returns reads overlapping with a specified variant. + + First reads overlapping directly with the variant position are fetched. Then the read mates are fetched using + the RNEXT and PNEXT values of each read. Lastly, it is ensured that each read only occurs once. + + Parameters + ---------- + contextid : str + Identifier of the context to fetch reads for + variantchrom : str + Chromosome name to fetch reads from + variantstart : int + Leftmost genomic position to use for fetching + variantend : int + Rightmost genomic position to use for fetching + bamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile + write_unm : bool + Save read identifiers with an unmapped mate + umatelist : list of str + Identifiers of reads with an unmapped mate + + Returns + ------- + variantreads : list of DonorBamRead + Fetched reads and their read mates + """ + hardclipped_read_num = 0 + duplicate_read_num = 0 + secondary_read_num = 0 + + read_objects = [] variantreads = [] - rpnext = {} + clipped_reads = [] + list_r1 = [] + list_r2 = [] for vread in bamfile.fetch(variantchrom, variantstart, variantend): - variantreads.append(DonorBamRead( - vread.query_name, - self.get_read_pair_num(vread), - vread.reference_name, - vread.reference_start, - vread.infer_read_length(), - vread.get_forward_sequence(), - "".join([chr((x + 33)) - for x in vread.get_forward_qualities()]), - vread.mapping_quality - )) - rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] - - # Obtain the read mate (this must be done after the initial fetch not during!) - for read1 in rpnext.values(): - materead = self.fetch_mate_read(*read1, bamfile) - if materead is not None: - variantreads.append(materead) - else: - if write_unm: - self.vaselogger.debug("Could not find mate for " - f"{read1[2]} ; " - "mate is likely unmapped.") - umatelist.append(read1[2]) - - # Make sure the list only contains each BAM read once (if a read - # and mate both overlap with a variant, they have been added - # twice to the list). - uniq_variantreads = [] + if vread.is_duplicate: + duplicate_read_num += 1 + if vread.is_secondary: + secondary_read_num += 1 + if self.read_is_hard_clipped(vread): + hardclipped_read_num += 1 + clipped_reads.append(vread) + continue + read_objects.append(vread) + + self.vaselogger.debug(f"Fetched {hardclipped_read_num} reads with hardclipped bases") + for clipped in clipped_reads: + read_objects.append(self.fetch_primary_from_secondary(clipped, bamfile)) + + for vread in read_objects: + if vread.is_read1: + list_r1.append(vread) + elif vread.is_read2: + list_r2.append(vread) + + list_r1_ids = [x.query_name for x in list_r1] + list_r2_ids = [x.query_name for x in list_r2] + + for r1 in list_r1: + if r1.query_name not in list_r2_ids: + list_r2.append(self.fetch_mate_read(r1.query_name, r1.next_reference_name, r1.next_reference_start, + self.get_read_pair_num(r1), bamfile)) + for r2 in list_r2: + if r2.query_name not in list_r1_ids: + list_r1.append(self.fetch_mate_read(r2.query_name, r2.next_reference_name, r2.next_reference_start, + self.get_read_pair_num(r2), bamfile)) + + for vread in list_r1 + list_r2: + variantreads.append(DonorBamRead(vread.query_name, vread.flag, self.get_read_pair_num(vread), + vread.reference_name, vread.reference_start, vread.infer_read_length(), + vread.reference_end, vread.cigarstring, vread.next_reference_name, + vread.next_reference_start, + vread.template_length, vread.get_forward_sequence(), + "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), + vread.mapping_quality)) + + variantreads = self.uniqify_variant_reads(variantreads) + return variantreads + + def fetch_primary_from_secondary(self, secondary_read, bamfile): + """Fetches and returns the primary alignment of a read based on the position recorded in its SA tag. + + The SA tag is read from a provided Pysam read object. Then, reads at the recorded alignment position are + fetched. The first non-secondary alignment with a matching read ID and pair number is returned. + + Parameters + ---------- + secondary_read : pysam.AlignedSegment + Secondary alignment whose primary alignment will be located + bamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile + + Returns + ------- + primary_read : pysam.AlignedSegment + Primary alignment with the same read ID and pair number as the input secondary alignment. + """ + primary_locus = secondary_read.get_tag("SA").split(",")[0:2] + for fetched in bamfile.fetch(primary_locus[0], int(primary_locus[1]) - 1, int(primary_locus[1])): + if ((fetched.query_name == secondary_read.query_name) + and (fetched.is_read1 == secondary_read.is_read1) + and not fetched.is_secondary): + primary_read = fetched + break + return primary_read + +# ============================================================================= +# def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umatelist=None): +# """Fetches and returns read mates for a set of reads. +# +# Read mates are fetched from an already opened pysam alignment file using the RNEXT and PNEXT values as well as +# the read identifier. +# +# Parameters +# ---------- +# rpnextmap : dict +# bamfile : pysam.AlignmentFile +# Already opened pysam alingment file to fetch read mates from +# variantreadlist : list of DonorBamRead +# Reads to fetch read mates for +# write_unm : bool +# Write identifiers of reads with unmapped mates +# umatelist : list of str +# Identifiers with reads that have an unmapped mate +# +# Returns +# ------- +# variantreadlist : list of DonorBamRead +# Updated list of reads and the added read mates +# """ +# hardclipped_read_num = 0 +# for read1 in rpnextmap.values(): +# materead = self.fetch_mate_read(*read1, bamfile) +# if materead is not None: +# if materead.read_has_hardclipped_bases(): +# hardclipped_read_num += 1 +# variantreadlist.append(materead) +# else: +# if write_unm: +# self.vaselogger.debug(f"Could not find mate for read {read1[2]} ; mate is likely unmapped.") +# umatelist.append(read1[2]) +# self.vaselogger.debug(f"Fetched {hardclipped_read_num} read mates with hardclipped bases") +# return variantreadlist +# ============================================================================= + + def uniqify_variant_reads(self, variantreads): + """Ensures each read only occurs once and returns the updates set. + + Parameters + ---------- + variantreads : list of DonorBamRead + Sets of reads to process + + Returns + ------- + unique_variantreads : list of DonorBamRead + Set of reads with each read occuring only once + """ + unique_variantreads = [] checklist = [] for fetched in variantreads: id_pair = (fetched.get_bam_read_id(), fetched.get_bam_read_pair_number()) if id_pair not in checklist: - uniq_variantreads.append(fetched) + unique_variantreads.append(fetched) checklist.append(id_pair) - variantreads = uniq_variantreads - - # Filter to keep only read pairs. - variantreads = self.filter_variant_reads(variantreads) - self.vaselogger.debug(f"Found a total of {len(variantreads)} " - "BAM reads.") - return variantreads - - # Returns the mate read using the fetch() method - def fetch_mate_read(self, rnext, pnext, readid, bamfile): + return unique_variantreads + + def fetch_mate_read(self, readid, rnext, pnext, pair_num, bamfile): + """Fetches and returns the mate read of a specified read. + + The mate read is fetched from an opened alignment file using the RNEXT and PNEXT value of the read. Of the + fetched reads, only the read with the same identifier is returned. + + Parameters + ---------- + rnext : str + Chromosome name the mate read is located on + pnext : int + Leftmost genomic position of the mate read + readid : str + Identifier of the read to search the mate for + bamfile : pysam.AlignmentFile + Already openend pysam alignment file + + Returns + ------- + DonorBamRead or None + The mate read if found, None if not + """ for bamread in bamfile.fetch(rnext, pnext, pnext + 1): - if bamread.query_name == readid and bamread.reference_start == pnext: - return DonorBamRead(bamread.query_name, self.get_read_pair_num(bamread), bamread.reference_name, - bamread.reference_start, bamread.infer_read_length(), - bamread.get_forward_sequence(), - "".join([chr((x + 33)) for x in bamread.get_forward_qualities()]), - bamread.mapping_quality) + if (bamread.query_name == readid + and bamread.reference_start == pnext + and self.get_read_pair_num(bamread) != pair_num): + return bamread return None # Filters the donor reads to keep only reads that occur twice. @@ -624,14 +886,44 @@ def filter_variant_reads(self, bamreads): return [bread for bread in bamreads if (self.read_occurence(bread.get_bam_read_id(), bamreads) == 2)] - # Returns the number of occurences of a certain read in the list of - # BAM reads (should be two ideally). def read_occurence(self, readid, readlist): + """Checks and returns the occurence of a specified read. + + Parameters + ---------- + readid : str + Identifier of the read to check + readlist : list of DonorBamRead + Set of reads to check occurence of read in + + Returns + ------- + int + Occurences of the specified read + """ return sum([bamread.get_bam_read_id() == readid for bamread in readlist]) - # Determines the start and stops of the variant context (please see - # the documentation for more information). def determine_context(self, contextreads, contextorigin, contextchr): + """Determines and returns an acceptor/donor context. + + The acceptor/donor context is determined from a set of reads overlapping with the variant including their read + mates. The leftmost and rightmost genomic positions of all reads are collected and filtered for outliers. + The context start (leftmost genomic) position is then determined by taking minimum and maximum leftmost and + rightmost position respectively. + + Parameters + ---------- + contextreads: list of DonorBamRead + contextorigin : int + Variant genomic position the context will be based on + contextchr : str + Chromosomae name the context is located on + + Returns + ------- + list of str and int + Essential context data (chromosome, variant pos, start, end) + """ # Check whether there are reads to determine the context for. if not contextreads: return [] @@ -644,7 +936,7 @@ def determine_context(self, contextreads, contextorigin, contextchr): stops = [conread.get_bam_read_ref_end() for conread in contextreads - if conread.get_bam_read_chrom() == contextchr] + if conread.get_bam_read_chrom() == contextchr and conread.get_bam_read_ref_end() is not None] # Number of mates filtered for mapping to different chr. num_diff_chr = len(contextreads) - len(starts) @@ -666,15 +958,23 @@ def determine_context(self, contextreads, contextorigin, contextchr): # Set variant context as chr, min start, max end. contextstart = min(filtered_starts) contextend = max(filtered_stops) - self.vaselogger.debug("Context is " - + str(contextchr) + ", " - + str(contextstart) + ", " - + str(contextend)) return [contextchr, contextorigin, contextstart, contextend] - # Filters outliers from a list of start/stop positions using - # Tukey's Fences method. def filter_outliers(self, pos_list, k=3): + """Filters outliers from a list of start or stop positions and returns the filtered list. + + Outlier start/stop positions are filtered from the list using Tukey's Fences method. For more info please see + https://en.wikipedia.org/wiki/Outlier#Tukey's_fences + + Parameters + ---------- + pos_list : list of int + Start/stop positions + k : int + Factor to determine outlier + Returns + ------- + """ # First and third quartile values of the positions. q1 = np.percentile(pos_list, 25) q3 = np.percentile(pos_list, 75) @@ -686,10 +986,24 @@ def filter_outliers(self, pos_list, k=3): if ((q1 - (k * iq)) <= x <= (q3 + (k * iq)))] return filtered - # Determines the size of the variant context based on both the - # acceptor and donor reads. def determine_largest_context(self, contextorigin, acceptor_context, donor_context): + """Determines the size of the variant context based on both the acceptor and donor reads. + + Parameters + ---------- + contextorigin : int + Variant position that the context is based on + acceptor_context : list of str and int + Essential acceptor context data + donor_context : list of str and int + Essential donor context data + + Returns + ------- + list of str and int + Context (chromosome, variant pos, start, end) + """ largest_context = [donor_context[0]] largest_context.append(contextorigin) # Determine and save the smallest context start. @@ -767,9 +1081,20 @@ def write_vase_fastq(self, acceptor_infq, fastq_outpath, exit() def build_donor_fq(self, donorbamreaddata, fr, fastq_outpath): + """Build and writes a fastq file containing only donor reads. + + Parameters + ---------- + donorbamreaddata : list of tuple + Donor reads to write to fastq file + fr : str + Write forward ('1') or reverse ('2') fastq file + fastq_outpath : str + Path and name to write donor fastq file to + """ # Write the new VaSe FastQ file. - vasefq_outname = self.set_fastq_out_path(fastq_outpath, fr, 1) - fqgz_outfile = io.BufferedWriter(open(vasefq_outname, "wb")) + # vasefq_outname = self.set_fastq_out_path(fastq_outpath, fr, 1) + fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb")) donorbamreaddata.sort(key=lambda dbr: dbr[0], reverse=False) for bamread in donorbamreaddata: @@ -783,29 +1108,72 @@ def build_donor_fq(self, donorbamreaddata, fr, fastq_outpath): fqgz_outfile.flush() fqgz_outfile.close() - # Checks if a read is read 1 (R1) or read 2 (R2). def is_required_read(self, bamread, fr): + """Checks and returns whether the current read is the one that is required. + + When writing the validation fastq files only R1, or forward (F), reads should go to the R1 validation fastq + file. Similarly, the R2 reads should only go into the R2 validation fastq file. + + Parameters + ---------- + bamread : DonorBamRead + Read to check + fr : str + Whether the read is required for R1 or R2 + + Returns + ------- + bool + True if the read is required, False if not + """ if fr == "F": return bamread.is_read1() return bamread.is_read2() - # Returns the name for the fastq out file. def set_fastq_out_path(self, outpath, fr, lnum): + """Sets and returns the fastq output path and filename. + + Parameters + ---------- + outpath : str + Path and suffix to write fastq file to + fr: str + Will the fastq file be R1 ('1') or R2 ('2') + lnum: int + Lane number (i.e.: 1, 2, 3 or 4) + + Returns + ------- + str + Full path to write fastq file to + """ if fr == "1": return f"{outpath}_{datetime.now().date()}_L{lnum}_R1.fastq" return f"{outpath}_{datetime.now().date()}_L{lnum}_R2.fastq" # ===METHODS TO OBTAIN SOME DATA OF THE VASEBUILDER OBJECT================= - # Returns the identifier of the current VaSeBuilder object. def get_creation_id(self): + """Returns the identifier of the current VaSeBuilder object. + + Returns + ------- + self.creation_id : str + VaSeBuilder creation identifier + """ return self.creation_id # Returns the date the current VaSeBuilder object has been made. # def get_creation_date(self): - # return self.creation_date + # return self.creation_date - # Returns the time the current VaSeBuilder object has been made. def get_creation_time(self): + """Returns the date and time the current VaSeBuilder object has been made. + + Returns + ------- + str + VaSeBuilder creation date and time + """ return self.creation_time # Returns all variant contexts. @@ -845,123 +1213,156 @@ def get_searchchrom_prefix(self, bamfile): return "" # ===METHODS TO WRITE OUTPUT FILES========================================= - # Writes the used donor vcf files to a file def write_used_donor_files(self, outfileloc, filesamplemap, - used_donor_files): + used_donor_files, vbuuid): + """Writes the donor alignment or variant files used in constructing variant contexts to an output file. + + Parameters + ---------- + outfileloc : str + Path to write the output file to + filesamplemap : dict + Donor files per sample + used_donor_files : list + Donor alignment or variant files used to construct varian contexts + """ try: with open(outfileloc, "w") as outfile: + outfile.write(f"#VBUUID: {vbuuid}\n") outfile.write("#SampleId\tDonorFile\n") for sampleid, samplefile in filesamplemap.items(): - if (samplefile in used_donor_files): + if samplefile in used_donor_files: outfile.write(f"{sampleid}\t{samplefile}\n") except IOError as ioe: self.vaselogger.critical("Could not write used donor files to " f"{outfileloc}") - # Writes optional debug level output files. def write_optional_output_files(self, outpath, contextfile): + """Writes optional output files to a specified output location. + + The optional output files contain data about acceptor and donor contexts as well as output files containing the + reads with unmapped mates per variant context. The optional output files are only produced when VaSeBuilder is + run with with debug set to True. + + Parameters + ---------- + outpath: str + Path to write the optional output file to + contextfile: VariantContextFile + Variant context data + """ # Write the optional acceptor context files; acceptor contexts, # read ids with unmapped mate and left/right positions. self.vaselogger.debug("Writing acceptor contexts to " f"{outpath}acceptorcontexts.txt") - contextfile.write_acceptor_context_file(f"{outpath}acceptorcontexts.txt") + contextfile.write_acceptor_context_file(f"{outpath}acceptorcontexts.txt", self.creation_id) self.vaselogger.debug("Writing acceptor context statistics to " f"{outpath}acceptorcontextstats.txt") - contextfile.write_acceptor_context_stats( - f"{outpath}acceptorcontextstats.txt" - ) + contextfile.write_acceptor_context_stats(f"{outpath}acceptorcontextstats.txt", self.creation_id) self.vaselogger.debug("Writing acceptor context read identifiers with " "unmapped mates to" f"mates to {outpath}acceptor_unmapped.txt") - contextfile.write_acceptor_unmapped_mates( - f"{outpath}acceptor_unmapped.txt" - ) + contextfile.write_acceptor_unmapped_mates(f"{outpath}acceptor_unmapped.txt", self.creation_id) self.vaselogger.debug("Writing left and right most read positions of " "each acceptor context to " f"{outpath}acceptor_positions.txt") - contextfile.write_acceptor_left_right_positions( - f"{outpath}acceptor_positions.txt" - ) + contextfile.write_acceptor_left_right_positions(f"{outpath}acceptor_positions.txt", self.creation_id) - # Write the optional donor context files; donor contexts, - # read ids with unmapped mate and left/right positions. + # Write the optional donor context files; donor contexts, read ids with unmapped mate and left/right positions. self.vaselogger.debug("Writing donor contexts to " f"{outpath}donorcontexts.txt") - contextfile.write_donor_context_file(f"{outpath}donorcontexts.txt") + contextfile.write_donor_context_file(f"{outpath}donorcontexts.txt", self.creation_id) self.vaselogger.debug("Writing donor context statistics to " f"{outpath}donorcontextstats.txt") - contextfile.write_donor_context_stats(f"{outpath}donorcontextstats.txt") + contextfile.write_donor_context_stats(f"{outpath}donorcontextstats.txt", self.creation_id) self.vaselogger.debug("Writing donor context read identifiers " - "with unmapped mates to " - f"{outpath}donor_unmapped.txt") - contextfile.write_donor_unmapped_mates(f"{outpath}donor_unmapped.txt") + f"with unmapped mates to {outpath}donor_unmapped.txt") + contextfile.write_donor_unmapped_mates(f"{outpath}donor_unmapped.txt", self.creation_id) self.vaselogger.debug("Writing left and right most read positions " - "of each donor context to " - f"{outpath}donor_positions.txt") - contextfile.write_donor_left_right_positions( - f"{outpath}donor_positions.txt" - ) - - # Write the optional variant context files; acceptor & donor - # unmapped mates and left/right positions. + f"of each donor context to {outpath}donor_positions.txt") + contextfile.write_donor_left_right_positions(f"{outpath}donor_positions.txt", self.creation_id) + + # Write the optional variant context files; acceptor & donor unmapped mates and left/right positions. self.vaselogger.debug("Writing variant context acceptor read " - "identifiers with unmapped mates to " - f"{outpath}varcon_unmapped_acceptor.txt") - contextfile.write_reads_with_unmapped_mate( - "acceptor", - f"{outpath}varcon_unmapped_acceptor.txt" - ) + f"identifiers with unmapped mates to {outpath}varcon_unmapped_acceptor.txt") + contextfile.write_reads_with_unmapped_mate("acceptor", f"{outpath}varcon_unmapped_acceptor.txt", + self.creation_id) self.vaselogger.debug("Writing variant context donor read identifiers " - "with unmapped mates to " - f"{outpath}varcon_unmapped_donor.txt") - contextfile.write_reads_with_unmapped_mate( - "donor", - f"{outpath}varcon_unmapped_donor.txt" - ) + f"with unmapped mates to {outpath}varcon_unmapped_donor.txt") + contextfile.write_reads_with_unmapped_mate("donor", f"{outpath}varcon_unmapped_donor.txt", self.creation_id) self.vaselogger.debug("Writing variant context left and right most " - "read positions of acceptor reads to " - f"{outpath}varcon_positions_acceptor.txt") + f"read positions of acceptor reads to {outpath}varcon_positions_acceptor.txt") contextfile.write_left_right_positions( - "acceptor", - f"{outpath}varcon_positions_acceptor.txt" - ) + "acceptor", + f"{outpath}varcon_positions_acceptor.txt", self.creation_id + ) self.vaselogger.debug("Writing variant context left and right most " "read positions of donor reads to " f"{outpath}varcon_positions_donor.txt") contextfile.write_left_right_positions( - "donor", - f"{outpath}varcon_positions_donor.txt" - ) - - # Writes a BED file for the variant context data - def write_bed_file(self, variantcontextdata, bedoutloc): + "donor", + f"{outpath}varcon_positions_donor.txt", self.creation_id + ) + + def write_bed_file(self, variantcontextdata, bedoutloc, vbuuid): + """Writes variant contexts as a BED file + + Parameters + ---------- + variantcontextdata : list of VariantContext + Variants contexts to write to BED file + bedoutloc : str + Path to write the BED file to + """ try: with open(bedoutloc, "w") as bedoutfile: + bedoutfile.write(f"#VBUUID: {vbuuid}\n") for varcon in variantcontextdata: bedoutfile.write(f"{varcon.get_variant_context_chrom()}\t{varcon.get_variant_context_start()}\t" f"{varcon.get_variant_context_end()}\t{varcon.get_variant_context_id()}\n") except IOError: self.vaselogger.warning(f"Could not write variant context data to BED file: {bedoutloc}") - # Checks that thet sequence names are the same def check_sequence_names(self, referencefile, alignmentfile): + """Checks and returns whether the chromosome names in the genome reference and alignment file are the same. + + Parameters + ---------- + referencefile: + alignmentfile: + + Returns + ------- + + """ reference_seqnames = self.get_reference_sequence_names(referencefile) alignment_seqnames = self.vb_scanner.get_alignment_sequence_names(alignmentfile) shared_seqnames = reference_seqnames & alignment_seqnames if len(shared_seqnames) < len(reference_seqnames) or len(shared_seqnames) < len(alignment_seqnames): self.vaselogger.warning("Reference file and alignment file do not contain the same sequence names") - # Returns the sequence names from the reference genome fasta file def get_reference_sequence_names(self, reference_fileloc): + """Returns the sequence names from the reference genome fasta file + + Parameters + ---------- + reference_fileloc: str + Path to genomic reference fasta file + + Returns + ------- + reference_seqnames : list of str + Extracted chromosome names + """ reference_seqnames = set() try: with open(reference_fileloc, "r") as reference_file: @@ -974,8 +1375,28 @@ def get_reference_sequence_names(self, reference_fileloc): exit() return reference_seqnames - # Adds the donor fastq file while making sure each read is only added once def add_donor_fastq3(self, opened_outfile, donorfastqfile, donoridlist): + """Adds a donor fastq file to an already opened validation fastq file. + + Reads from a specified donor fastq file are added to an opened validation fastq file if the reads had not been + added to a validation fastq file. If already added (the read identifier is in the set of donor read ids), the + donor read is skipped. After adding a donor read is written to the validation fastq file, the read identifier is + added to the donor id list, ensuring every read is only added once. + + Parameters + ---------- + opened_outfile : File + Already opened validation fastq file to write data to + donorfastqfile : str + Path to donor fastq file to add + donoridlist : set of str + Set of donor read identifiers already added to the validation fastq files + + Returns + ------- + donoridlist : set of str + Updated set of donor read identifiers already added to validation set + """ try: with io.BufferedReader(gzip.open(donorfastqfile, "rb")) as donorfastq: for fileline in donorfastq: @@ -1000,16 +1421,42 @@ def add_donor_fastq3(self, opened_outfile, donorfastqfile, donoridlist): finally: return donoridlist - # Builds a validation set using existing donor fastq files def build_validation_from_donor_fastqs(self, afq1_in, afq2_in, dfq1_in, dfq2_in, varconfileloc, outpath): + """Builds a validation set using existing donor fastq files. + + Parameters + ---------- + afq1_in : list of str + R1 acceptor fastq files to use as template + afq2_in : list of str + R2 acceptor fastq files to use as template + dfq1_in : list of str + R1 donor fsatq files to add + dfq2_in : list of str + R2 donor fastq files to add + varconfileloc : str + Path to existing variant context file to build validation set + outpath :str + """ varconfile = VariantContextFile(varconfileloc) skip_list = varconfile.get_all_variant_context_acceptor_read_ids() skip_list.sort() for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [dfq1_in, dfq2_in]): self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) - # Splits a list of donor fastq files over a number of acceptors def divide_donorfastqs_over_acceptors(self, list_of_donorfqs, num_of_acceptors): + """Divides a list of donor fastq files over a number of acceptor fastq files. + + Donor fastq files are divided equally into a specified number of groups. The number of groups is specified by + the number of acceptor fastq files to use as template. + + Parameters + ---------- + list_of_donorfqs: list of str + Paths to donor fastq files + num_of_acceptors: int + Number of template fastq files + """ cycler = 0 split_donors = [[] for x in range(num_of_acceptors)] copy_list_donorfqs = list(list_of_donorfqs) @@ -1019,15 +1466,36 @@ def divide_donorfastqs_over_acceptors(self, list_of_donorfqs, num_of_acceptors): cycler += 1 return split_donors - # BUILDS A SET OF R1/R2 VALIDATION FASTQS WITH ALREADY EXISTING + # BUILDS A SET OF R1/R2 VALIDATION FASTQS WITH ALREADY EXISTING DONOR FASTQS def build_fastqs_from_donors(self, acceptor_fqin, donor_fqin, acceptor_reads_toexclude, forward_reverse, outpath): + """Builds a set of R1/R2 validation fastq files using already existing donor fastq files. + + The provided acceptor fastq files are filtered and used as template for the valdiation fastq files. Already + existing fastq files are added. + + Parameters + ---------- + acceptor_fqin: str + Path to acceptor fastq file to use as template + donor_fqin: list of str + Paths to donor fastq files to add + acceptor_reads_toexclude: liust of str + Acceptor reads to exclude from validation set + forward_reverse: str + Write R1 or R2 + outpath: str + Path to write the validation fastq file to + """ donor_sets = self.divide_donorfastqs_over_acceptors(donor_fqin, len(acceptor_fqin)) donorids = set() + + self.vaselogger.info(f"Start building validation R{forward_reverse} fastq files") for x in range(len(acceptor_fqin)): fqoutname = self.set_fastq_out_path(outpath, forward_reverse, x+1) fqoutfile = io.BufferedWriter(open(fqoutname, "wb")) # Start writing the filtered acceptor file to a new output file + self.vaselogger.info(f"Start building fastq file {fqoutname}") try: acceptorfastq = io.BufferedReader(gzip.open(acceptor_fqin[x], "rb")) for fileline in acceptorfastq: @@ -1050,7 +1518,1050 @@ def build_fastqs_from_donors(self, acceptor_fqin, donor_fqin, acceptor_reads_toe except IOError: self.vaselogger.critical(f"Acceptor fastq {acceptor_fqin[x]} could not be opened") - # Add the donor fastq files + # Add the donor fastq files. + self.vaselogger.info(f"Add {len(donor_sets[x])} donor fastq files to {fqoutname}") for donorfastqfile in donor_sets[x]: + self.vaselogger.debug(f"Adding {donorfastqfile} to {fqoutname}") self.add_donor_fastq3(fqoutfile, donorfastqfile, donorids) fqoutfile.close() + + # A-mode: Filters acceptors and adds already existing donor fastq files + def run_a_mode(self, afq1_in, afq2_in, dfq1_in, dfq2_in, varconfileloc, outpath): + self.vaselogger.info("Running VaeBuilder A-mode") + varconfile = VariantContextFile(varconfileloc) + skip_list = varconfile.get_all_variant_context_acceptor_read_ids() + skip_list.sort() + for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [dfq1_in, dfq2_in]): + self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) + + def run_ac_mode(self, afq1_in, afq2_in, dfqs, varconfile, outpath): + """Runs VaSeBuilder AC-mode. + + This run mode builds a set of validation fastq files by adding already existing fastq files to acceptor fastq + files used as template. The variant context file used to construct the variant contexts of the donor fastq files + is required to filter out the acceptor reads.to be replaced with the donor data. + + Parameters + ---------- + afq1_in : list of str + R1 acceptor fastq files to use as template + afq2_in : list of str + R2 acceptor fastq files to use as template + dfqs : list of list of str + R1 and R2 donor fastq files to add + varconfile: VariantContextFile + Variant context to use for filtering out acceptor reads + outpath: str + Path to folder to write the output to + """ + self.vaselogger.info("Running VaSeBuilder A-mode") + # Split the donor fastqs into an R1 and R2 group + r1_dfqs = [dfq[0] for dfq in dfqs] + r2_dfqs = [dfq[1] for dfq in dfqs] + + skip_list = varconfile.get_all_variant_context_acceptor_read_ids() + skip_list.sort() + + for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [r1_dfqs, r2_dfqs]): + self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) + + def run_d_mode(self, variantcontextfile, fq_out): + """Runs VaSeBuilder D-mode. + + This run mode produces one R1 and R2 file with donor reads of all variant contexts. This mode does not produce + validation fastq files. + + Parameters + ---------- + variantcontextfile : VariantContextFile + Variants context to use + fq_out : str + Path and suffix to write donor fastq files to + """ + self.vaselogger.info("Running VaSeBuilder D-mode") + # Combine all donor reads from all variant contexts + add_list = variantcontextfile.get_all_variant_context_donor_reads() + self.vaselogger.info("Writing FastQ files.") + # Write the donor fastq files. + for i in ["1", "2"]: + self.vaselogger.info(f"Start writing the R{i} FastQ files.") + fq_starttime = time.time() + fqoutpath = self.set_fastq_out_path(fq_out, i, 1) + self.build_donor_fq(add_list, i, fqoutpath) + self.vaselogger.info(f"Wrote all R{i} FastQ files.") + self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.") + self.vaselogger.info("Finished writing donor FastQ files.") + + def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out, random_seed): + """Runs VaSeBuilder F-mode. + + This run mode creates a full set of validation fastq files from established variant contexts and a set of + acceptor fastq files to use as template to create the validation fastq files. + + Parameters + ---------- + variantcontextfile : VariantContextFile + Variant contexts to use + fq1_in : list of str + R1 fastq files to use as template + fq2_in : list of str + R2 fastq files to use as template + fq_out : list of str + Path and suffix to write validation fastq files to + """ + self.vaselogger.info("Running VaSeBuilder F-mode") + + # Combine all donor reads from all variant contexts + add_list = variantcontextfile.get_all_variant_context_donor_reads() + self.vaselogger.info("Writing FastQ files.") + skip_list = set(variantcontextfile.get_all_variant_context_acceptor_read_ids()) + # skip_list = list(set(variantcontextfile.get_all_variant_context_acceptor_read_ids())) + + donor_read_add_data = {} + for i, fq_i in zip(["1", "2"], [fq1_in, fq2_in]): + # Write the fastq files. + self.vaselogger.info(f"Start writing the R{i} FastQ files.") + fq_starttime = time.time() + self.build_fastq_v2(fq_i, skip_list, add_list, i, fq_out, random_seed, donor_read_add_data) + self.vaselogger.info(f"Wrote all R{i} FastQ files.") + self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.") + self.vaselogger.info("Finished writing FastQ files.") + self.write_donor_insert_positions_v2(donor_read_add_data, f"{fq_out}_donor_read_insert_positions.txt") + + def run_p_mode(self, variantcontextfile, outpath, fq_out): + """Runs VaSeBuilder P-mode. + + This run mode produces an R1 and R2 fastq file with donor reads for each variant context. This mode does not + create or write validation fastq files. + + Parameters + ---------- + variantcontextfile : VariantContextFile + Established variant contexts + outpath : str + Path to folder to write output files to + fq_out : str + Path and suffix to write fastq out files to + """ + context_fq_links = {} + + self.vaselogger.info("Running VaSeBuilder P-mode") + self.vaselogger.info("Begin writing variant FastQ files.") + variantcontexts = variantcontextfile.get_variant_contexts() + for context in variantcontexts: + add_list = context.get_donor_read_strings() + self.vaselogger.debug(f"Writing variant FastQs for variant {context.context_id}.") + + r1_donorfq = self.set_fastq_out_path(fq_out + context.context_id, "1", 1) + r2_donorfq = self.set_fastq_out_path(fq_out + context.context_id, "2", 1) + self.build_donor_fq(add_list, "1", r1_donorfq) + self.build_donor_fq(add_list, "2", r2_donorfq) + context_fq_links[context.context_id] = [r1_donorfq, r2_donorfq] + self.vaselogger.info("Finished writing variant FastQ files.") + + # Write the P-mode link file (links context to fastq files for the current run) + self.write_pmode_linkfile(outpath, context_fq_links) + + def run_x_mode(self, sampleidlist, donorvcfs, donorbams, acceptorbam, genomereference, outdir, varconout, + variantlist): + """Runs VaSeBuilder X-mode. + + This run mode only produces the variant contexts and writes the associated output files. This mode does not + create or write validation fastq files. + + Parameters + ---------- + sampleidlist : list of str + Names/identifiers of samples to process + donorvcfs : dict + Donor variant files per sample + donorbams : dict + Donor alignment files per sample + acceptorbam : str + Path to alignment file to use as acceptor + genomereference : str + Path to genome reference fasta file + outdir : str + Path to write output files to + varconout : str + Path to folder to write the variant context file to + variantlist : dict + Variants to process per sample + """ + self.vaselogger.info("Running VaSeBuilder X-mode") + self.build_varcon_set(sampleidlist, donorvcfs, donorbams, acceptorbam, outdir, genomereference, varconout, + variantlist) + + # =====SPLITTING THE BUILD_VARCON_SET() INTO MULTIPLE SMALLER METHODS===== + def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath, reference_loc, varcon_outpath, + variantlist): + """Builds and returns a variant context file. + + The variant context file is build from the provided variants and acceptor and donors alignment files. + + Parameters + ---------- + sampleidlist : list of str + Sample name/identifier + vcfsamplemap : dict + Variant files per sample + bamsamplemap : dict + Alignment files per sample + acceptorbamloc : str + Path to alignment file to use as acceptor + outpath : str + Path to folder to write output files to + reference_loc : str + Path to the genomic reference fasta file + varcon_outpath : str + Path and name to write variant context file to + variantlist : dict + Variants to use per sample + + Returns + ------- + variantcontexts : VariantContextFile + Established variant contexts + """ + donor_vcfs_used = [] + donor_bams_used = [] + variantcontexts = VariantContextFile() + + try: + acceptorbamfile = pysam.AlignmentFile(acceptorbamloc, reference_filename=reference_loc) + except IOError: + self.vaselogger.critical("Could not open Acceptor BAM/CRAM") + exit() + + # Start iterating over the samples + for sampleid in sampleidlist: + self.vaselogger.debug(f"Start processing sample {sampleid}") + sample_variant_filter = self.bvcs_set_variant_filter(sampleid, variantlist) + samplevariants = self.get_sample_vcf_variants(vcfsamplemap[sampleid], sample_variant_filter) + + if not samplevariants: + self.vaselogger.warning(f"No variants obtained for sample {sampleid}. Skipping sample") + continue + + # Call the method that will process the sample + self.bvcs_process_sample(sampleid, variantcontexts, acceptorbamfile, bamsamplemap[sampleid], + reference_loc, samplevariants) + + # Add the used donor VCF and BAM to the lists of used VCF and BAM files + donor_bams_used.append(vcfsamplemap[sampleid]) + donor_vcfs_used.append(bamsamplemap[sampleid]) + + # Check if there are no variant contexts. + if variantcontexts.get_number_of_contexts() <= 0: + self.vaselogger.info("No variant contexts were created. No output files will be written.") + return None + + # Write the output variant context output data + self.bvcs_write_output_files(outpath, varcon_outpath, variantcontexts, vcfsamplemap, bamsamplemap, + donor_vcfs_used, donor_bams_used) + + # Checks whether the program is running on debug and, if so, write some extra output files. + if self.vaselogger.getEffectiveLevel() == 10: + self.write_optional_output_files(outpath, variantcontexts) + return variantcontexts + + def bvcs_process_sample(self, sampleid, variantcontextfile, abamfile, dbamfileloc, referenceloc, samplevariants): + """Processes a sample and adds variant contexts to a variant context file. + + Parameters + ---------- + sampleid : str + Sample name/identifier + variantcontextfile : VariantContextFile + Variant context file that saves variant contexts + abamfile: pysam.AlignmentFile + Already opened pysam AlignmentFile to use as acceptor + dbamfileloc: str + Path to the alignment file to use as donor + referenceloc: str + Path to the genomic reference fasta file + samplevariants : list of VcfVariants + Variants to process for the specified sample + """ + try: + donorbamfile = pysam.AlignmentFile(dbamfileloc, reference_filename=referenceloc) + except IOError: + self.vaselogger.warning(f"Could not open {dbamfileloc} ; Skipping {sampleid}") + return + + # Iterate over the sample variants + for samplevariant in samplevariants: + variantcontext = self.bvcs_process_variant(sampleid, variantcontextfile, samplevariant, abamfile, + donorbamfile, True) + if not variantcontext: + self.vaselogger.info(f"Could not establish variant context ; Skipping.") + continue + + # If this widest context overlaps an existing variant context, skip it. + if self.contexts.context_collision(variantcontext.get_context()): + self.vaselogger.debug(f"Variant context {variantcontext.get_variant_context_id()} overlaps with an" + f"already existing variant context; Skipping.") + continue + variantcontextfile.add_existing_variant_context(variantcontext.get_variant_context_id(), variantcontext) + + # Processes a sample variant by establishing the contexts. + def bvcs_process_variant(self, sampleid, variantcontextfile, samplevariant, abamfile, dbamfile, write_unm=False): + """Processes a variant and returns the established variant context. + + Parameters + ---------- + sampleid : str + Sample name/identifier + variantcontextfile: VariantContextFile + Variant context file that saves the variant contexts + samplevariant : VcfVariant + Variant to process and for which to establish an accpetor, donor and variant context + abamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile to use as acceptor + dbamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile to use as donor + write_unm : bool + Whether to save read identifiers with unmapped read mates + + Returns + ------- + vcontext : VariantContext or None + The established variant context, None if context could not be established + """ + # Establish the donor context + variantid = samplevariant.get_variant_id() + variantchrom = samplevariant.get_variant_chrom() + variantpos = samplevariant.get_variant_pos() + varianttype = samplevariant.get_variant_type() + + self.vaselogger.debug(f"Processing variant {variantid}.") + self.debug_msg("vw", variantid) + self.vaselogger.debug(f"Variant {variantid} determined to be {varianttype}") + searchwindow = self.determine_read_search_window(varianttype, samplevariant) + self.vaselogger.debug(f"Search window determined to be {variantchrom}:{searchwindow[0]+1}-{searchwindow[1]}") + + # Check whether the variant overlaps with an existing variant context + if variantcontextfile.variant_is_in_context(varianttype, variantchrom, *searchwindow): + self.vaselogger.debug(f"Variant {variantid} is located in an existing context.") + return None + + # Determine the donor context. + self.debug_msg("dc", variantid) + t0 = time.time() + dcontext = self.bvcs_establish_context(sampleid, variantid, variantchrom, variantpos, searchwindow, dbamfile, + write_unm) + if not dcontext: + self.vaselogger.info(f"Could not establish donor context ; Skipping variant {variantid}") + return None + self.debug_msg("dc", variantid, t0) + self.vaselogger.debug(f"Donor context determined to be {dcontext.get_context_chrom()}:" + f"{dcontext.get_context_start()}-{dcontext.get_context_end()}") + + # Determine the acceptor context. + self.debug_msg("ac", variantid) + t0 = time.time() + acontext = self.bvcs_establish_context(sampleid, variantid, variantchrom, variantpos, searchwindow, abamfile, + write_unm, dcontext.get_context()) + self.debug_msg("ac", variantid, t0) + self.vaselogger.debug(f"Acceptor context determined to be {acontext.get_context_chrom()}:" + f"{acontext.get_context_start()}-{acontext.get_context_end()}") + + # Determin the variant context. + self.debug_msg("cc", variantid) + t0 = time.time() + vcontext = self.bvcs_establish_variant_context(sampleid, variantcontextfile, variantid, variantchrom, + variantpos, acontext, dcontext, abamfile, dbamfile, write_unm) + self.debug_msg("cc", variantid, t0) + self.vaselogger.debug(f"Combined context determined to be {vcontext.get_variant_context_chrom()}:" + f"{vcontext.get_variant_context_start()}-{vcontext.get_variant_context_end()}") + return vcontext + + # Establishes a variant context from an acceptor and donor context and fetches acceptor and donor reads again. + def bvcs_establish_variant_context(self, sampleid, variantcontextfile, variantid, variantchrom, variantpos, + acontext, dcontext, abamfile, dbamfile, write_unm=False): + """Establishes and returns a variant context. + + The variant context window is established based on the acceptor and donor contexts. Reads and mates overlapping + with the variant context are also fetched and added to the variant context. + + Parameters + ---------- + sampleid : str + Sample name/identifier + variantcontextfile : VariantContextFile + Variant context file that saves variant contexts + variantid : str + Variant identifier + variantchrom : str + Chromosome the + variantpos: int + Leftmost genomic position of the variant + acontext : OverlapContext + Established acceptor context + dcontext : OverlapContext + Established donor context + abamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile used as acceptor + dbamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile used as donor + write_unm : bool + Whether to save identifiers of reads with unmapped mates + + Returns + ------- + variant_context : VariantContext + Established variant context with all associated data + """ + unmapped_alist = [] + unmapped_dlist = [] + + # Determine the variant context from the acceptor and donor context + vcontext_window = self.determine_largest_context(variantpos, acontext.get_context(), dcontext.get_context()) + if variantcontextfile.context_collision(vcontext_window): + self.vaselogger.debug(f"Variant context {variantid} overlaps with an already existing context; Skipping.") + return None + + # Gather variant context donor reads. + self.debug_msg("cdr", variantid) + t0 = time.time() + vcontext_dreads = self.get_variant_reads(variantid, vcontext_window[0], vcontext_window[2], vcontext_window[3], + dbamfile, write_unm, unmapped_dlist) + self.debug_msg("cdr", variantid, t0) + + # Gather variant context acceptor reads. + self.debug_msg("car", variantid) + t0 = time.time() + vcontext_areads = self.get_variant_reads(variantid, vcontext_window[0], vcontext_window[2], vcontext_window[3], + abamfile, write_unm, unmapped_alist) + self.debug_msg("car", variantid, t0) + + variant_context = VariantContext(variantid, sampleid, *vcontext_window, vcontext_areads, vcontext_dreads, + acontext, dcontext) + + # Set the variant context unmapped read ids + variant_context.set_unmapped_acceptor_mate_ids(unmapped_alist) + variant_context.set_unmapped_donor_mate_ids(unmapped_dlist) + return variant_context + + # Establishes an acceptor/donor context by fetching reads (and their mates) overlapping directly with the variant + def bvcs_establish_context(self, sampleid, variantid, variantchrom, variantpos, searchwindow, bamfile, + write_unm=False, + fallback_window=None): + """Establishes and returns an acceptor/donor context. + + The context window is established from fetched reads and their mates overlapping with the variant. + + Parameters + ---------- + sampleid : str + Sample name/identifier + variantid : str + Variant indentifier + variantchrom : str + Chromosome name the variant is located on + variantpos : int + Leftmost genomic position of the variant + searchwindow: list of int + Sreach window to use for fetching reads + bamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile + write_unm: bool + Save the read identifiers with unmapped mates + fallback_window : list + Window to set if no window could be set/determined + + Returns + ------- + OverlapContext or None + Acceptor/donor context if context is established, None if not + """ + unmappedlist = [] + + # Fetch context reads and establish the context window + self.vaselogger.debug("Fetching reads.") + t0 = time.time() + context_reads = self.get_variant_reads(variantid, variantchrom, *searchwindow, bamfile, write_unm, unmappedlist) + self.vaselogger.debug(f"Fetching reads took {time.time() - t0} seconds") + + if not context_reads: + self.vaselogger.debug("No reads were found.") + context_window = self.determine_context(context_reads, variantpos, variantchrom) + + # Check whether the context_window is valid and, if not, whether a fallback window has been set + if not context_window: + self.vaselogger.debug("Context window could not be determined.") + if fallback_window: + context_window = fallback_window + self.vaselogger.debug("Context window is set to the fall back window.") + else: + return None + + # Create the context object containing all context data + adcontext = OverlapContext(variantid, sampleid, *context_window, context_reads) + adcontext.set_unmapped_mate_ids(unmappedlist) + return adcontext + + # Sets the variant list for a sample if applicable, otherwise return None + def bvcs_set_variant_filter(self, sampleid, variant_list): + """Sets the variant list for a sample if applicable, otherwise return None + + Parameters + ---------- + sampleid : str + Sample name/identifier + variant_list : dict + Variant tuples with chromosome name and genomic position per sample + + Returns + ------- + list of tuple + Variants tuples with chromosome name and position + """ + sample_variant_filter = None + if variant_list is not None: + if sampleid in variant_list: + sample_variant_filter = variant_list[sampleid] + else: + sample_variant_filter = [] + return sample_variant_filter + + def bvcs_write_output_files(self, outpath, varcon_outpath, variantcontextfile, vcfsamplemap, bamsamplemap, + used_donor_vcfs, used_donor_bams): + """Writes VaSeBuilder output files. + + These output files are the standard output files when building a variant context file. Output files include + the variant context file, variant context statistics file, variant context BED file, used donor variant files + and used donor alignment files. + + Parameters + ---------- + outpath: str + Path to folder to write the output files to + varcon_outpath : str + Path to write variant context file to + variantcontextfile : VariantContextFile + Variant contexts to write to file + vcfsamplemap : dict + Variant files per sample + bamsamplemap : dict + Alignment files per sample + used_donor_vcfs : list of str + Donor variant files used to create the variant contexts + used_donor_bams : list of str + Donor alignment files used to create the variant contexts + """ + # Write the variant context file itself + self.vaselogger.info(f"Writing variant contexts to {varcon_outpath}") + variantcontextfile.write_variant_context_file(varcon_outpath, self.creation_id) + + # Write the variant context statistics file + self.vaselogger.info(f"Writing variant context statistics to {outpath}varconstats.txt") + variantcontextfile.write_variant_context_stats(f"{outpath}varconstats.txt", self.creation_id) + + # Write the variant contexts as a BED file + self.vaselogger.info(f"Writing the variant contexts to a BED file") + self.write_bed_file(variantcontextfile.get_variant_contexts(), f"{outpath}variantcontexts.bed", + self.creation_id) + + # Write a listfile of the used variant (VCF/BCF) files + self.vaselogger.info(f"Writing the used donor variant files per sample to {outpath}donor_variant_files.txt") + self.write_used_donor_files(f"{outpath}donorvcfs.txt", vcfsamplemap, used_donor_vcfs, self.creation_id) + + # Write a listfile of the used alignment (BAM/CRAM) files + self.vaselogger.info(f"Writing the used donor alignment files per sample to {outpath}donor_alignment_files.txt") + self.write_used_donor_files(f"{outpath}donor_alignment_files.txt", bamsamplemap, used_donor_bams, + self.creation_id) + + def write_pmode_linkfile(self, outpath, context_fqs_links): + """Writes the link from variant context identifier to fastq output files for P-mode. + + This output file is meant to link a VaSeBuilder run with fastq output files. + + Parameters + ---------- + outpath : str + Path to output folder to write the link file to. + context_fqs_links : dict of str: list + Fastq files per variant context identifier + """ + plinkloc = f"{outpath}plink_{self.creation_id}.txt" + try: + with open(plinkloc, "w") as plinkfile: + plinkfile.write(f"#VBUUID: {self.creation_id}\n") + for contextid, fqfiles in context_fqs_links.items(): + plinkfile.write(f"{contextid}\t{fqfiles[0]}\t{fqfiles[1]}\n") + except IOError: + self.vaselogger.warning(f"Could not write P-mode link file {plinkloc} for run {self.creation_id}") + + def shuffle_donor_read_identifiers(self, donorreads, s=2): + """Shuffles and returns a list of donor read identifiers. + + Prior to shuffling the donor read identifiers, the provided donor read list is copied to preserve the original. + The new list is then sorted and a seed is set to ensure that the shuffling can be reproduced if hte same data + is used. + + Parameters + ---------- + donorreads : + Donor read identifiers to shuffle + s: int + Seed to set for shuffling (default = 2) + + Returns + ------- + shuffled_donor_reads : list of str + Shuffled donor read identifiers + """ + shuffled_donor_reads = donorreads.copy() + shuffled_donor_reads.sort() + + random.seed(s) + random.shuffle(shuffled_donor_reads) + return shuffled_donor_reads + + def shuffle_donor_add_positions(self, num_of_template_reads, num_of_donor_reads, s=2): + """Shuffle positions to + + Parameters + ---------- + num_of_template_reads : int + Number of template reads in the acceptor + num_of_donor_reads : int + Number of donor reads to be added + s : int + Seed to set for shuffling (default = 2) + + Returns + ------- + add_positions : list of int + Shuffled positions in fastq file to add donor reads to + """ + random.seed(s) + self.vaselogger.debug(f"Semi random donor add positions seed set to {s}") + add_positions = [] + + # Check whether the number of donor reads to add exceeds the number of acceptor reads in the template. + if num_of_donor_reads > num_of_template_reads: + self.vaselogger.debug("Found more donor reads to add than") + random_sample_size = num_of_donor_reads + while random_sample_size > 0: + pos_to_add = [] + if random_sample_size >= num_of_template_reads: + pos_to_add = random.sample(range(0, num_of_template_reads), num_of_template_reads) + else: + pos_to_add = random.sample(range(0, ), random_sample_size) + add_positions.extend(pos_to_add) + random_sample_size = random_sample_size - num_of_template_reads + return add_positions + add_positions = random.sample(range(0, num_of_template_reads), num_of_donor_reads) + return add_positions + + def write_donor_output_bam(self, bamoutpath, donorreads, acceptorbam): + """Writes a set of donor reads as a bam file. + + Parameters + ---------- + bamoutpath : str + Path and name to write the output to + donorreads : list of DonorBamRead + Donor reads to place in the output BAM file + acceptorbam + """ + print("Constructing BAM file") + + def read_is_hard_clipped(self, fetchedread): + """Returns whether the provided read is hard-clipped + + Parameters + ---------- + fetchedread : pysam.AlignedSegment + Read fetched from alignment file with pysam + + Returns + ------- + bool + True if read is hardclipped, False if not or read has no cigar string + """ + if fetchedread.cigarstring is not None: + return "H" in fetchedread.cigarstring + return False + + def check_template_size(self, templatefqloc): + """Returns the number of reads in the template fastq file. + + The returned number is divided by 4 as each read entry consists of four lines. + + Parameters + ---------- + templatefqloc : str + Path to acceptor fastq file to check + + Returns + ------- + int + Number of read entries in the template fastq file + """ + line_count = 0 + with gzip.open(templatefqloc, "r") as templatefq: + for fileline in templatefq: + line_count += 1 + return int(line_count/4) + + def build_fastq_v2(self, acceptorfq_filepaths, acceptorreads_toskip, donor_context_reads, forward_or_reverse, + vasefq_outpath, random_seed, donor_read_insert_data): + """Builds and writes a set of validation fastq files. + + A set of validation fastq files is build using a set of template/acceptor fastq files. Acceptor reads will be + filtered from these file via read identifier and donor reads will be added. Donor readsd reads will be added at + semi random positions. The random.seed() method is used to ensure reproducibility as using the same data (in + the same order) and the same seed results in the same shuffle. + + Parameters + ---------- + acceptorfq_filepaths : list of str + Paths to template fastq files to use + acceptorreads_toskip : list of str + Identifiers of acceptor reads to skip + donor_context_reads : list of tuple + Donor reads to add to the validation fastq files + forward_or_reverse : str + Write forward ('1') or reverse ('2') + vasefq_outpath : + Path to write VaSeBuilder validation fastq files to + """ + # Split all donor reads to add over the template fastq files + donor_read_ids = [x[0] for x in donor_context_reads] + donor_read_ids.sort() + donor_read_ids = list(set(donor_read_ids)) + self.vaselogger.debug(f"Donor read ids: {donor_read_ids}") + distributed_read_ids = self.divide_donorfastqs_over_acceptors(donor_read_ids, len(acceptorfq_filepaths)) + self.vaselogger.debug(f"Distributed read ids: {distributed_read_ids}") + + # Iterate over the R1/R2 fastq in files to use as templates for the + for x in range(0, len(acceptorfq_filepaths)): + # Collect the donor reads to write. + add_donor_ids = distributed_read_ids[x] + self.vaselogger.debug(f"Distributed read ids: {add_donor_ids}") + add_donor_reads = [x for x in donor_context_reads if x[0] in add_donor_ids] + self.vaselogger.debug(f"Will add {len(add_donor_ids)} donor reads") + + # Write the new VaSe FastQ file. + vasefq_outname = self.set_fastq_out_path(vasefq_outpath, forward_or_reverse, x + 1) + self.vaselogger.debug(f"Set FastQ output path to: {vasefq_outname}") + + # Check whether to add the fastq file name into the donor read insert position map + if vasefq_outname.split(".")[0][:-3] not in donor_read_insert_data: + donor_read_insert_data[vasefq_outname.split(".")[0][:-3]] = {} + + self.write_vase_fastq_v2(acceptorfq_filepaths[x], vasefq_outname, + acceptorreads_toskip, add_donor_reads, add_donor_ids, + forward_or_reverse, random_seed, donor_read_insert_data) + + # Builds a new FastQ file to be used for validation. + def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, + acceptorreads_toskip, donorbamreaddata, + donor_readids, fr, random_seed, donor_read_insert_data): + """Creates and writes a single VaSeBuilder validation fastq file. + + Parameters + ---------- + acceptor_infq : str + Path to + fastq_outpath : str + Path to write VaSeBuilder vaildation fastq file to + acceptorreads_toskip : list of str + Acceptor read identifiers to exclude from the validation fastq file + donorbamreaddata : list of tuple + Donor reads to writes + fr : str + Forward('1') or reverse ('2') fastq file + """ + fastq_prefix = fastq_outpath.split(".")[0][:-3] + try: + fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb")) + self.vaselogger.debug(f"Writing data to validation fastq {fastq_outpath}") + + cur_line_index = 0 # Current read position in the template fastq + cur_add_index = 0 # Current read position in the validation fastq + + # Determine where to semi randomly add the donor reads in the fastq + num_of_template_reads = self.check_template_size(acceptor_infq) + self.vaselogger.debug(f"Template has {num_of_template_reads} reads") + donor_add_positions = self.shuffle_donor_add_positions(num_of_template_reads, len(donor_readids), + random_seed) + # self.vaselogger.debug(f"Add positions for {fastq_outpath} = {donor_add_positions}") + donor_reads_to_addpos = self.link_donor_addpos_reads_v2(donor_add_positions, donor_readids, + donorbamreaddata) + # self.vaselogger.debug(f"Read to add pos for {fastq_outpath} = {donor_reads_to_addpos}") + + # Open the template fastq and write filtered data to a new fastq.gz file. + fqgz_infile = io.BufferedReader(gzip.open(acceptor_infq, "rb")) + self.vaselogger.debug(f"Opened template FastQ: {acceptor_infq}") + for fileline in fqgz_infile: + + # Check if we are located at a read identifier. + if fileline.startswith(b"@"): + if fileline.decode("utf-8").split()[0][1:] not in acceptorreads_toskip: + fqgz_outfile.write(fileline) + fqgz_outfile.write(next(fqgz_infile)) + fqgz_outfile.write(next(fqgz_infile)) + fqgz_outfile.write(next(fqgz_infile)) + cur_add_index += 1 + else: + self.vaselogger.debug(f"Skipping acceptor read {fileline}") + + # Check if we need to add a donor read at the current position + if cur_line_index in donor_reads_to_addpos: + # self.vaselogger.debug(f"{cur_line_index} is in list of positions to add donor") + for donorread in donor_reads_to_addpos[cur_line_index]: + if donorread[1] == fr: + fqlines = ("@" + str(donorread[0]) + "\n" + + str(donorread[2]) + "\n" + + "+\n" + + str(donorread[3]) + "\n") + fqgz_outfile.write(fqlines.encode("utf-8")) + cur_add_index += 1 + self.vaselogger.debug(f"Added donor read {donorread[0]}/{donorread[1]} at " + f"{cur_add_index}") + self.add_donor_insert_data(fastq_prefix, donorread[0], fr, cur_add_index, + donor_read_insert_data) + cur_line_index += 1 + fqgz_infile.close() + + fqgz_outfile.flush() + fqgz_outfile.close() + + except IOError as ioe: + if ioe.filename == acceptor_infq: + self.vaselogger.critical("The supplied template FastQ file " + "could not be found.") + if ioe.filename == fastq_outpath: + self.vaselogger.critical("A FastQ file could not be written " + "to the provided output location.") + exit() + + def link_donor_addpos_reads(self, donor_addpos, donor_reads): + """Links and returns donor add positions and donor reads. + + Parameters + ---------- + donor_addpos : list of int + Positions in validation fastq to add donor reads at + donor_reads : list of tuple + Donor reads to add to validation fastq + + Returns + ------- + add_posread_link : dict of list + Donor reads to add per add position + """ + add_posread_link = {} + for addpos, dread in zip(donor_addpos, donor_reads): + if addpos not in add_posread_link: + add_posread_link[addpos] = [] + add_posread_link[addpos].append(dread) + # self.vaselogger.debug(f"Added {dread[0]}/{dread[1]} at insert pos {addpos}") + return add_posread_link + + def link_donor_addpos_reads_v2(self, donor_addpos, donor_read_ids, donor_reads): + """Links and returns donor add positions and donor reads. + + Parameters + ---------- + donor_addpos : list of int + Positions in validation fastq to add donor reads at + donor_reads : list of tuple + Donor reads to add to validation fastq + + Returns + ------- + add_posread_link : dict of list + Donor reads to add per add position + """ + add_posread_link = {} + for addpos, dread_id in zip(donor_addpos, donor_read_ids): + if addpos not in add_posread_link: + add_posread_link[addpos] = [] + add_posread_link[addpos].extend([x for x in donor_reads if x[0] == dread_id]) + return add_posread_link + + def read_donor_fastq(self, donor_fastq, forward_reverse, donor_read_data): + """Reads and saves the donor fastq reads. + + Parameters + ---------- + donor_fastq : str + Path to the donor fastq file to read + forward_reverse : str + Reading forward (R1) or reverse (R2) donor read data + donor_read_data : dict + Dictionary saving the donor read data + + Returns + ------- + donor_read_data : dict + Updated dictionary containing donor read data + """ + try: + with gzip.open(donor_fastq, 'rt') as dfqfile: + for dfqfileline in dfqfile: + if dfqfileline.startswith("@"): + dfq_readid = dfqfileline[1:].strip() + dfq_readseq = next(dfqfile).strip() + dfq_optline = next(dfqfile).strip() + dfq_readqual = next(dfqfile).strip() + if dfq_readid not in donor_read_data: + donor_read_data[dfq_readid] = [] + donor_read_data[dfq_readid].append((dfq_readid, forward_reverse, dfq_readseq, dfq_readqual)) + except IOError: + self.vaselogger.debug(f"Could not read donor fastq file {donor_fastq}") + finally: + return donor_read_data + + # Temporary new AC-mode method to test semi random read distribution + def run_ac_mode_v2(self, afq1_in, afq2_in, dfqs, varconfile, random_seed, outpath): + self.vaselogger.info("Running VaSeBuilder AC-mode") + # Split the donor fastqs into an R1 and R2 group + r1_dfqs = [dfq[0] for dfq in dfqs] + r2_dfqs = [dfq[1] for dfq in dfqs] + + # Get the list of acceptor reads to exclude + skip_list = varconfile.get_all_variant_context_acceptor_read_ids() + skip_list.sort() + + # Read all the donor read fastq data + donor_reads = {} + for fr_i, dfq_i in zip(["1", "2"], [r1_dfqs, r2_dfqs]): + for x in range(len(dfq_i)): + donor_reads = self.read_donor_fastq(dfq_i[x], fr_i, donor_reads) + # self.vaselogger.debug(f"ACmodeV2 Donor reads: {donor_reads}") + + # Divide the donor reads equally over the template fastq files + donor_read_ids = list(donor_reads.keys()) + donor_read_ids.sort() + # self.vaselogger.debug(f"ACmodeV2 dread ids: {donor_read_ids}") + + # Distribute the read identifiers over the fastq files + distributed_donor_read_ids = self.divide_donorfastqs_over_acceptors(donor_read_ids, len(afq1_in)) + # self.vaselogger.debug(f"ACmodeV2 distr dread ids: {distributed_donor_read_ids}") + + # Iterate over the acceptor fastq files + donor_read_add_data = {} + for fr_i, afq_i in zip(["1", "2"], [afq1_in, afq2_in]): + self.build_fastqs_from_donors_v2(afq_i, skip_list, distributed_donor_read_ids, donor_reads, fr_i, + random_seed, outpath, donor_read_add_data) + self.write_donor_insert_positions_v2(donor_read_add_data, f"{outpath}_donor_read_insert_positions.txt") + + # Temporary new build_fastqs_from_donors() to test semi random read distribution in AC-mode + def build_fastqs_from_donors_v2(self, acceptor_fqsin, acceptor_reads_to_exclude, distributed_donor_reads, + donor_reads, forward_reverse, random_seed, outpath, donor_read_insert_data): + """Builds a set of R1/R2 validation fastq files using existing donor fastq files. + + Parameters + ---------- + acceptor_fqsin: list of str + Template/Acceptor fastq files to use + acceptor_reads_to_exclude: list of str + distributed_donor_reads : list of list of str + donor_reads: dict + forward_reverse: str + Whether to construct an R1 or R2 fastq file + random_seed: int + Random seed to use for shuffling donor add positions + outpath: str + Path top write produced fastq file to + donor_read_insert_data: + """ + for x in range(len(acceptor_fqsin)): + fqoutname = self.set_fastq_out_path(outpath, forward_reverse, x + 1) + self.vaselogger.debug(f"bfmdV2 xdistr dread ids: {distributed_donor_reads[x]}") + + # Add the fastq file entry to the insert positions map + if fqoutname.split(".")[0][:-3] not in donor_read_insert_data: + donor_read_insert_data[fqoutname.split(".")[0][:-3]] = {} + + # Filter the donor reads specific to the template file + selected_donor_reads = [donor_reads[y] for y in distributed_donor_reads[x]] + donor_reads_to_add = [] + [donor_reads_to_add.extend(x) for x in selected_donor_reads] + + # self.vaselogger.debug(f"bfmdV2 dread ids to add: {donor_reads_to_add}") + self.write_vase_fastq_v2(acceptor_fqsin[x], fqoutname, acceptor_reads_to_exclude, donor_reads_to_add, + distributed_donor_reads[x], forward_reverse, random_seed, donor_read_insert_data) + + def write_donor_insert_positions_v2(self, inserted_position_data, outpath): + """Writes the insert positions for each set of reads. + + Insert positions are written per read identifier. + + Parameters + ---------- + inserted_position_data : dict + Position data + outpath : str + Path and name to write the donor read insert position data to + """ + try: + with open(outpath, "w") as ipd_outfile: + ipd_outfile.write(f"#VBUUID: {self.creation_id}\n") + ipd_outfile.write("ReadId\tR1_InsertPos\tFastqR1Out\tR2_InsertPos\tFastqR2Out\n") + + # Iterate over the donor read insert position data and write to file + for fastqout in inserted_position_data: + readid_list = [x for x in inserted_position_data[fastqout]] + readid_list.sort() + + # Iterate over each read identifier for the specified output R1 and R2 fastq set. + for readid in readid_list: + r1_insertpos = self.get_saved_insert_position("1", inserted_position_data[fastqout][readid]) + r2_insertpos = self.get_saved_insert_position("2", inserted_position_data[fastqout][readid]) + + ipd_outfile.write(f"{readid}\t{r1_insertpos}\t{fastqout}_R1.fastq\t{r2_insertpos}\t" + f"{fastqout}_R2.fastq\n") + except IOError: + self.vaselogger.warning(f"Could not write donor insert position data to {outpath}") + + def get_saved_insert_position(self, readpn, read_insert_data): + """Returns the insert position of a read, or 'NA' if not available. + + Parameters + ---------- + readpn : str + The readpair number ('1' or '2') + read_insert_data : tuple of str and int + The read insert data + + Returns + ------- + int or str + The insert position, 'NA' if not available + """ + if readpn in read_insert_data: + readpn_index = read_insert_data.index(readpn) + if readpn_index < len(read_insert_data)-1: + return read_insert_data[read_insert_data.index(readpn)+1] + return "NA" + + def add_donor_insert_data(self, fqoutname, readid, forward_reverse, insertpos, donor_insert_data): + """ + + Parameters + ---------- + fqoutname : str + Name of fastq file + readid : str + Read identifier for which to save the insert position + forward_reverse : str + Indicator whether the read is R1 or R2 + insertpos : int + Position the donor read was inserted at in the validation fastq + donor_insert_data: dict + Saved donor read insertions into validation fastq + """ + if fqoutname in donor_insert_data: + if readid in donor_insert_data[fqoutname]: + if forward_reverse not in donor_insert_data[fqoutname][readid]: + donor_insert_data[fqoutname][readid] = donor_insert_data[fqoutname][readid] + \ + (forward_reverse, insertpos) + else: + donor_insert_data[fqoutname][readid] = (forward_reverse, insertpos) diff --git a/VaSeUtils/SubsetVcfByVariantContext.py b/VaSeUtils/SubsetVcfByVariantContext.py new file mode 100644 index 0000000..9168293 --- /dev/null +++ b/VaSeUtils/SubsetVcfByVariantContext.py @@ -0,0 +1,11 @@ +import logging +from VariantContextFile import VariantContextFile + +class SubsetVcfByVariantContext: + def __init__(self): + print("aap") + + def main(self, varconinloc): + varconfile = VariantContextFile(varconinloc) + + diff --git a/VaSeUtils/VaSeUtilHelper.py b/VaSeUtils/VaSeUtilHelper.py index f0f33c2..be02d67 100644 --- a/VaSeUtils/VaSeUtilHelper.py +++ b/VaSeUtils/VaSeUtilHelper.py @@ -6,6 +6,39 @@ class VaSeUtilHelper: def __init__(self): self.vaseutillogger = logging.getLogger("VaSeUtil_Logger") + self.parameter_map = {"-m": "RUNMODE", + "--runmode": "RUNMODE", + "-v": "DONORVCF", + "--donorvcf": "DONORVCF", + "-b": "DONORBAM", + "--donorbam": "DONORBAM", + "-a": "ACCEPTORBAM", + "--acceptorbam": "ACCEPTORBAM", + "-1": "TEMPLATEFQ1", + "--templatefq1": "TEMPLATEFQ1", + "-2": "TEMPLATEFQ2", + "--templatefq2": "TEMPLATEFQ2", + "-o": "OUT", + "--out": "OUT", + "-r": "REFERENCE", + "--reference": "REFERENCE", + "-of": "", + "--fastqout": "", + "-ov": "", + "--varcon": "", + "-l": "LOG", + "--log": "LOG", + "-!": "DEBUG", + "-vl": "VARIANTLIST", + "--variantlist": "VARIANTLIST", + "-iv": "VARCONIN", + "--varconin": "VARCONIN", + "-dq": "DONORFASTQS", + "--donorfastqs": "DONORFASTQS", + "-c": "CONFIG", + "--config": "CONFIG", + "-s": "SEED", + "--seed": "SEED"} # Returns whether something is in the filter or not def passes_filter(self, valtocheck, filterlist): @@ -116,3 +149,45 @@ def get_read_pair_num(self, pysam_bamread): if pysam_bamread.is_read1: return "1" return "2" + + def determine_variant_type(self, vcfvariantref, vcfvariantalts): + """Determines and returns the + + Parameters + ---------- + vcfvariantref : str + Variant reference allele(s) + vcfvariantalts : tuple of str + Variant alternative allele(s) + + Returns + ------- + str + Type of variant (snp/indel) + """ + maxreflength = max([len(x) for x in vcfvariantref.split(",")]) + maxaltlength = max([len(x) for x in vcfvariantalts]) + + # Check based on the reference and alternative lengths whether the variant is a SNP or indel. + if maxreflength == 1 and maxaltlength == 1: + return "snp" + elif maxreflength > 1 or maxaltlength > 1: + return "indel" + return "?" + + def get_config_param_name(self, paramflag): + """Returns the config parameter name for a parameter flag. + + Parmeters + --------- + paramflag : str + Parameter flag to obtain config parameter + + Returns + ------- + str + Config parameter name if parameter flag is in map, empty string otherwise + """ + if paramflag in self.parameter_map: + return self.parameter_map[paramflag] + return "" diff --git a/VaSeUtils/VaSeUtilRunner.py b/VaSeUtils/VaSeUtilRunner.py new file mode 100644 index 0000000..73bc9c8 --- /dev/null +++ b/VaSeUtils/VaSeUtilRunner.py @@ -0,0 +1,113 @@ +import os +from VariantContextFile import VariantContextFile +from VaSeUtilHelper import VaSeUtilHelper + + +class VaSeUtilRunner: + def __init__(self): + self.vuh = VaSeUtilHelper() + + def check_donor_files_exist(self, donorlistfile, samplefilter=None): + """Checks whether the donor files used in a VaSeBuilder run still exist. + + Parameters + ---------- + donorlistfile : str + File with list of used donor files n VaSeBuilder run + samplefilter : list of str + Filter with sample names + """ + missing_count = 0 + total_count = 0 + try: + with open(donorlistfile, "r") as dlfile: + next(dlfile) # Skip the header line of the file + for fileline in dlfile: + filelinedata = fileline.strip().split("\t") + donorfiles = filelinedata[1].split(",") + + # Check whether the used donor files exist + if self.vuh.passes_filter(filelinedata[0], samplefilter): + for dfile in donorfiles: + total_count += 1 + if not os.path.isfile(dfile): + missing_count += 1 + print(f"Donor file {dfile} for sample {filelinedata[0]} could not be found. Maybe it " + "has been moved, renamed or deleted?") + print(f"Found {total_count - missing_count}/{total_count} donor files.") + except IOError: + print(f"Could not open {donorlistfile}") + finally: + print("Finished running VaSe util CheckDonorFilesExist") + + def combine_variant_context_file(self): + print("aap") + + def generate_config_file_from_command(self, vasebuilder_command): + print("aap") + + def log_info(self, vaselogloc, logfilter): + """Displays log entries satisfying the set log level filter. + + Parameters + ---------- + vaselogloc: + Path to VaSeBuilder log file + logfilter: list of str + Filter with log level(s) + """ + try: + with open(vaselogloc, 'r') as vaselogfile: + for fileline in vaselogfile: + fileline_elements = fileline.split("\t") + if len(fileline_elements) >= 3: + if self.vuh.passes_filter(fileline_elements[2], logfilter): + print(fileline.strip()) + except IOError as ioe: + print(f"Could not open log file") + + def subset_vcf(self, varconfile, vcffileloc): + """Subsets a single VCF file. The filtered entries are written to a new file. + + varconfile : VariantContextFile + Variant contexts to use as filter + vcffileloc : str + Path to the VCF file to filter + """ + tmp_data = vcffileloc.split(".") + out_name = f"{tmp_data[0]}.varconfiltered." + ".".join(tmp_data[1:]) + try: + vcf_file = pysam.VariantFile(vcffileloc, "r") + filtered_vcf_file = pysam.VariantFile(out_name, "w", header=vcf_file.header) + + # Iterate over the VCF file that to be filtered nd write variants overlapping with a context + for vcfvariant in vcf_file.fetch(): + varianttype = self.vuh.determine_variant_type(vcfvariant.ref, vcfvariant.alts) + max_ref = max(len(x) for x in vcfvariant.ref.split(",")) + max_alt = max(len(x) for x in vcfvariant.alts) + var_endpos = max([max_ref, max_alt]) + if varconfile.variant_is_in_context(varianttype, vcfvariant.chrom, vcfvariant.start-1, + vcfvariant.stop+1): + filtered_vcf_file.write(vcfvariant) + + filtered_vcf_file.close() + vcf_file.close() + except IOError: + print(f"Could not process variant file {vcffileloc}") + + def subset_acceptor_vcf(self): + print("aap") + + def subset_vcf_by_variant_contexts(self, variantcontextfile, vcffile_list): + """Subsets a filter of VCF files using a variant context files. + + Parameters + ---------- + variantcontextfile : VariantContextFile + Variant context file to use as filter + vcffile_list : list of str + List of VCF files to filter + """ + varconfile = VariantContextFile(variantcontextfile) + for vcffile in vcffile_list: + self.subset_vcf(varconfile, vcffile) diff --git a/VariantContext.py b/VariantContext.py index 467350c..ef0d0c5 100644 --- a/VariantContext.py +++ b/VariantContext.py @@ -3,64 +3,173 @@ class VariantContext: - # Sets the variant context data. + """Saves a variant context and allows data to be obtained and calculated. + + Attributes + ---------- + context_id : str + Identifier of the context + sample_id : str + The name/identifier of the donor sample used to construct the variant context + variant_context_chrom : str + Chromosome name the context is lcoated on + variant_context_origin : int + Variant position the context is constructed from + variant_context_start : int + Leftmost genomic position of the variant context + variant_context_end : int + Rightmost genomic position of the variant context + variant_context_areads : list of DonorBamRead + Acceptor reads associated with the variant context + variant_context_dreads : list of DonorBamRead + Donor reads associated with the variant context + variant_acceptor_context : OverlapContext + Acceptor context used to construct the variant context + variant_donor_context : OverlapContext + Donor context used to construct the variant context + unmapped_donor_mate_ids : list of str + Identifiers of reads with an unmapped mate + """ + def __init__(self, varconid, sampleid, varconchrom, varconorigin, varconstart, varconend, acceptorreads, donorreads, acceptor_context=None, donor_context=None): + """Saves the provided variant context data. + + Acceptor and donor contexts are optional when creating the variant context. + + Parameters + ---------- + varconid : str + Variant context identifier + sampleid : str + Sample name/identifier + varconchrom : str + Chromosome name the variant context is located on + varconorigin: + Variant position the context is constructed from + varconstart : int + Leftmost genomic position of the variant context + varconend : int + Rightmost genomic position of the variant context + acceptorreads : list of DonorBamRead + Variant context acceptor reads + donorreads : list of DonorBamRead + Variant context donor reads + acceptor_context : OverlapContext + Acceptor context associated with the variant context + donor_context : OverlapContext + Donor context associated with the variant context + """ self.context_id = varconid self.sample_id = sampleid self.variant_context_chrom = varconchrom self.variant_context_origin = int(varconorigin) self.variant_context_start = int(varconstart) self.variant_context_end = int(varconend) - # Saves the acceptor reads that overlap with the entire variant - # context. + # Saves the acceptor reads that overlap with the entire variant context. self.variant_context_areads = acceptorreads - # Saves the donor reads that overlap with the entire variant - # context. + # Saves the donor reads that overlap with the entire variant context. self.variant_context_dreads = donorreads - # Saves the acceptor context (this is the context from acceptor - # reads overlapping with the variant itself). + # Saves the acceptor context (this is the context from acceptor reads overlapping with the variant itself). self.variant_acceptor_context = acceptor_context - # Saves the donor context (this is the context from donor reads - # overlapping with the variant itself). + # Saves the donor context (this is the context from donor reads overlapping with the variant itself). self.variant_donor_context = donor_context self.unmapped_acceptor_mate_ids = [] self.unmapped_donor_mate_ids = [] # ===METHODS TO OBTAIN DATA FROM THE VARIANT CONTEXT DATA================== - # Returns the variant context identifier. + def get_context(self): + """Returns the essential data of the variant context. + + Returns + ------- + list of str and int + Variant context window + """ + return [self.variant_context_chrom, self.variant_context_origin, self.variant_context_start, + self.variant_context_end] + def get_variant_context_id(self): + """Returns the variant context identifier. + + Returns + ------- + self.context_id : str + Variant context identifier + """ return self.context_id - # Returns the variant context sample. def get_variant_context_sample(self): + """Returns the name/identifier of the sample + + Returns + ------- + self.sample_id : str + Sample name/identifier + """ return self.sample_id - # Returns the variant context chromosome. def get_variant_context_chrom(self): + """Returns the chromosome name the variant context is located on. + + Returns + ------- + self.variant_context_chrom : str + Chromosome name of the variant context + """ return self.variant_context_chrom - # Returns the variant context origin. def get_variant_context_origin(self): + """Returns the genomic position of the variant used to construct the variant context. + + Returns + ------- + self.variant_context_origin : int + Variant genomic position + """ return self.variant_context_origin - # Returns the variant context start position. def get_variant_context_start(self): + """Returns the leftmost genomic position of the variant context. + + Returns + ------- + self.variant_context_start + Variant context starting position + """ return self.variant_context_start - # Returns the variant context end position. def get_variant_context_end(self): + """Returns the variant context end position. + + Returns + ------- + self.variant_context_end : int + Variant context rightmost genomic position + """ return self.variant_context_end - # Returns the variant context acceptor reads. def get_acceptor_reads(self): + """Returns the variant context acceptor reads. + + Returns + ------- + self.variant_context_areads : list of DonorBamRead + Variant context acceptor reads + """ return self.variant_context_areads - # Returns the variant context donor reads. def get_donor_reads(self): + """Returns the variant context donor reads. + + Returns + ------- + self.variant_context_dreads : list of DonorBamRead + Variant context acceptor reads + """ return self.variant_context_dreads # TODO: New thing for -P mode. @@ -73,130 +182,264 @@ def get_donor_read_strings(self): dbr.get_bam_read_qual())) return list(set(donorreads)) - # Returns the acceptor context (the context from acceptor reads - # overlapping the variant itself). def get_acceptor_context(self): + """Returns the acceptor context used to construct the variant context. + + Returns + ------- + self.variant_acceptor_context : OverlapContext + Acceptor context associated with the variant context + """ return self.variant_acceptor_context - # Returns the donor context (the context from donor reads - # overlapping the variant itself). def get_donor_context(self): + """Returns the donor context used to construct the variant context. + + Returns + ------- + self.variant_donor_context : OverlapContext + Donor context associated with the variant context + """ return self.variant_donor_context - # Returns a list of acceptor reads overlapping with the variant - # context. def get_unmapped_acceptor_mate_ids(self): + """Returns the identifiers of variant context acceptor reads that have an unmapped mate. + + Returns + ------- + self.unmapped_acceptor_mate_ids : list of str + Variant context acceptor read identifiers with unmapped mates + """ return self.unmapped_acceptor_mate_ids - # Returns a list of donor reads overlapping with the variant - # context. + # Returns a list of donor reads overlapping with the variant context. def get_unmapped_donor_mate_ids(self): + """Returns the identifiers of variant context donor reads with an unmapped mate. + + Returns + ------- + self.unmapped_donor_mate_ids : list of str + Variant contexct donor read identifiers with an unmapped mate + """ return self.unmapped_donor_mate_ids - # ===METHODS TO GET DATA (REQUIRING SOME CALCULATING) OF THE=============== - # ===VARIANT CONTEXT=== - # Returns the variant context length. + # ===METHODS TO GET DATA (REQUIRING SOME CALCULATING) OF THE VARIANT CONTEXT=============== def get_variant_context_length(self): + """Returns the length of the variant context. + + The length if determined by subtracting the leftmost genomic (start) position from the rightmost genomic (end) + position. + + Returns + ------- + int + Variant context length + """ return abs(self.variant_context_end - self.variant_context_start) - # Returns the distance of the variant context start position from - # the variant context origin. def get_start_distance_from_origin(self): + """Calculates and returns the variant context start from the variant position that constructed it. + + Returns + ------- + int + Distance between variant context leftmost genomic and variant position. + """ return abs(self.variant_context_origin - self.variant_context_start) - # Returns the distance of the variant context end position from the - # variant context origin. def get_end_distance_from_origin(self): + """Calculates and returns the variant context end from the variant position tha constructed it. + + Returns + ------- + int + Distance between variant context rightmost genomic and variant position + """ return abs(self.variant_context_end - self.variant_context_origin) # ===METHODS TO OBTAIN VARIANT CONTEXT ACCEPTOR READ DATA================== - # Returns the number of variant context acceptor reads. def get_number_of_acceptor_reads(self): + """Returns the number of variant context acceptor reads. + + Returns + ------- + int + Number of variant context acceptor reads + """ if self.variant_context_areads is None: return 0 return len(self.variant_context_areads) - # Returns the identifiers of acceptor reads overlapping with the - # variant context. def get_acceptor_read_ids(self): + """Returns the variant context acceptor read identifiers + + Returns + ------- + list of str or None + Variant context acceptor read identifiers, None if there are no acceptor reads + """ if self.variant_context_areads is None: return [None] return list(set([x.get_bam_read_id() for x in self.variant_context_areads])) - # Returns the list of left most acceptor read positions, def get_acceptor_read_starts(self): + """Returns the variant context acceptor read starting positions + + Returns + ------- + list of int or None + Variant context acceptor read leftmost genomic positions, None if there are no acceptor reads + """ if self.variant_context_areads is None: return [None] return [x.get_bam_read_ref_pos() for x in self.variant_context_areads] - # Returns a list of the left most positions of the R1 variant - # context accecptor BAM reads. def get_acceptor_read_left_positions(self): + """Returns the leftmost genomic positions of all variant context R1 acceptor reads. + + Returns + ------- + list of int or None + Variant context R1 acceptor read leftmost genomic positions, None of there are no acceptor reads + """ if self.variant_context_areads is None: return [None] return [x.get_bam_read_ref_pos() for x in self.variant_context_areads if x.is_read1()] - # Returns the list of all end positions for all variant context - # acceptor reads. def get_acceptor_read_ends(self): + """Returns the variant context acceptor read rightmost positions. + + Returns + ------- + list of int or None + Variant context R2 acceptor read rightmost genomic positions, None if there are no acceptor reads + """ if self.variant_context_areads is None: return [None] return [x.get_bam_read_ref_end() for x in self.variant_context_areads] - # Returns a list of the right most positions ofr the R2 variant - # context acceptor BAM read. def get_acceptor_read_right_positions(self): + """Returns the rightmost genomic positions for all variant context R2 acceptor reads + + Returns + ------- + list of int or None + Variant context + """ if self.variant_context_areads is None: return [None] return [x.get_bam_read_ref_end() for x in self.variant_context_areads if x.is_read2()] # ===METHODS TO OBTAIN VARIANT CONTEXT DONOR READ DATA===================== - # Returns the number of variant context donor reads. def get_number_of_donor_reads(self): + """Returns the number of variant context donor reads. + + Returns + ------- + int + Number of variant context donor reads + """ return len(self.variant_context_dreads) - # Returns the identifiers of donor reads overlapping with the - # variant context. def get_donor_read_ids(self): + """Returns the identifiers of donor reads overlapping with the variant context. + + Returns + ------- + list of str + Variant context donor read identifiers + """ return list(set([x.get_bam_read_id() for x in self.variant_context_dreads])) - # Returns the list of variant context donor read starting positions. def get_donor_read_starts(self): + """Returns the list of variant context donor read starting positions. + + Returns + ------- + list of int + Variant context donor read leftmost genomic positions + """ return [x.get_bam_read_ref_pos() for x in self.variant_context_dreads] - # Returns the list of variant context donor read pairs left most - # positions (start pos of read 1). def get_donor_read_left_positions(self): + """Returns the list of variant context R1 donor read leftmost positions + + Returns + ------- + list of int + Variant context R1 donor read leftmost genomic positions + """ return [x.get_bam_read_ref_pos() for x in self.variant_context_dreads if (x.is_read1())] # Returns a list of all donor read ending positions def get_donor_read_ends(self): + """Returns variant context donor read rightmost positions. + + Returns + ------- + list of int + Variant context donor read rightmost genomic positions + """ return [x.get_bam_read_ref_end() for x in self.variant_context_dreads] - # Returns a list of all variant context donor reads right most - # positions (end pos of read 2). def get_donor_read_right_positions(self): + """Returns the list of variant context R2 donor reads + + Returns + ------- + list of int + Variant context R2 donor reads rightmost genomic positions + """ return [x.get_bam_read_ref_end() for x in self.variant_context_dreads if (x.is_read2())] # ===METHODS TO ADD DATA TO THE VARIANT CONTEXT============================ - # Sets the acceptor context of the variant context. def set_acceptor_context(self, acceptor_context): + """Sets the acceptor context of the variant context with the one provided. + + Parameters + ---------- + acceptor_context : OverlapContext + Acceptor context to add to the variant context + """ self.variant_acceptor_context = acceptor_context - # Sets the donor context of the variant context. def set_donor_context(self, donor_context): + """Sets the donor context of the variant context with the one provided. + + Parameters + ---------- + donor_context : OverlapContext + Donor context to add to the variant context + """ self.variant_donor_context = donor_context - # Adds an acceptor context to the variant context by creating it - # from data values. def add_acceptor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, acceptorreads): + """Sets the acceptor context of the variant context by constructing one from the provided parameters. + + Parameters + ---------- + contextid : str + Acceptor context identifier + sampleid : str + Sample name/identifier + contextchrom : str + Chromosome name the context is located on + contextorigin : int + Variant position the context is constructed from + contextstart : int + Leftmost genomic position of the acceptor context + contextend : int + Rightmost genomic position of the acceptor context + acceptorreads : list of DonorBamRead + Acceptor context reads + """ self.variant_acceptor_context = OverlapContext( contextid, sampleid, contextchrom, contextorigin, @@ -204,12 +447,29 @@ def add_acceptor_context(self, contextid, sampleid, acceptorreads ) - # Adds a donor context to the variant context by creating it from - # data values. def add_donor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, donorreads): + """Sets the donor context of the variant context by constructing one from the provided parameters. + + Parameters + ---------- + contextid : str + Donor context identifier + sampleid : str + Sample name/identifier + contextchrom : str + Chromosome name the context is located on + contextorigin : int + Variant genomic position the context is constructed from + contextstart : int + Leftmost genomic position of the donor context + contextend : int + Rightmost genomic position of the donor context + donorreads : list of DonorBamRead + Donor context reads + """ self.variant_donor_context = OverlapContext( contextid, sampleid, contextchrom, contextorigin, @@ -218,82 +478,189 @@ def add_donor_context(self, contextid, sampleid, ) # ===METHODS TO OBTAIN VARIANT CONTEXT UNMAPPED MATE READ DATA============= - # Returns the variant context acceptor read ids that have an - # unmapped mate. def get_unmapped_acceptor_read_ids(self): + """Returns the variant context acceptor read identifiers that have an unmapped mate. + + Returns + ------- + self.unmapped_acceptor_mate_ids : list of str + Variant context acceptor read identifiers with an unmapped mate + """ return self.unmapped_acceptor_mate_ids - # Returns the variant context donor read ids that have an unmapped - # mate. def get_unmapped_donor_read_ids(self): + """Returns the variant context donor read identifiers that have an unmapped mate. + + Returns + ------- + self.unmapped_donor_mate_ids : list of str + Variant context donor read identifiers with an unmapped mate + :return: + """ return self.unmapped_donor_mate_ids - # Adds a variant context appector mate identifier. def add_unmapped_acceptor_mate_id(self, mateid): + """Adds a variant context appector mate identifier. + + Parameters + ---------- + mateid : str + Variant context acceptor read identifier with an unmapped mate + """ self.unmapped_acceptor_mate_ids.append(mateid) - # Adds a variant context donor mate identifier. def add_unmapped_donor_mate_id(self, mateid): + """Adds a variant context donor mate identifier. + + Parameters + ---------- + mateid : str + Variant context donor read identifier with an unmapped mate + """ self.unmapped_donor_mate_ids.append(mateid) - # Sets the variant context unmapped acceptor mate ids. def set_unmapped_acceptor_mate_ids(self, mateids): + """Sets the variant context unmapped acceptor mate ids. + + Parameters + ---------- + mateids : list of str + Variant context acceptor read identifiers with unmapped mate + """ self.unmapped_acceptor_mate_ids = mateids - # Sets the variant context unmapped donor mate ids. def set_unmapped_donor_mate_ids(self, mateids): + """Sets the variant context unmapped donor mate ids. + + Parameters + ---------- + mateids : list of str + Variant context donor read identifiers with unmapped mates + """ self.unmapped_donor_mate_ids = mateids - # Returns whether a specified variant context acceptor read has an - # unmapped mate. def acceptor_read_has_unmapped_mate(self, readid): + """Returns whether a specified variant context acceptor read has an unmapped mate. + + Parameters + ---------- + readid : str + Acceptor read identifier + + Returns + ------- + bool + True if acceptor read has unmapped mate, False if not + """ return readid in self.unmapped_acceptor_mate_ids - # Returns whether a specified variant context donor read has an - # unmapped mate. def donor_read_has_unmapped_mate(self, readid): + """Checks and returns whether a specified variant context donor read has an unmapped mate. + + Parameters + ---------- + readid : str + Donor read identifier + + Returns + ------- + bool + True if donor read has unmapped mate, False if not + """ return readid in self.unmapped_donor_mate_ids - # Returns the number of variant context acceptor reads with an - # unmapped mate. def get_number_of_unmapped_acceptor_mates(self): + """Returns the number of variant context acceptor reads with an unmapped mate. + + Returns + ------- + int + Number of variant context acceptor reads with an unmapped mate. + """ return len(self.unmapped_acceptor_mate_ids) - # Returns the number of variant context donor reads with an - # unmapped mate. def get_number_of_unmapped_donor_mates(self): + """Returns the number of variant context donor reads with an unmapped mate. + + Returns + ------- + int + Number of variant context donor reads with an unmapped mate. + """ return len(self.unmapped_donor_mate_ids) # ===METHODS TO ADD UNMAPPED MATES TO THE ACCEPTOR AND DONOR CONTEXT======= - # Sets the unmapped mate ids for the acceptor context. def set_acceptor_context_unmapped_mates(self, mateids): + """Sets the unmapped mate ids for the acceptor context. + + Returns + ------- + mateids : list of str + Acceptor context read identifiers with an unmapped mate + """ self.variant_acceptor_context.set_unmapped_mate_ids(mateids) - # Adds an unmapped read id to the acceptor context. def add_acceptor_context_unmapped_mate(self, ureadid): + """Adds an unmapped read id to the acceptor context. + + Parameters + ---------- + ureadid : str + Acceptor context read identifier with an unmapped mate + """ self.variant_acceptor_context.add_unmapped_mate_id(ureadid) - # Sets the unmapped mate ids for the donor context. def set_donor_context_unmapped_mates(self, mateids): + """Sets the unmapped mate ids for the donor context. + + Parameters + ---------- + mateids : list of str + Donor context read identifiers with an unmapped mate + """ self.variant_donor_context.set_unmapped_mate_ids(mateids) - # Adds an unmapped read id to the donor context. - def add_donor_context_unmapped_mate_id(self, ureadid): - self.variant_donor_context.setUnmappedMateId(ureadid) + def add_donor_context_unmapped_mate(self, ureadid): + """Adds an unmapped read id to the donor context. + + Parameters + ---------- + ureadid : str + Donor context read identifier with an unmapped mate + """ + self.variant_donor_context.add_unmapped_mate_id(ureadid) # ===METHODS TO OBTAIN STATISTICS OF THE VARIANT CONTEXT=================== - # Returns the average and median quality of the acceptor reads - # associated with def get_average_and_median_acceptor_read_qual(self): + """Returns the average and median quality of the acceptor reads associated with. + + Returns + ------- + + """ return self.get_average_and_median_read_qual(self.variant_context_areads) - # Returns the average and median quality of the donor reads - # associated with this variant context. def get_average_and_median_donor_read_qual(self): + """Calculates and returns the mean and median variant context donor read Q-Score. + + Returns + ------- + list of int + Mean and median Q-Score + """ return self.get_average_and_median_read_qual(self.variant_context_dreads) - # Returns the average and median read quality. def get_average_and_median_read_qual(self, contextreads): + """Calculates and returns the mean and median read Q-Score. + + Parameters + ---------- + contextreads : list or reads + Reads to calculate mean and median Q-Score of + + Returns + ------- + """ if contextreads is not None: avgmedqual = [] for contextread in contextreads: @@ -302,18 +669,39 @@ def get_average_and_median_read_qual(self, contextreads): statistics.median(avgmedqual)]) return [None, None] - # Returns the average and median mapq values of the acceptor reads - # associated with this variant context. def get_average_and_median_acceptor_read_mapq(self): + """Calculates and returns the mean and median variant context acceptor read MAPQ value. + + Returns + ------- + list of int + Mean and median MAPQ, None if there are no acceptor reads + """ return self.get_average_and_median_read_mapq(self.variant_context_areads) - # Returns the average and median mapq values of the donor reads - # associated with this variant context. def get_average_and_median_donor_read_mapq(self): + """Calculates and returns the mean and median variant context donor read MAPQ value. + + Returns + ------- + list of int + Mean and median variant context donor read MAPQ, None if there are no donor reads + """ return self.get_average_and_median_read_mapq(self.variant_context_dreads) - # Returns the average and median read MapQ of this variant context. def get_average_and_median_read_mapq(self, contextreads): + """Calculates and returns the mean and median MAPQ value of provided reads. + + Parameters + ---------- + contextreads : list of DonorBamRead + Reads to calculate mean and median MAPQ of + + Returns + ------- + list of int + Mean and median read MAPQ, None if no reads provided + """ if contextreads is not None: avgmedmapq = [] for contextread in contextreads: @@ -322,18 +710,39 @@ def get_average_and_median_read_mapq(self, contextreads): statistics.median(avgmedmapq)]) return [None, None] - # Returns the average and median length of the acceptor reads - # associated with this variant context. def get_average_and_median_acceptor_read_length(self): + """Calculates and returns the mean and median variant context acceptor read length. + + Returns + ------- + list of int + Mean and median variant context acceptor read length, None if there are no acceptor reads + """ return self.get_average_and_median_read_length(self.variant_context_areads) - # Returns the average and median length of the donor reads - # associated with this variant context. def get_average_and_median_donor_read_length(self): + """Calculates and returns the mean and median variant context donor read length. + + Returns + ------- + list of int + Mean and median variant context donor read length, None if there are no donor reads + """ return self.get_average_and_median_read_length(self.variant_context_dreads) - # Returns the average and median read length. def get_average_and_median_read_length(self, contextreads): + """Calculates and returns the mean and median read length of a specified list of reads. + + Parameters + ---------- + contextreads : list of DonorBamReds + Reads to to calculate mean and median length of + + Returns + ------- + list of int + Mean and median read length, None if no reads are supplied + """ if contextreads is not None: avgmedlen = [] for contextread in contextreads: @@ -343,142 +752,335 @@ def get_average_and_median_read_length(self, contextreads): return [None, None] # ===METHODS TO OBTAIN ACCEPTOR CONTEXT DATA=============================== - # Returns whether the variant context has an acceptor context def has_acceptor_context(self): + """Returns whether the variant context has an acceptor context + + Returns + ------- + bool + True if variant context has an acceptor context, False if not + """ return self.variant_acceptor_context is not None - # Returns the acceptor context identifier (should be the same as the - # variant context id). def get_acceptor_context_id(self): + """Returns the acceptor context identifier. + + Returns + ------- + str + Acceptor context identifier + """ return self.variant_acceptor_context.get_context_id() - # Returns the acceptor context sample id (should be the same as the - # variant context sample id). def get_acceptor_context_sample_id(self): + """Returns the acceptor context sample id. + + Returns + ------- + str + Sample name/identifier of the acceptor context + """ return self.variant_acceptor_context.get_sample_id() - # Returns the chromosome of the acceptor context. def get_acceptor_context_chrom(self): + """Returns the chromosome name of the acceptor context. + + Returns + ------- + str + Chromosome name the acceptor context is located on + """ return self.variant_acceptor_context.get_context_chrom() - # Returns the origin position of the acceptor context. def get_acceptor_context_origin(self): + """Returns the origin position of the acceptor context. + + Returns + ------- + int + Variant genomic position the context is based on + """ return self.variant_acceptor_context.get_context_origin() - # Returns the starting position of the acceptor context. def get_acceptor_context_start(self): + """Returns the starting position of the acceptor context. + + Returns + ------- + int + Acceptor context leftmost genomic position + """ return self.variant_acceptor_context.get_context_start() - # Returns the ending position of the acceptor context. def get_acceptor_context_end(self): + """Returns the ending position of the acceptor context. + + Returns + ------- + int + Acceptor context rightmost genomic position + """ return self.variant_acceptor_context.get_context_end() - # Returns the length of the acceptor context. def get_acceptor_context_length(self): + """Returns the length of the acceptor context. + + Returns + ------- + int + Acceptor context length + """ return self.variant_acceptor_context.get_context_length() - # Returns the acceptor context reads. def get_acceptor_context_reads(self): + """Returns the acceptor context reads. + + Returns + ------- + list of DonorBamRead + Acceptor context reads + """ return self.variant_acceptor_context.get_context_bam_reads() - # Returns the lost of read ids in the acceptor context. def get_acceptor_context_read_ids(self): + """Returns the acceptor context read identifiers. + + Returns + ------- + list of str + Acceptor context read identifiers + """ return self.variant_acceptor_context.get_context_bam_read_ids() - # Returns a list of all acceptor context BAM read start positions. def get_acceptor_context_read_starts(self): + """Returns the leftmost genomic positions of the acceptor context reads + + Returns + ------- + """ return self.variant_acceptor_context.get_context_bam_read_starts() - # Returns a list of all acceptor context R1 BAM read left positons. def get_acceptor_context_read_left_positions(self): + """Returns the leftmost genomic positions of all R1 acceptor context reads + + Returns + ------- + list of int + Acceptor context leftmost genomic R1 read positions + """ return self.variant_acceptor_context.get_context_bam_read_left_positions() - # Returns a list of all acceptor context BAM read end positions. def get_acceptor_context_read_ends(self): + """Returns the rightmost genomic positions of all acceptor context reads. + + Returns + ------- + list of int + Acceptor context rightmost genomic read positions. + """ return self.variant_acceptor_context.get_context_bam_read_ends() - # Returns a list of all acceptor context R2 BAM read end positions. def get_acceptor_context_read_right_positions(self): + """Returns a list of all acceptor context R2 BAM read end positions. + + Returns + ------- + list of int + Acceptor context rightmost genomic R2 read positions + """ return self.variant_acceptor_context.get_context_bam_read_right_positions() - # Returns a list of all acceptor context BAM read lengths. def get_acceptor_context_read_lengths(self): + """Returns the lengths of the acceptor context reads. + + Returns + ------- + list of int + Acceptor context read lengths + """ return self.variant_acceptor_context.get_context_bam_read_lengths() - # Returns the list of acceptor context unmapped mate read ids def get_acceptor_context_unmapped_mate_ids(self): + """Returns the acceptor context read identifiers with unmapped mates. + + Returns + ------- + list of str + Acceptor context read identifiers with unmapped mates. + """ return self.variant_acceptor_context.get_unmapped_read_mate_ids() # ===METHODS TO OBTAIN DONOR CONTEXT DATA================================== - # Returns whether the variant context has a donor context def has_donor_context(self): + """Checks and returns whether the variant context has a donor context saved. + + Returns + ------- + bool + True if the variant context has a donor context, False if not + """ return self.variant_donor_context is not None - # Returns the donor context identifier (should be the same as the - # variant context id). def get_donor_context_id(self): + """Returns the donor context identifier. + + Returns + ------- + str + Donor context identifier + """ return self.variant_donor_context.get_context_id() - # Returns the acceptor context sample id (should be the same as the - # variant context sample id). def get_donor_context_sample_id(self): + """Returns the sample name/identifier of the donor context + + Returns + ------- + str + Donor context sample name/identifier + """ return self.variant_donor_context.get_sample_id() - # Returns the chromosome of the donor context. def get_donor_context_chrom(self): + """Returns the chromosome of the donor context. + + Returns + ------- + """ return self.variant_donor_context.get_context_chrom() - # Returns the origin position of the donor context. def get_donor_context_origin(self): + """Returns the origin position of the donor context. + + Returns + ------- + int + Variant genomic position that the context is constructed from. + """ return self.variant_donor_context.get_context_origin() - # Returns the starting position of the donor context. def get_donor_context_start(self): + """Returns the starting position of the donor context. + + Returns + ------- + int + Donor context leftmost genomic position + """ return self.variant_donor_context.get_context_start() - # Returns the ending position of the donor context. def get_donor_context_end(self): + """Returns the ending position of the donor context. + + Returns + ------- + int + Donor context rightmost genomic position + """ return self.variant_donor_context.get_context_end() - # Returns the length of the acceptor context. def get_donor_context_length(self): + """Returns the length of the donor context. + + Returns + ------- + int + Donor context length + """ return self.variant_donor_context.get_context_length() - # Returns the donor context reads. def get_donor_context_reads(self): + """Returns all donor context reads. + + Returns + ------- + list of DonorBamRead + Donor context reads + """ return self.variant_donor_context.get_context_bam_reads() - # Returns the donor context read identifers. def get_donor_context_read_ids(self): + """Returns the identifiers of all donor context reads. + + Returns + ------- + list of str + Donor context read identifiers + """ return self.variant_donor_context.get_context_bam_read_ids() - # Returns a list of donor context read start positions. def get_donor_context_read_starts(self): + """Returns the leftmost genomic read positions of all donor context reads. + + Returns + ------- + list of int + Donor context leftmost genomic read positions + """ return self.variant_donor_context.get_context_bam_read_starts() - # Returns a list of all acceptor context R1 BAM read left positons. def get_donor_context_read_left_positions(self): + """Returns the leftmost genomic positions of all R1 donor context reads. + + Returns + ------- + list of int + Donor context leftmost genomic R1 read positions + """ return self.variant_donor_context.get_context_bam_read_left_positions() - # Returns a list of donor context read end positions. def get_donor_context_read_ends(self): + """Returns the rightmost genomic positions of all donor context reads. + + Returns + ------- + list of int + Donor context rightmost genomic read positions + """ return self.variant_donor_context.get_context_bam_read_ends() - # Returns a list of all acceptor context R2 BAM read right positons. def get_donor_context_read_right_positions(self): + """Returns the rightmost genomic positions of all R2 donor context reads + + Returns + ------- + list of int + Donor context rightmost genomic R2 read positions + """ return self.variant_donor_context.get_context_bam_read_right_positions() - # Returns a list of donor context BAM read lengths. def get_donor_context_read_lengths(self): + """Returns the lengths of the donor context reads. + + Returns + ------- + list of int + Donor context read lengths + """ return self.variant_donor_context.get_context_bam_read_lengths() - # Returns the list of donor context unmapped mate read ids. def get_donor_context_unmapped_mate_ids(self): + """Returns the donor context read identifiers that have unmapped mates. + + Returns + ------- + list of str + Donor context read identifiers + """ return self.variant_donor_context.get_unmapped_read_mate_ids() # ===METHODS TO PRODUCE SOME OUTPUT ABOUT THE VARIANT CONTEXT============== - # Returns a varcon.txt string representation of the variant context. def to_string(self): + """Creates and returns the variant context as a String representation. + + The created String representation of the variant context is equal to the entry of a variant context file. Each + included data attribute is separated by a tab. + + Returns + ------- + str + Variant context as variant context file entry + """ if self.variant_context_areads is None: ad_ratio = "N/A" list_areads = None @@ -509,9 +1111,14 @@ def to_string(self): + str(list_areads) + "\t" + str(list_dreads)) - # Returns a varconstats.txt string representation of the variant - # context. def to_statistics_string(self): + """Returns a String with basic variant context statistics. + + Returns + ------- + str + Basic tab separated variant context statistics + """ areadlen = self.get_average_and_median_acceptor_read_length() dreadlen = self.get_average_and_median_donor_read_length() areadqual = self.get_average_and_median_acceptor_read_qual() diff --git a/VariantContextFile.py b/VariantContextFile.py index 9052e8d..ea90b38 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -5,6 +5,21 @@ class VariantContextFile: + """The VariantContextFile saves variant contexts + + Attributes + ------- + vaselogger + The logger displaying information of activities + variant_context_file_location : str + THe location of a read variant context file + variant_contexts : dict + Saves variant contexts by context identifier + variant_context_statistics + varcon_fields : dict + Number representation of each variant context data field + """ + def __init__(self, fileloc=None, samplefilter=None, varconfilter=None, chromfilter=None): self.vaselogger = logging.getLogger("VaSe_Logger") @@ -32,48 +47,126 @@ def __init__(self, fileloc=None, samplefilter=None, # ===METHODS TO GET DATA FROM THE VARIANT CONTEXT FILE===================== def get_variant_contexts(self, asdict=False): + """Returns the variant contexts. + + By default all variant contexts are gathered in a list and returned. If the asdict parameter is set t True, the + variant contexts are returned as a dictionary. + + Returns + ------- + list or dict + Variant context as list by default, as dict when parameter is set to True + """ if asdict: return self.variant_contexts return [varcon for varcon in self.variant_contexts.values()] - # Returns the variant contexts by sample identifier def get_variant_contexts_by_sampleid(self): + """Gathers variant contexts, sorts them by sample identifiers and returns them. + + Returns + ------- + dict + + """ varcons = self.get_variant_contexts() return {x.get_variant_context_sample(): [y for y in varcons if y.get_variant_context_sample() == x.get_variant_context_sample()] for x in varcons} - # Returns the number of contexts saved def get_number_of_contexts(self): + """Determines and returns the number of variant contexts. + + Returns + ------- + int + Number of variant contexts + """ return len(self.variant_contexts) - # Returns a list of context ids def get_variant_context_ids(self): + """Returns the variant context identifiers. + + Returns + ------- + list of str + Context identifiers of all variant contexts + """ return [x for x in self.variant_contexts.keys()] - # Returns a specified variant context. def get_variant_context(self, contextid): + """Checks whether a variant context exists and returns it if so. + + Parameters + ---------- + contextid : str + Identifier of context to obtain + + Returns + ------- + VariantContext or None + VariantContext if it exists, None if not + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid] return None - # Returns whether a variant context is present def has_variant_context(self, contextid): + """Checks and returns whether a specified variant context has a variant context. + + Parameters + ---------- + contextid : str + Context identifier to check + + Returns + ------- + bool + True if the variant context is present, False if not + """ return contextid in self.variant_contexts - # Returns the acceptor context of the specified variant context. def get_acceptor_context(self, contextid): + """Fetches and returns a specified acceptor context. + + Parameters + ---------- + contextid : str + Context identifier of acceptor context to return + + Returns + ------- + OverlapContext or None + The acceptor context if it exists, None if not + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_acceptor_context() return None - # Returns the donor context of the specified variant context. def get_donor_context(self, contextid): + """Fetches and returns a specified donor context. + + Parameters + ---------- + contextid : str + Context identifier of donor context to return + + Returns + ------- + OverlapContext or None + The donor context if it exists, None if not + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_donor_context() return None - # Returns all variant context acceptor reads. def get_all_variant_context_acceptor_reads(self): + """Gathers and returns the acceptor reads of all variant contexts + + Returns + ------- + list of DonorBamRead + Acceptor reads of all variant contexts + """ acceptorreads = [] for varcon in self.variant_contexts.values(): acceptorreads.extend(varcon.get_acceptor_reads()) @@ -95,6 +188,13 @@ def get_all_variant_context_acceptor_reads(self): # return uniqdonorreads # ============================================================================= def get_all_variant_context_donor_reads(self): + """Collects and returns all variant context donor reads. + + Returns + ------- + list of DonorBamRead + Variant context donor reads + """ dbrs = [] donorreads = [] for varcon in self.variant_contexts.values(): @@ -106,55 +206,133 @@ def get_all_variant_context_donor_reads(self): dbr.get_bam_read_qual())) return list(set(donorreads)) - # Returns all variant context acceptor read ids. def get_all_variant_context_acceptor_read_ids(self): + """Returns all variant context acceptor read identifiers. + + Returns + ------- + acceptorreadids : list of str + Variant context acceptor read identifiers + """ acceptorreadids = [] for varcon in self.variant_contexts.values(): acceptorreadids.extend(varcon.get_acceptor_read_ids()) return acceptorreadids - # Returns the variant context donor read ids. def get_all_variant_context_donor_read_ids(self): + """Returns all variant context donor read identifiers. + + Returns + ------- + donorreadids : list of str + Variant context donor read identifiers + """ donorreadids = [] for varcon in self.variant_contexts.values(): donorreadids.extend(varcon.get_donor_read_ids()) return donorreadids - # Returns the variant context field data def get_variant_context_fields(self): + """Returns the number representations of the variant context data. + + Returns + ------- + self.varcon_fields : dict + Number representation of each variant context field + """ return self.varcon_fields # ===METHODS TO OBTAIN VARIANT CONTEXT DATA============================= def get_variant_context_areads(self, contextid): + """Returns the variant context acceptor reads for a specified variant context. + + Parameters + ---------- + contextid : str + Variant context identifier + + Returns + ------- + list of DonorBamRead + Variant context acceptor reads, empty list of context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_acceptor_reads() return [] - # Returns the donor reads of a specified variant context def get_variant_context_dreads(self, contextid): + """Returns the variant context donor reads for a specified variant context. + + Parameters + ---------- + contextid : str + Variant context identifier + + Returns + ------- + list of DonorBamRead + Variant context donor reads, empty list if context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_donor_reads() return [] - # Returns the acceptor context reads def get_acceptor_context_reads(self, contextid): + """Returns the reads of a specified acceptor context. + + Parameters + ---------- + contextid : str + Acceptor context identifier + + Returns + ------- + list of DonorBamRead + Acceptor context reads + """ if contextid in self.variant_contexts: if self.variant_contexts[contextid].has_acceptor_context(): return self.variant_contexts[contextid].get_acceptor_context_reads() return [] - # Returns the reads of the donor context def get_donor_context_reads(self, contextid): + """Returns the reads of a specified donor context. + + Parameters + ---------- + contextid : str + Donor context identifier + + Returns + ------- + list of DonorBamRead + Donor context reads, empty list if context does not exist + """ if contextid in self.variant_contexts: if self.variant_contexts[contextid].has_donor_context(): return self.variant_contexts[contextid].get_donor_context_reads() return [] # ===BASIC VARIANTCONTEXTFILE METHODS====================================== - # Reads a provided variant context file and saves data according to - # set filters. def read_variant_context_file(self, fileloc, samplefilter=None, - IDfilter=None, chromfilter=None): + idfilter=None, chromfilter=None): + """Reads a provided variant context file and saves the data. + + Filter can be set for reading the variant context file. The sample filter can nbe used to specify which samples + to save. Samples not in the samplefilter will be skipped. Similarly filter for variant contexts and chromosome + names can be set. + + Parameters + ---------- + fileloc : str + Path to variant context file to read + samplefilter : list of str + Sample names/identifiers to include + idfilter : list of str + Variant contexts to include + chromfilter : list of str + Chromosome names to include + """ try: with open(fileloc, "r") as vcfile: varcon_records = vcfile.readlines() @@ -168,7 +346,7 @@ def read_variant_context_file(self, fileloc, samplefilter=None, if record[0] in self.variant_contexts: continue - IDpass = self.passes_filter(record[0], IDfilter) + IDpass = self.passes_filter(record[0], idfilter) samplepass = self.passes_filter(record[1], samplefilter) chrompass = self.passes_filter(record[2], chromfilter) if not (IDpass and samplepass and chrompass): @@ -179,8 +357,20 @@ def read_variant_context_file(self, fileloc, samplefilter=None, new_varcon = VariantContext(*record[:6], acceptor_reads, donor_reads) self.variant_contexts[record[0]] = new_varcon - # Reads an acceptor context file def read_acceptor_context_file(self, accconfileloc, samplefilter=None, contextfilter=None, chromfilter=None): + """Reads a provided acceptor context file and adds the acceptor contexts to already existing variant contexts. + + Parameters + ---------- + accconfileloc : str + Path to the acceptor contexts file + samplefilter : list of str + Sample names/identifiers to include + contextfilter : list of str + Acceptor contexts to include + chromfilter : list of str + Chromosome names to include + """ try: with open(accconfileloc, "r") as accconfile: next(accconfile) # Skip the header line @@ -205,6 +395,19 @@ def read_acceptor_context_file(self, accconfileloc, samplefilter=None, contextfi # Reads a donor context file def read_donor_context_file(self, donconfileloc, samplefilter=None, contextfilter=None, chromfilter=None): + """Reads a provided donor contexts file and adds the donor contexts to already existing variant contexts. + + Parameters + ---------- + donconfileloc : str + Path to the donr contexts file + samplefilter : list of str + Sample names/identifiers to include + contextfilter : list of str + Donor contexts to include + chromfilter : list of str + Chromosome names to include + """ try: with open(donconfileloc, "r") as donconfile: next(donconfile) # Skip the header line @@ -227,45 +430,121 @@ def read_donor_context_file(self, donconfileloc, samplefilter=None, contextfilte except IOError: self.vaselogger.warning(f"Could not read donor context file: {donconfileloc}") - # Returns whether something is in the filter or not. def passes_filter(self, valtocheck, filterlist): + """Tests and returns whether a provided value is in a provided filter. + + Filters are expected to be inclusion filters, denoting a list of values that should be used/included. + + Parameters + ---------- + valtocheck : str + Value to check against a filter list + filterlist : list of str + Values to filter with + + Returns + ------- + bool + True if value is in filter, False otherwise + """ if filterlist is not None: return valtocheck in filterlist return True # ===VARIANT CONTEXT SET OPERATIONS (UNION, INTERSECT, DIFFERENCE)========= - # Returns the list of variant context identifiers from both variant - # context files. def get_variant_contexts_union(self, other_varcon_file): + """Returns all variant context identifiers from both variant context files. + + Parameters + ---------- + other_varcon_file : VariantContextFile + A VariantContextFile object with VariantContexts + + Returns + ------- + list of str + All variant context identifiers of both VariantContextFile objects + """ own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) | set(other_varcon_ids)) - # Returns the list of variant context identifiers present in both - # variant context files. def get_variant_contexts_intersect(self, other_varcon_file): + """Returns variant context identifiers present in both variant context files. + + Parameters + ---------- + other_varcon_file : VariantContextFile + VariantContextFile with VariantContext objects + + Returns + ------- + list of str + Shared variant context identifiers + """ own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) & set(other_varcon_ids)) - # Returns the list of variant context identifiers in this file but - # not present in the other variant context file. def get_variant_contexts_difference(self, other_varcon_file): + """Returns the variant context identifiers not present in the other variant context file. + + Parameters + ---------- + other_varcon_file : VariantContextFile + VariantContextFile with variant contexts + + Returns + ------- + list of str + Variant context identifiers not present in the other variant context file + """ own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) - set(other_varcon_ids)) - # Returns the list of variant context identifiers only present in - # one of the variant context file but not the other. def get_variant_contexts_symmetric_difference(self, other_varcon_file): + """Determines the variant context identifiers present in either one or the other variant context file. + + THe normal difference only returns the context identifiers in A but not in B. The symmetric difference returns + the context identifiers in A but not in B as well as context identifiers in B but not in A. + + Parameters + ---------- + other_varcon_file : VariantContextFile + + Returns + ------- + list of str + Variant context read identifiers + """ own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) ^ set(other_varcon_ids)) - # ===METHODS TO OBTAIN VARIANT CONTEXT DATA BASED ON FILTERS=============== - # Returns a list/hashmap of VariantContextObjects. + # ===METHODS TO OBTAIN VARIANT CONTEXT DATA BASED ON FILTERS===============. def get_variant_contexts2(self, aslist=False, varconfilter=None, samplefilter=None, chromfilter=None): + """Returns the variant contexts in the VariantContextFile. + + Inclusion filters can be set to subset the returned variant contexts. FIlters include sample names/identifiers, + variant context identifiers and chromosome names. Variant contexts can be returned as list or dictionary. + + Parameters + ---------- + aslist : bool + Return variant context in a list + varconfilter : list of str + Contexts to include + samplefilter : list of str + Samples to include + chromfilter : list of str + + Returns + ------- + list or dict of VariantContext + Returns variant contexts as list if aslist set to True, dict otherwise + """ if aslist: return [ x for x in self.variant_contexts.values() @@ -286,9 +565,30 @@ def get_variant_contexts2(self, aslist=False, varconfilter=None, } # ====METHODS TO ASSESS WHETHER A VARIANT IS IN AN EXISTING CONTEXT======== - # Main method that returns whether a variant (SNP or indel). def variant_is_in_context(self, varianttype, searchchrom, searchstart, searchstop): + """Returns whether a variant is located in an already existing context. + + If the variant type is set to 'snp' the method will check and return whether the SNP overlaps with a variant + context. If the variant is set to 'indel' the method will check and return wheterh the area from start to end + of the indel overlaps with a variant context. + + Parameters + ---------- + varianttype : str + Type of variant (snp/indel) + searchchrom : str + Chromosome name the variant is located on + searchstart : int + Leftmost genomic search window position + searchstop : int + Rightmost genomic search window position + + Returns + ------- + bool or None + True if the variant overlaps with a context, False if not + """ if varianttype == "snp": return self.snp_variant_is_in_context(searchchrom, searchstart) if varianttype == "indel": @@ -296,9 +596,25 @@ def variant_is_in_context(self, varianttype, searchchrom, searchstop) return None - # Determines whether an SNP variant is located in an already - # existing variant context. + # Determines whether an SNP variant is located in an already existing variant context. def snp_variant_is_in_context(self, varchrom, vcfvarpos): + """Checks and returns whether a SNP is located in an already existing variant context. + + For each variant context it is first checked whether the chromosome name of the context and SNP are the same. If + so, it is then checked whether the SNP genomic position is equal or larger than the context start and smaller or + equal than the context end. + + Parameters + ---------- + varchrom : str + Chromosome name the SNP is located on + vcfvarpos : int + Genomic position of the SNP + Returns + ------- + bool + True if the SNP is in a variant context, False if not + """ for varcon in self.variant_contexts.values(): if varchrom == varcon.get_variant_context_chrom(): if (vcfvarpos >= varcon.get_variant_context_start() @@ -306,10 +622,26 @@ def snp_variant_is_in_context(self, varchrom, vcfvarpos): return True return False - # Determines whether an indel variant is located within an existing - # variant context (indelLeftPos and indelRightPos can be the used - # search window). def indel_variant_is_in_context(self, indelchrom, indelleftpos, indelrightpos): + """Checks and returns whether an indel is located in an already existing variant context. + + For each variant context it is first checked whether the chromosome name of the context and indel are the same. + If so, it is than checked whether the indel range from indel start to end overlaps with the context. + + :Parameters + ----------- + indelchrom : str + The chromosome name the indel is located on + indelleftpos : int + The leftmost genomic position of the indel + indelrightpos : int + The rightmost genomic position of the indel + + Returns + ------- + bool + True if the indel overlaps with a variant context, False if not + """ for varcon in self.variant_contexts.values(): if (indelchrom == varcon.get_variant_context_chrom()): if (indelleftpos <= varcon.get_variant_context_start() @@ -323,8 +655,19 @@ def indel_variant_is_in_context(self, indelchrom, indelleftpos, indelrightpos): return True return False - # Checks whether a context is located in an already existing context def context_collision(self, context_arr): + """Checks and returns whether a potential context overlaps with an already existing variant context. + + Parameters + ---------- + context_arr: list + Essential context data (chrom, variant pos, start, end) + + Returns + ------- + bool + True if provided context overlaps with an already existing context, False if not + """ if f"{context_arr[0]}_{context_arr[1]}" in self.variant_contexts: return True else: @@ -336,16 +679,49 @@ def context_collision(self, context_arr): return False # ===METHODS TO ADD DATA/VARIANT CONTEXTS TO THE VARIANT CONTEXT FILE====== - # Sets a variant context object to a specified context id. def set_variant_context(self, varconid, varcontext): + """Sets a provided variant context to the provided context identifier. Will overwrite the previous value for + the provided context identifier. + + Parameters + ---------- + varconid : str + Identifier of the variant context to add + varcontext: VariantContext + The VariantContext to set + """ self.variant_contexts[varconid] = varcontext - # Adds a variant context object def add_variant_context(self, varconid, sampleid, varconchrom, varconorigin, varconstart, varconend, varcon_areads, varcon_dreads, acceptor_context=None, donor_context=None): + """Constructs a VariantContext from the provided parameters and adds it. + + Parameters + ---------- + varconid : str + Identifier of the variant context + sampleid : str + Identifier of the sample + varconchrom : str + Chromosome name of the variant context + varconorigin : int + The variant position the context is based on + varconstart : int + Leftmost genomic position of the variant context + varconend : int + Rightmost genomic position of the variant context + varcon_areads : list of DonorBamRead + Variant context acceptor reads + varcon_dreads : list of DonorBamRead + Variant contect donor reads + acceptor_context : OverlapContext + Acceptor context belonging to the variant context + donor_context : OverlapContext + Donor context belonging to the variant context + """ varcon_obj = VariantContext(varconid, sampleid, varconchrom, varconorigin, varconstart, varconend, @@ -353,17 +729,58 @@ def add_variant_context(self, varconid, sampleid, acceptor_context, donor_context) self.variant_contexts[varconid] = varcon_obj - # Adds an acceptor context object to a variant context. + def add_existing_variant_context(self, varconid, varconobj): + """Add an already created variant context to the variant context file. + + The variant context to add should be a valid VariantContext object. It will be added to the variant context + file under the context identifier. + + Parameters + ---------- + varconid : str + Identifier of the context + varconobj : VariantContext + The created variant context + """ + if varconobj is not None: + self.variant_contexts[varconid] = varconobj + def set_acceptor_context(self, varconid, acceptor_context): + """Adds an existing acceptor context to a variant context. + + Parameters + ---------- + varconid : str + Context identifier + acceptor_context : OverlapContext + The acceptor context to add + """ if varconid in self.variant_contexts: self.variant_contexts[varconid].set_acceptor_context(acceptor_context) - # Add a newly created acceptor context to an existing variant - # context. def add_acceptor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, acceptorreads): + """Constructs and adds an acceptor context to a specified variant context. + + Parameters + ---------- + contextid : str + Context identifier + sampleid: str + Sample name/identifier + contextchrom: str + Chromosome name the context is located on + contextorigin: int + Variant genomic position the context is based on + contextstart: int + Leftmost genomic position of the context + contextend: int + Rightmost genomic position of the context + acceptorreads: list of DonorBamRead + List of reads associated with the context + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].add_acceptor_context( contextid, sampleid, @@ -371,16 +788,42 @@ def add_acceptor_context(self, contextid, sampleid, contextstart, contextend, acceptorreads) - # Sets the donor context object of a variant context def set_donor_context(self, varconid, donor_context): + """Adds an already existing donor context to a specified variant context. + + Parameters + ---------- + varconid : str + Context identifier + donor_context : OverlapContext + Donor context to add + """ if varconid in self.variant_contexts: self.variant_contexts[varconid].set_donor_context(donor_context) - # Add a newly created donor context to an existing variant context. def add_donor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, donorreads): + """Constructs a donor context from the provided parameters and adds it to a specified variant context. + + Parameters + ---------- + contextid : str + Variant context identifier + sampleid : str + Sample name/identifier + contextchrom : str + Chromosome name the context is located on + contextorigin : int + Variant position that constructed the context + contextstart : int + Donor context leftmost genomic position + contextend : int + Donor context rightmost genomic position + donorreads : list of DonorBamRead + Donor context reads + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].add_donor_context( contextid, sampleid, @@ -388,95 +831,221 @@ def add_donor_context(self, contextid, sampleid, contextstart, contextend, donorreads) - # ===METHODS TO ADD UNMAPPED MATE IDS TO ACCEPTOR, DONOR AND VARIANT======= - # ===CONTEXT=== - # Sets the unmapped mate read id for a specified acceptor context. + # ===METHODS TO ADD UNMAPPED MATE IDS TO ACCEPTOR, DONOR AND VARIANT CONTEXT======= def set_acceptor_context_unmapped_mate_ids(self, contextid, mateids): + """Sets the read identifiers that have unmapped read mates fo a specified acceptor context. + + Parameters + ---------- + contextid : str + Acceptor context to set unmapped read identifiers for + mateids : lit of str + Read identifiers with unmapped mates + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].set_acceptor_context_unmapped_mates(mateids) - # Sets the unmapped mate read id for a specified donor context. def set_donor_context_unmapped_mate_ids(self, contextid, mateids): + """Sets the read identifiers that have unmapped read mates for a specified donor context. + + Parameters + ---------- + contextid : str + Donor context to set unmapped read identifiers for + mateids : list of str + Donor context read identifiers with unmapped mates + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].set_donor_context_unmapped_mates(mateids) - # Sets the acceptor unmapped mate ids for a specified variant - # context. def set_unmapped_acceptor_mate_ids(self, contextid, mateids): + """Sets the acceptor read identifiers that have unmapped mates for a specified variant context. + + Parameters + ---------- + contextid : str + Variant contexct identifier to set unmapped acceptor read identifers for + mateids : list of str + Variant context acceptor read identifiers with unmapped mates + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].set_unmapped_acceptor_mate_ids(mateids) - # Sets the donor unmapped mate ids for a specified variant context. def set_unmapped_donor_mate_ids(self, contextid, mateids): + """Sets the donor read identifiers that have unmapped mates for a specified variant context. + + Parameters + ---------- + contextid : str + Variant context identifier to set unmapped donor read identifiers for + mateids: list of str + Variant context donor read identifiers with unmapped mates + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].set_unmapped_donor_mate_ids(mateids) - # ===METHODS TO GET UNMAPPED MATE IDS OF AN ACCEPTOR, DONOR AND============ - # ===VARIANT CONTEXT=== - # Return the unmapped read mate identifiers of a specified acceptor - # context. + # ===METHODS TO GET UNMAPPED MATE IDS OF AN ACCEPTOR, DONOR AND VARIANT CONTEXT============ def get_acceptor_context_unmapped_mate_ids(self, contextid): + """Returns the acceptor context read identifiers that have unmapped read mates. + + Parameters + ---------- + contextid : str + Acceptor context identifier + + Returns + ------- + list of str + Acceptor context read identifiers, empty list if context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_acceptor_context_unmapped_mate_ids() return [] - # Returns the unmapped read mate identifiers of a specified donor - # context. def get_donor_context_unmapped_mate_ids(self, contextid): + """Returns the donor context read identifiers that have unmapped read mates. + + Parameters + ---------- + contextid : str + Donor context identifier + + Returns + ------- + list of str + Donor contexts read identifiers, empty list if context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_donor_context_unmapped_mate_ids() return [] - # Returns the unmapped acceptor read mate identifiers of a specified - # variant context. def get_unmapped_acceptor_mate_ids(self, contextid): + """Returns the variant context acceptor read identifiers that have unmapped read mates. + + Parameters + ---------- + contextid : str + Variant context identifier + + Returns + ------- + list of str + Variant context acceptor read identiifers, empty list if context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_unmapped_acceptor_read_ids() return [] - # Returns the unmapped donor read mate identifiers of a specified - # variant context. def get_unmapped_donor_mate_ids(self, contextid): + """Returns the variant context donor read identifiers that have unmapped read mates. + + Parameters + ---------- + contextid : str + + Returns + ------- + list of str + Variant context donor read identifiers, empty list if context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_unmapped_donor_read_ids() return [] # ===METHODS TO OBTAIN SOME STATISTICS ABOUT ALL THE CONTEXTS============== - # Returns the average variant context length within this variant - # context file. def get_average_variant_context_length(self): + """Calculates and returns the mean variant context length. + + The mean length is calculated over all variant contexts in the variant context file. Acceptor and donor contexts + associated with the variant contexts are not used. + + Returns + ------- + float + Average variant context length + """ return statistics.mean([varcon.get_variant_context_length() for varcon in self.variant_contexts.values()]) - # Returns the median variant context length within this variant - # context file. def get_median_variant_context_length(self): + """Calculates and returns the median variant context length. + + The median length is calculated over all variant contexts in the variant context file. Acceptor and donor + contexts associated with the variant contexts are not used. + + Returns + ------- + float + Median variant context length + """ return statistics.median([varcon.get_variant_context_length() for varcon in self.variant_contexts.values()]) - # Returns the average number of variant context acceptor reads def get_average_variant_context_acceptor_reads(self): + """Calculates and returns the average number of variant context acceptor reads. + + The mean number of acceptor reads is calculated over all variant contexts. + + Returns + ------- + float + Mean number of variant context acceptor reads + """ return statistics.mean([varcon.get_number_of_acceptor_reads() for varcon in self.variant_contexts.values()]) - # Returns the average number of variant context donor reads def get_average_variant_context_donor_reads(self): + """Calculates and returns the average number of variant context donor reads. + + The mean number of donor reads is calculated over all variant contexts. + + Returns + ------- + float + Mean number of variant context donor reads + """ return statistics.mean([varcon.get_number_of_donor_reads() for varcon in self.variant_contexts.values()]) - # Returns the median number of variant context acceptor reads def get_median_variant_context_acceptor_reads(self): + """Calculates and returns the median variant context acceptor reads. + + Returns + ------- + float + Median number of variant context acceptor reads + """ return statistics.median([varcon.get_number_of_acceptor_reads() for varcon in self.variant_contexts.values()]) - # Returns the median number of variant context donor reads def get_median_variant_context_donor_reads(self): + """Calculates and returns the median number of variant context donor reads. + + Returns + ------- + float + Median number of variant context donor reads + """ return statistics.median([varcon.get_number_of_donor_reads() for varcon in self.variant_contexts.values()]) # ===METHODS TO WRITE VARIANT CONTEXT DATA TO A FILE======================= - # Writes the variant context data to an output file. - def write_variant_context_file(self, outfileloc, samplefilter=None, + def write_variant_context_file(self, outfileloc, vbuuid, samplefilter=None, varconfilter=None, chromfilter=None): + """Writes the data saved in the variant contexts to an output file. + + Parameters + ---------- + outfileloc : str + Path to write the variant context file to + vbuuid : str + VaSeBuilder unique identifier that build the VariantContextFile + samplefilter : list of str + Sample names/identifiers to include + varconfilter : list of str + Variant contexts to include + chromfilter : list of str + Chromosome names to include + """ try: - with open(outfileloc, "w") as varcon_outfile: + varcon_outfile.write(f"#VBUUID: {vbuuid}\n") varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" "Start\tEnd\tAcceptorContextLength\t" "DonorContextLength\tAcceptorReads\t" @@ -501,13 +1070,26 @@ def write_variant_context_file(self, outfileloc, samplefilter=None, self.vaselogger.warning("Could not write variant contexts to " f"{ioe.filename}") - # Writes the donor contexts used to construct the variant contexts. - def write_acceptor_context_file(self, outfileloc, samplefilter=None, + def write_acceptor_context_file(self, outfileloc, vbuuid, samplefilter=None, contextfilter=None, chromfilter=None): + """Writes the acceptor contexts associated with variants contexts to an output file. + + Parameters + ---------- + outfileloc: str + Path and name to write acceptor contexts to + samplefilter: list of str + Sample names/identifiers to include + contextfilter: list of str + Acceptor contexts to include + chromfilter: list of str + Chromosome names to include + """ try: - with open(outfileloc, "w") as varcon_oufile: - varcon_oufile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" - "Start\tEnd\tNumOfReads\tReadIds\n") + with open(outfileloc, "w") as varcon_outfile: + varcon_outfile.write(f"#VBUUID: {vbuuid}\n") + varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" + "Start\tEnd\tNumOfReads\tReadIds\n") for varcon in self.variant_contexts.values(): samplepass = self.passes_filter( varcon.get_variant_context_sample(), @@ -522,19 +1104,29 @@ def write_acceptor_context_file(self, outfileloc, samplefilter=None, chromfilter ) if samplepass and varconpass and chrompass: - varcon_oufile.write( - varcon.get_acceptor_context().to_string() + "\n" - ) + varcon_outfile.write(varcon.get_acceptor_context().to_string() + "\n") except IOError as ioe: self.vaselogger.warning("Could not write acceptor contexts to " f"{ioe.filename}") - # Writes the acceptor cotnexts used to construct the variant - # contexts. - def write_donor_context_file(self, outfileloc, samplefilter=None, + def write_donor_context_file(self, outfileloc, vbuuid, samplefilter=None, contextfilter=None, chromfilter=None): + """Writes the donor contexts associated with variant context to an output file. + + Parameters + ---------- + outfileloc : str + Path and name to write the donor contexts to + samplefilter : list of str + Sample names/identifiers to include + contextfilter : list of str + Donor contexts to include + chromfilter : list of str + Chromosome names to include + """ try: with open(outfileloc, "w") as varcon_outfile: + varcon_outfile.write(f"#VBUUID: {vbuuid}\n") varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" "Start\tEnd\tNumOfReads\tReadIds\n") for varcon in self.variant_contexts.values(): @@ -558,11 +1150,19 @@ def write_donor_context_file(self, outfileloc, samplefilter=None, self.vaselogger.warning("Could not write donor contexts to " f"{ioe.filename}") - # Writes some statistics about the acceptor and donor reads - # identified for each variant context. - def write_variant_context_stats(self, statsoutloc): + def write_variant_context_stats(self, statsoutloc, vbuuid): + """Writes basic statistics of the variant contexts to a specified output file. + + Basic statistics include means and medians of read lengths, Q-Score and mapping quality. + + Parameters + ---------- + statsoutloc : str + Path and name to write variant context statistics output file to + """ try: with open(statsoutloc, "w") as varcon_statsfile: + varcon_statsfile.write(f"#VBUUID: {vbuuid}\n") varcon_statsfile.write("#ContextId\tAvg_ALen\tAvg_DLen\t" "Med_ALen\tMed_DLen\tAvg_AQual\t" "Avg_DQual\tMed_AQual\tMed_DQual\t" @@ -574,11 +1174,19 @@ def write_variant_context_stats(self, statsoutloc): self.vaselogger.critical("Could not write variant context " f"statistics to {statsoutloc}") - # Writes some statistics about the acceptor and donor reads - # identified for each variant context. - def write_acceptor_context_stats(self, statsoutloc): + def write_acceptor_context_stats(self, statsoutloc, vbuuid): + """Writes basic statistics of the acceptor contexts to a specified output file. + + Basic statistics include means and medians of read lengths, Q-Score and mapping quality. + + Parameters + ---------- + statsoutloc: str + Path and name to write the acceptor context statistics file to + """ try: with open(statsoutloc, "w") as varcon_statsfile: + varcon_statsfile.write(f"#VBUUID: {vbuuid}\n") varcon_statsfile.write("#ContextId\tAvg_ReadLen\tMed_ReadLen\t" "Avg_ReadQual\tMed_ReadQual\t" "Avg_ReadMapQ\tMed_ReadMapQ\n") @@ -591,11 +1199,20 @@ def write_acceptor_context_stats(self, statsoutloc): self.vaselogger.critical("Could not write acceptor context " f"statistics to {statsoutloc}") - # Writes some statistics about the acceptor and donor reads - # identified for each variant context. - def write_donor_context_stats(self, statsoutloc): + # Writes some statistics about the acceptor and donor reads identified for each variant context. + def write_donor_context_stats(self, statsoutloc, vbuuid): + """Writes basic statistics of the donor contexts to a specified output file. + + Basic statistics include means and medians of read lengths, Q-Score and mapping quality. + + Parameters + ---------- + statsoutloc : str + Path and name to write the donor context statistics to + """ try: with open(statsoutloc, "w") as varcon_statsfile: + varcon_statsfile.write(f"#VBUUID: {vbuuid}\n") varcon_statsfile.write("#ContextId\tAvg_ReadLen\tMed_ReadLen\t" "Avg_ReadQual\tMed_ReadQual\t" "Avg_ReadMapQ\tMed_ReadMapQ\n") @@ -608,11 +1225,21 @@ def write_donor_context_stats(self, statsoutloc): self.vaselogger.critical("Coud not write donor context statistics " f"to {statsoutloc}") - # Writes the left and right positions to the output file. Left pos - # for R1 and right pos for R2. - def write_left_right_positions(self, typetowrite, outfileloc): + def write_left_right_positions(self, typetowrite, outfileloc, vbuuid): + """Writes variant context leftmost and rightmost genomic read positions to an output file. + + The leftmost genomic positions of R1 reads and rightmost genomic positions for R2 reads are written to file. + + Parameters + ---------- + typetowrite : str + Type (acceptor/donor) of file to write + outfileloc : str + Path and name to write the output file to + """ try: with open(outfileloc, "w") as lrpof: + lrpof.write(f"#VBUUID: {vbuuid}") lrpof.write("#ContextId\tLeftPos\tRightPos\n") for varcon in self.variant_contexts.values(): leftpositions, rightpositions = [], [] @@ -641,11 +1268,19 @@ def write_left_right_positions(self, typetowrite, outfileloc): self.vaselogger.warning("Could not write read left positions to " f"output file {outfileloc}") - # Writes the acceptor context left and right positions to the output - # file. Left pos for R1 and right pos for R2. - def write_acceptor_left_right_positions(self, outfileloc): + def write_acceptor_left_right_positions(self, outfileloc, vbuuid): + """Writes acceptor context leftmost and rightmost genomic read positions to an output file. + + The leftmost genomic position of R1 reads and rightmost genomic positions for R2 reads are written to file. + + Parameters + ---------- + outfileloc : str + Path and name to write the output file to + """ try: with open(outfileloc, "w") as lrpof: + lrpof.write(f"#VBUUID: {vbuuid}\n") lrpof.write("#ContextId\tLeftPos\tRightPos\n") for varcon in self.variant_contexts.values(): leftpositions = [ @@ -663,11 +1298,19 @@ def write_acceptor_left_right_positions(self, outfileloc): self.vaselogger.warning("Could not write read left positions to " f"output file {outfileloc}") - # Writes the left and right positions to the output file. Left pos - # for R1 and right pos for R2. - def write_donor_left_right_positions(self, outfileloc): + def write_donor_left_right_positions(self, outfileloc, vbuuid): + """Writes donor context leftmost and rightmost genomic read positions to an output file. + + The leftmost genomic position of R1 reads and rightmost genomic positions for R2 reads are written to file. + + Parameters + ---------- + outfileloc : str + Path and name to write the output file to + """ try: with open(outfileloc, "w") as lrpof: + lrpof.write(f"#VBUUID: {vbuuid}\n") lrpof.write("#ContextId\tLeftPos\tRightPos\n") for varcon in self.variant_contexts.values(): leftpositions = [ @@ -681,70 +1324,121 @@ def write_donor_left_right_positions(self, outfileloc): lrpof.write(str(varcon.get_variant_context_id()) + "\t" + ",".join(leftpositions) + "\t" + ",".join(rightpositions) + "\n") - except IOError as ioe: + except IOError: self.vaselogger.warning("Could not write read left positions to " f"output file {outfileloc}") # Writes the identifiers of reads that have unmapped mates per # sample to a file. Samples are all donors and the ?template?. - def write_reads_with_unmapped_mate(self, typetowrite, umfileloc): + def write_reads_with_unmapped_mate(self, typetowrite, umfileloc, vbuuid): + """Writes variant context read identifiers with unmapped mates to a specified output file. + + Parameters + ---------- + typetowrite : str + Write variant context acceptor or donor read identifiers + umfileloc : str + Path to write the output file to + """ try: - with open(umfileloc, "w") as umFile: - umFile.write("#ContextId\tSampleId\tReadIds\n") + with open(umfileloc, "w") as umfile: + umfile.write(f"#VBUUID: {vbuuid}\n") + umfile.write("#ContextId\tSampleId\tReadIds\n") for varcon in self.variant_contexts.values(): if typetowrite == "acceptor": - umFile.write( + umfile.write( varcon.get_variant_context_id() + "\t" + str(varcon.get_variant_context_sample()) + "\t" + ";".join(varcon.get_unmapped_acceptor_mate_ids()) - ) + + "\n") if typetowrite == "donor": - umFile.write( + umfile.write( varcon.get_variant_context_id() + "\t" + str(varcon.get_variant_context_sample()) + "\t" + ";".join(varcon.get_unmapped_donor_mate_ids()) - ) + + "\n") except IOError: self.vaselogger.warning("Could not write read identifiers of " "reads with unmapped mates to " f"{umfileloc}") - # Writes the unmapped mate id of the acceptor context. - def write_acceptor_unmapped_mates(self, umfileloc): - try: + def write_acceptor_unmapped_mates(self, umfileloc, vbuuid): + """Writes the acceptor context read identifiers that have unmapped mates to an output file. + Parameters + ---------- + umfileloc : str + Path to write the output file to + """ + try: with open(umfileloc, "w") as umfile: + umfile.write(f"#VBUUID: {vbuuid}\n") umfile.write("#ContextId\tSampleId\tReadIds\n") for varcon in self.variant_contexts.values(): acccon = varcon.get_acceptor_context() umfile.write(str(acccon.get_context_id()) + "\t" + str(acccon.get_sample_id()) + "\t" - + ";".join(acccon.get_unmapped_read_mate_ids())) + + ";".join(acccon.get_unmapped_read_mate_ids()) + "\n") except IOError: self.vaselogger.warning("Could not write read identifiers of " "reads with unmapped mates to " f"{umfileloc}") - # Writes the unmapped mate id of the donor context. - def write_donor_unmapped_mates(self, umfileloc): + def write_donor_unmapped_mates(self, umfileloc, vbuuid): + """Writes the donor context read identifiers that have unmapped mates to an output file. + + Parameters + ---------- + umfileloc : str + Path to write the output file to + """ try: - with open(umfileloc, "w") as umFile: - umFile.write("#ContextId\tSampleId\tReadIds\n") + with open(umfileloc, "w") as umfile: + umfile.write(f"#VBUUID: {vbuuid}\n") + umfile.write("#ContextId\tSampleId\tReadIds\n") for varcon in self.variant_contexts.values(): doncon = varcon.get_donor_context() - umFile.write(str(doncon.get_context_id()) + "\t" + umfile.write(str(doncon.get_context_id()) + "\t" + str(doncon.get_sample_id()) + "\t" - + ";".join(doncon.get_unmapped_read_mate_ids())) + + ";".join(doncon.get_unmapped_read_mate_ids())+"\n") except IOError: self.vaselogger.warning("Could not write read identifiers of " "reads with unmapped mates to " f"{umfileloc}") - # Compares the current VariantContextFile to another variant context file def compare(self, othervarconfile, contextfilter=None): + """Compares the current variant context file to another provided variant context file. + + Parameters + ---------- + othervarconfile : VariantContextFile + VariantContextFile to compare against + contextfilter : list of str or None + Contexts to compare (inclusive filter) + + Returns + ------- + varcondiffs : dict + Differences between the current and other VariantContextFile + """ varcondiffs = {} for contextid in self.variant_contexts: if self.passes_filter(contextid, contextfilter): diffs = self.variant_contexts[contextid].compare(othervarconfile.get_variant_context(contextid)) varcondiffs[contextid] = diffs return varcondiffs + + def add_variant_context_file(self, variantcontextfile): + """Adds another VariantContextFile to the existing one. + + Variant contexts from the second file overlapping with variant contexts from the current file will not be added. + + Parameters + ---------- + variantcontextfile : VariantContextFile + Variant context file to add to the current + """ + for contextid, varcon in variantcontextfile.get_variant_contexts(True).items(): + if contextid not in self.variant_contexts: + if not self.context_collision(varcon.get_context()): + self.variant_contexts[contextid] = varcon diff --git a/VcfBamScanner.py b/VcfBamScanner.py index e47bb90..e9e589e 100644 --- a/VcfBamScanner.py +++ b/VcfBamScanner.py @@ -6,24 +6,61 @@ class VcfBamScanner: - # Constructor that creates two empty hashmaps (dicts). + """VcfBamScanner offers functionality to scan variant (VCF/BCF) and alignment (BAM/CRAM) files. + + The VcfBamScanner can be used to scan folders for variant and alignments files, check whether variant and alignment + files have sample names and extract them. A list file containing paths to either variant or alignment files can + also be.scanned. Files within the list files will be a saved per sample name in an identifier. + + Attributes + ---------- + vcf_sample_map : dict + Dictionary with variant files per sample + bam_sample_map : dict + Dictionary with alignment files per sample + valid_file_types : list of str + Valid variant and alignment file types + """ + def __init__(self): + """Obtains the vase logger and creates two empty dictionaries to save the variant and alignment files.""" self.vaselogger = logging.getLogger("VaSe_Logger") self.vcf_sample_map = {} self.bam_sample_map = {} self.valid_file_types = ["VCF", "BCF", "BAM", "CRAM"] # ===METHODS TO GET SAVED DATA FROM THE VCFBAMSCANNER====================== - # Returns the map that links VCF files and samples. def get_vcf_sample_map(self): + """Returns a dictionary with VCF/BCF files saved per sample. + + Returns + ------- + vcf_sample_map : dict + Paths to variant files per sample name + """ return self.vcf_sample_map - # Returns the map that links BAM files and samples. def get_bam_sample_map(self): + """Returns a dictionary with BAM/CRAM files saved per sample. + + Returns + ------- + bam_sample_map : dict + Paths to alignment files per sample name + """ return self.bam_sample_map - # Returns a map that links VCF files to BAM files. def get_vcf_to_bam_map(self): + """Returns a dictionary with a variant file linked to an alignment file of the same sample. + + For each sample, the associated variant file is linked to the associated.alignment file are linked via a + dictionary. The variant and alignment file are saved as key and value respectively. + + Returns + ------- + vcf_to_bam_map : dict + Variant files linked to alignment files + """ vcf_to_bam_map = {} for sampleid in self.vcf_sample_map: if sampleid in self.bam_sample_map: @@ -34,6 +71,23 @@ def get_vcf_to_bam_map(self): # Scans the folders containing VCF files and returns a map that # links sample ids with vcf files. def scan_vcf_folders(self, vcffolders): + """Scans a list of folders containing variant files and returns a dictionary with valid files per sample name. + + Iterates over a list of paths to folders containing VCF/BCF files. A non-existing folder is skipped. For each + variant file in a valid folder, the sample name is extracted.and the file is saved under its sample name in a + dictionary. + + Parameters + ---------- + vcffolders : list + List of paths to folders contain alignment files + + Returns + ------- + vcf_sample_map : dict + String paths to variant files per sample name + + """ self.vaselogger.info("Start scannig VCF files") for vcffolder in vcffolders: @@ -68,9 +122,23 @@ def scan_vcf_folders(self, vcffolders): self.vaselogger.info("Finished scanning VCF files") return self.vcf_sample_map - # Scans the folders containing BAM files and returns a map that - # links sample ids with bam files. def scan_bam_folders(self, bamfolders): + """Scans a list of folders containing alignment files and returns a dictionary with valid files per sample name. + + Iterates over a list of paths to folders containing BAM/CRAM files. Non-existing folders are skipped. For each + alignment file in a valid folder, the sample name is extracted.and the file is saved under its sample name in a + dictionary. + + Parameters + ---------- + bamfolders : list + List of String paths to folders containing alignment files + + Returns + ------- + bam_sample_map : dict + String paths to alignment files per sample name + """ self.vaselogger.info("Start scanning BAM files") # Scan BAM files in all provided folders. @@ -107,16 +175,53 @@ def scan_bam_folders(self, bamfolders): # Scans the provided BAM/CRAM files from a provided donor list file def scan_bamcram_files(self, bamcramlistfile): + """Reads a list file containing the locations of BAM/CRAM files and saves them per sample. + + Parameters + ---------- + bamcramlistfile : str + String path to list file with paths to alignment files + + Returns + ------- + bam_sample_map : dict + Paths to alignment files per sample name + """ self.bam_sample_map = self.read_donorlistfile(bamcramlistfile) return self.bam_sample_map # Scan the provided VCF files from a provided donor list file def scan_vcf_files(self, vcflistfile): + """Reads a list file containing the locations of VCF/BCF files and saves them per sample name. + + Parameters + ---------- + vcflistfile : str + String path to list file with paths to variant files + + Returns + ------- + vcf_sample_map : dict + Variant files per sample name + """ self.vcf_sample_map = self.read_donorlistfile(vcflistfile, "v") return self.vcf_sample_map - # Read a specific donor list file def read_donorlistfile(self, listfileloc, listtype="a"): + """Reads a list file containing paths to donor files and returns the donor files per sample name. + + Parameters + ---------- + listfileloc : str + Path to the list file containing paths to donor files + listtype : str + Type of variant files in the list file (a=alignment ; v=variant) + + Returns + ------- + donor_sample_files : dict + Paths to donor variant files per sample name + """ donor_sample_files = {} try: with open(listfileloc, "r") as listfile: @@ -135,29 +240,74 @@ def read_donorlistfile(self, listfileloc, listtype="a"): exit() return donor_sample_files - # Checks whether the BAM file contains a sample name. def bam_has_sample_name(self, bamfile): + """Checks and returns whether a BAM/CRAM file has a sample name/identifier. + + Parameters + ---------- + bamfile : pysam AlignmentFile + Already opened pysam AlignmentFile + + Returns + ------- + bool + True if file has a sample name, otherwise False + """ if "RG" in bamfile.header: if len(bamfile.header["RG"]) > 0: if "SM" in bamfile.header["RG"][0]: return True return False - # Checks whether the VCF file has a sample name def vcf_has_sample_name(self, vcffile): + """Checks and returns whether the VCF/BCF file has a sample name/identifier. + + Returns + ------- + bool + True if the variant file has sample names, otherwise False + """ if vcffile.header.samples[0] != "": return True return False # Returns the first sample identifier of the VCF file (as for now we only work with one sample per file def get_vcf_sample_name(self, vcffileloc): + """Extracts and returns the list of sample names from a specified VCF file. + + Parameters + ---------- + vcffileloc : str + Path to a VCF file + + Returns + ------- + list or None + VCF Sample name if present, otherwise None + """ vcffile = pysam.VariantFile(vcffileloc, "r") if self.vcf_has_sample_name(vcffile): return list(vcffile.header.samples) return None - # Returns the BAM sample def get_bam_sample_name(self, bamfileloc): + """Extracts and returns the sample name/identifier from a BAM file. + + The sample name is extracted by first checking whether the BAM file contains a sample name via the + VcfBamScanner method bam_has_sample_name(). If so, the sample name is extracted from the BAM file by returning + the value of the 'SM' field from the header line starting with '@RG'. None is returned if the BAM file has no + sample name. + + Parameters + ---------- + bamfileloc : str + Path to a BAM file + + Returns + ------- + str or None + Sample name if present, otherwise None + """ try: bamfile = pysam.AlignmentFile(bamfileloc, "rb") if self.bam_has_sample_name(bamfile): @@ -168,8 +318,24 @@ def get_bam_sample_name(self, bamfileloc): except IOError: self.vaselogger.warning(f"Could not open {bamfileloc} to extract sample identifier") - # Returns the CRAM sample name def get_cram_sample_name(self, cramfileloc): + """Extracts and returns the sample name/identifier from a specified CRAM file. + + The sample name is extracted by first checking whether the CRAM file contains a sample name via the + VcfBamScanner method cram_has_sample_name(). If so, the sample name is extracted from the CRAM file by + returning the value of the 'SM' field from the header line starting with '@RG'. None is returned if the CRAM + file has no sample name. + + Parameters + ---------- + cramfileloc : str + Path to a CRAM file + + Returns + ------- + str or None + Sample name if present, otherwise None + """ try: cramfile = pysam.AlignmentFile(cramfileloc, "rc") if self.bam_has_sample_name(cramfile): @@ -181,6 +347,20 @@ def get_cram_sample_name(self, cramfileloc): # General methods that returns one or more sample ids (in case of a VCF) def get_sample_id(self, donorfileloc, donorlisttype): + """Returns the sample name/identifier for a specified alignment or variant file. + + Parameters + ---------- + donorfileloc : str + Path to a donor variant/alignment file + donorlisttype : str + Type of donor file (a=alignment ; v=variant) + + Returns + ------- + str + File type of provided donor file + """ donor_file_type = self.get_donor_file_type(donorfileloc) if donorlisttype == "a": if donor_file_type[1] == "BAM": @@ -193,24 +373,71 @@ def get_sample_id(self, donorfileloc, donorlisttype): # Returns the filetype of a donor file def get_donor_file_type(self, donorfileloc): + """Determines and returns the file type of a specified donor file. + + To determine the file type, the linux command 'file' is used which returns a String containing the file type + info. The received String is split on spaces and the resulting list is returned. + + Returns + ------- + list of str + List with file type data + """ filetype_proc = subprocess.Popen(["file", "-b", "-z", donorfileloc], stdout=subprocess.PIPE) filetype_data, filetype_err = filetype_proc.communicate() return filetype_data.decode().split(" ") # Returns a list of sample identifiers that have both a VCF and a BAM/CRAM def get_complete_sample_ids(self): + """Determines which samples have a variant and alignment file and returns the list of identifiers. + + Returns + ------- + set + Set of sample names with a variant and alignment file + """ return set(self.vcf_sample_map.keys()) & set(self.bam_sample_map.keys()) # Extracts the sequence names from an already opened BAM/CRAM file def get_alignment_sequence_names(self, alignment_file): + """Extracts and returns the chromosome names from an alignment file. + + Chromosome names are extracted from a specified BAM or CRAM file by checking the headers for the 'SQ' tag. If + present the SQ entry is checked for the 'SN' field which contains the specific chromosome names. + + Parameters + ---------- + alignment_file : pysam.AlignmentFile + Already opened pysam AlignmentFile + + Returns + ------- + sequence_names : list + List with extracted chromosome names + """ sequence_names = set() if "SQ" in alignment_file.header: for sn_entry in alignment_file.header["SQ"]: sequence_names.add(sn_entry["SN"]) return sequence_names - # Extracts the sequence names from an already opened VCF/BCF file def get_variant_sequence_names(self, variant_file): + """Extracts and returns the chromosome names from a variant file. + + Chromosome names are extracted from a specified VCF or BCF file by obtaining the header contig names via the + pysam method .header.contigs(). The provided variant file is expected to be an already opened pysam + VariantFile. + + Parameters + ---------- + variant_file : pysam.VariantFile + Already opened pysam VariantFile + + Returns + ------- + sequence_names : list of str + List with extracted chromosome names + """ sequence_names = set() if len(variant_file.header.contigs) > 0: for seqname in list(variant_file.header.contigs): diff --git a/VcfVariant.py b/VcfVariant.py index f705a37..5a0c213 100644 --- a/VcfVariant.py +++ b/VcfVariant.py @@ -2,6 +2,23 @@ class VcfVariant: + """Saves necessary data of a genomic variant from a VCF/BCF file. + + Attributes + ---------- + vcf_variant_chrom : str + Chromosome name the variant is located on + vcf_variant_start : int + The leftmost genomic position of the variant + vcf_variant_ref : str + The reference allele of the variant + vcf_variant_alts : tuple + The alternative allele(s) of the variant + vcf_variant_filter : str + Filter applied to the variant (i.e. PASS) + vcf_variant_type : str + Type of the variant (SNP/Indel/etc) + """ def __init__(self, varchrom, varstart, varref, varalts, varfilter, vartype): self.vcf_variant_chrom = varchrom self.vcf_variant_start = varstart @@ -10,59 +27,147 @@ def __init__(self, varchrom, varstart, varref, varalts, varfilter, vartype): self.vcf_variant_filter = varfilter self.vcf_variant_type = vartype - # Returns the chromosome name of the VCF variant def get_variant_chrom(self): + """Returns the chromosome name the variant is located on + + Returns + ------- + self.vcf_variant_chrom : str + Chromosome name the variant is located on + """ return self.vcf_variant_chrom - # Sets the VCF variant chromosome def set_variant_chrom(self, varchrom): + """Sets the chromosome name the variant is located on. Overwrites an already set chromosome name. + + Parameters + ---------- + varchrom : str + Chromosome name the variant is located on + :return: + """ self.vcf_variant_chrom = varchrom - # Returns the VCF variant start position def get_variant_pos(self): + """Returns the genomic position of the variant. + + Returns + ------- + self.vcf_variant_pos : nt + Genomic position of the variant + """ return self.vcf_variant_start - # Sets the VCF variant starting position def set_variant_pos(self, varpos): + """Sets the variant genomic position. Overwrites an already set genomic position. + + Parameters + ---------- + varpos : int + Genomic position of the variant. + """ self.vcf_variant_start = varpos - # Returns the VCF variant reference allele def get_variant_ref_allele(self): + """Returns the reference allele of the variant. + + Returns + ------- + self.vcf_variant_ref : str + Reference allele of the variant + :return: + """ return self.vcf_variant_ref - # Sets the VCF variant reference allele def set_variant_ref_allele(self, varref): + """Sets the reference allele of the variant. Overwrites an already set reference allele. + + Parameters + ---------- + varref : str + Reference allele of the variant + """ self.vcf_variant_ref = varref - # Returns the VCF variant alternative alleles def get_variant_alt_alleles(self): + """Returns the alternative allele(s) of the variant. + + Returns + ------- + self.vcf_variant_alts : tuple + Alternative allele(s) of the variant + """ return self.vcf_variant_alts - # Sets the VCF variant alternative alleles def set_variant_alt_alleles(self, varalts): + """Sets the alternative allele(s) of the variants. Overwrites an already set alternative allele(s). + + Parameters + ---------- + varalts : tuple + Alternative allele(s) of the variant + """ self.vcf_variant_alts = varalts - # Returns the VCF variant filter def get_variant_filter(self): + """Returns the filter (i.e. PASS) that was applied to the variant + + Returns + ------- + self.vcf_variant_filter : str + + """ return self.vcf_variant_filter - # Sets the VCF variant filter def set_variant_filter(self, varfilter): + """Sets the filter applied to the variant. Overwrites any already set variant filter. + + Parameters + ---------- + varfilter : str + + """ self.vcf_variant_filter = varfilter - # Returns the VCF variant type (SNP/Indel, etc) def get_variant_type(self): + """Returns the variant type. + + Returns + ------- + self.vcf_variant_type : str + Variant type + """ return self.vcf_variant_type - # Sets the VCF variant type def set_variant_type(self, vartype): + """Sets the type of the variant. Overwrites any already set variant type. + + Parameters + ---------- + vartype : str + """ self.vcf_variant_type = vartype - # Returns a variant identifier in the form CHROM_POS def get_variant_id(self): + """Constructs and returns a variant identifier. + + The identifier is formed by combining the chromosome name and variant position, separated by an '_' + (i.e. CHROM_POS). This will be used as the identifier for contexts. + + Returns + ------- + str + Constructed variant identifier + """ return f"{self.vcf_variant_chrom}_{self.vcf_variant_start}" - # ToString method def to_string(self): + """Returns a String representation of the variant. + + Returns + ------- + str + String representation of the variant + """ return f"{self.vcf_variant_chrom}\t{self.vcf_variant_start}\t{self.vcf_variant_type}\t{self.vcf_variant_ref}" \ f"\t{self.vcf_variant_alts}\t{self.vcf_variant_filter}" diff --git a/combine_varcons.py b/combine_varcons.py new file mode 100644 index 0000000..368f58c --- /dev/null +++ b/combine_varcons.py @@ -0,0 +1,79 @@ +import os +import argparse + + +# Returns the command line parameters +def get_parameters(): + combine_varcon_params = argparse.ArgumentParser() + combine_varcon_params.add_argument("-l", "--listfile", dest="listfile", + help="Listfile containing locations to variant context files to combine") + combine_varcon_params.add_argument("-i", "--infiles", dest="infiles", nargs="+", + help="Variant context files to combine.") + combine_varcon_params.add_argument("-o", "--out", dest="outfile") + combine_varcon_args = vars(combine_varcon_params.parse_args()) + return combine_varcon_args + + +# Reads the variant context list file +def read_varcon_list_file(varcon_listfile): + varcon_files = [] + try: + with open(varcon_listfile, 'r') as vclf: + for fileline in vclf: + if os.path.isfile(fileline.strip()): + varcon_files.append(fileline.strip()) + except IOError: + print(f"Could not read variant context list file: {varcon_listfile}") + finally: + return varcon_files + + +def combine_varcons(varconfiles): + varcondata = {} + for varconfileloc in varconfiles: + varcondata = process_varconfile(varconfileloc, varcondata) + return varcondata + + +def process_varconfile(varconfileloc, varcondata): + with open(varconfileloc, 'r') as vrcfile: + for fileline in vrcfile: + if not fileline.startswith("#"): + varconline = fileline.strip() + filelinedata = fileline.strip().split("\t") + + if not detect_collision(filelinedata, varcondata): + varcondata[filelinedata[0]] = varconline + return varcondata + + +def detect_collision(varconentry, varcondatamap): + if varconentry[0] in varcondatamap: + return True + else: + for varcon in varcondatamap.values(): + if varcon[2] == varconentry[2]: + if int(varcon[4]) <= int(varconentry[5]) and int(varconentry[4]) <= int(varcon[5]): + return True + return False + + +def write_new_variantcontextfile(outfileloc, varcondatamap): + with open(outfileloc, 'w') as vrcoutfile: + vrcoutfile.write("#ContextId\tDonorSample\tChrom\tOrigin\tStart\tEnd\tAcceptorContextLength\t" + "DonorContextLength\tAcceptorReads\tDonorReads\tADratio\tAcceptorReadsIds\tDonorReadIds\n") + for varcondataentry in varcondatamap.values(): + vrcoutfile.write(f"{varcondataentry}\n") + + +# Do the actual work +cli_params = get_parameters() +varcons_to_combine = [] +if cli_params['listfile'] is not None: + varcons_to_combine = read_varcon_list_file(cli_params['listfile']) +else: + varcons_to_combine = [infile for infile in cli_params['infiles'] if os.path.isfile(infile)] + +varcondata = combine_varcons(varcons_to_combine) +write_new_variantcontextfile(cli_params['outfile'], varcondata) + diff --git a/config/examples/example_amode.cfg b/config/examples/example_amode.cfg index fbc71c0..90fed3e 100644 --- a/config/examples/example_amode.cfg +++ b/config/examples/example_amode.cfg @@ -1,8 +1,8 @@ -#Example config file for running VaSeBuilder in A-mode -RUNMODE=A +#Example config file for running VaSeBuilder in AC-mode +RUNMODE=AC TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq VARCONIN=/testdata/variantcontext/varcon.txt DONORFASTQS=/testdata/donorfastq/donorfastq_list.txt OUT=/testdata/outdir -LOGFILE=/testdata/outdir/vb.log +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_dcmode.cfg b/config/examples/example_dcmode.cfg index c50f543..f9b1724 100644 --- a/config/examples/example_dcmode.cfg +++ b/config/examples/example_dcmode.cfg @@ -6,3 +6,4 @@ ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fasta VARCONIN=/testdata/variantcontext/varcon.txt OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_dmode.cfg b/config/examples/example_dmode.cfg index 41c201d..92a332d 100644 --- a/config/examples/example_dmode.cfg +++ b/config/examples/example_dmode.cfg @@ -1,7 +1,8 @@ #Example config file for running VaSeBuilder in D-mode RUNMODE=D DONORVCF=/testdata/donordata/vcffiles_list.txt -DONORBAM= +DONORBAM=/testdata/donordata/bamfiles_list.txt ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fasta OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_fcmode.cfg b/config/examples/example_fcmode.cfg index 244766b..532c21b 100644 --- a/config/examples/example_fcmode.cfg +++ b/config/examples/example_fcmode.cfg @@ -3,8 +3,9 @@ RUNMODE=FC DONORVCF=/testdata/donordata/vcffile_list.txt DONORBAM=/testdata/donordata/bamfile_list.txt ACCEPTORBAM=/testdata/acceptor/acceptor.bam -TEMPLATEFQ1=/testdata/templatefastq/acceptor_R2.fq -TEMPLATEFQ2=/testdata/templatefastq/acceptor_R1.fq +TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1_L1.fq , /testdata/templatefastq/acceptor_R1_L2.fq +TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2_L1.fq , /testdata/templatefastq/acceptor_R2_L2.fq VARCONIN=/testdata/variantcontext/varcon.txt REFERENCE=/testdata/reference/genome_ref.fasta OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_fmode.cfg b/config/examples/example_fmode.cfg index 5c7268c..65015c4 100644 --- a/config/examples/example_fmode.cfg +++ b/config/examples/example_fmode.cfg @@ -3,7 +3,8 @@ RUNMODE=F DONORVCF=/testdata/donordata/vcffile_list.txt DONORBAM=/testdata/donordata/bamfile_list.txt ACCEPTORBAM=/testdata/acceptor/acceptor.bam -TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq -TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq +TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1_L1.fq , /testdata/templatefastq/acceptor_R1_L2.fq +TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2_L1.fq , /testdata/templatefastq/acceptor_R2_L2.fq REFERENCE=/testdata/reference/genome_ref.fasta OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log \ No newline at end of file diff --git a/config/examples/example_fmode_debug.cfg b/config/examples/example_fmode_debug.cfg new file mode 100644 index 0000000..0a3371f --- /dev/null +++ b/config/examples/example_fmode_debug.cfg @@ -0,0 +1,11 @@ +#Example config file for running VaSeBuilder in F-mode with debug set on +RUNMODE=F +DONORVCF=/testdata/donordata/vcffile_list.txt +DONORBAM=/testdata/donordata/bamfile_list.txt +ACCEPTORBAM=/testdata/acceptor/acceptor.bam +TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq +TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq +REFERENCE=/testdata/reference/genome_ref.fasta +OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log +DEBUG=T \ No newline at end of file diff --git a/config/examples/example_fmode_variantlist.cfg b/config/examples/example_fmode_variantlist.cfg new file mode 100644 index 0000000..f7b133d --- /dev/null +++ b/config/examples/example_fmode_variantlist.cfg @@ -0,0 +1,12 @@ +#Example config file for running VaSeBuilder in F-mode +#This example has a provided list of variants to use for the run +RUNMODE=F +DONORVCF=/testdata/donordata/vcffile_list.txt +DONORBAM=/testdata/donordata/bamfile_list.txt +ACCEPTORBAM=/testdata/acceptor/acceptor.bam +TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq +TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq +REFERENCE=/testdata/reference/genome_ref.fasta +OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log +VARIANTLIST=/testdata/variants/variantlist.txt diff --git a/config/examples/example_pcmode.cfg b/config/examples/example_pcmode.cfg index 828d906..668ab65 100644 --- a/config/examples/example_pcmode.cfg +++ b/config/examples/example_pcmode.cfg @@ -6,4 +6,4 @@ ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fasta VARCONIN=/testdata/variantcontext/varcon.txt OUT=/testdata/outdir -LOGFILE=/testdata/outdir/vb.log +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_pmode.cfg b/config/examples/example_pmode.cfg index 81c1651..39377dd 100644 --- a/config/examples/example_pmode.cfg +++ b/config/examples/example_pmode.cfg @@ -5,4 +5,4 @@ DONORBAM=/testdata/donordata/bamfile_list.txt ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fa OUT=/testdata/outdir -LOGFILE=/testdata/outdir/vb.log +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_xcmode.cfg b/config/examples/example_xcmode.cfg index aa5854b..3bcdb66 100644 --- a/config/examples/example_xcmode.cfg +++ b/config/examples/example_xcmode.cfg @@ -6,4 +6,4 @@ ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fa VARCONIN=/testdata/variantcontext/varcon.txt OUT=/testdata/outdir -LOGFILE=/testdata/outdir/vb.log +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_xmode.cfg b/config/examples/example_xmode.cfg index 207199d..8ae6adc 100644 --- a/config/examples/example_xmode.cfg +++ b/config/examples/example_xmode.cfg @@ -5,4 +5,4 @@ DONORBAM=/testdata/donordata/bamfile_list.txt ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fa OUT=/testdata/outdir -LOGFILE=/testdata/outdir/vb.log +LOG=/testdata/outdir/vb.log diff --git a/docs/input_formats.md b/docs/input_formats.md new file mode 100644 index 0000000..5c6f83f --- /dev/null +++ b/docs/input_formats.md @@ -0,0 +1,74 @@ +# VaSeBuilder input file formats + +## About input files +To run VaSeBuilder, specific VaSeBuilder input files might need to be provided depending on the selected run mode.
+
+ + +### Donor variant and donor alignment list files +The donor variant list file should contain a list of paths to a variant, VCF or BCF, file with one path per file line. +The donor alignment list file should be similar and contain one path to a donor alignment file per file line. +The files listed will be used as donor variant and alignments files respectively when constructing variant contexts. +Only paths to the variant/alignment files should be provided, not the index files. These are assumed to be in the same +directory with the same name, i.e.: sample.bam & sample.bam.bai +

+ + +### Donor fastq list file +A donor fastq list file needs to be provided when running VaSeBuilder AC-mode. Currently (September 23rd, 2019) this +file requires two columns without a header. The first column should contain the paths to R1 files, the second column + the paths to R2 files: + +_/path/to/sample1_R1.fq  /path/to/sample1_R2.fq
+/path/to/sample2_R1.fq  /path/to/sample2_R2.fq
+/path/to/sample3_R1.fq  /path/to/sample3_R2.fq_ + +Each row in this file therefore represents a single sample. +

+ + +### Variant list file +A variant list file can be provided as a filter to indicate which donor variants to use in the VaSeBuilder run. This is +most useful when only certain variants from the donors are required but the donor variant files contain many variants. +Variants not specified in the variant list will therefore be skipped. +The variant list file needs at least three columns: sample name/identifier, chromosome name, variant position: + +_Sample1  1  101
+Sample2  2  202
_ + +(In the future two more columns might be required: variant reference allele, variant alternative allele) +

+ + +### Variant context file +A VaSeBuilder variant context file can serve as an input file when running AC, DC, FC or PC mode. Please see the output +files description for the variant context file for it's structure. +

+ + +### Configuration file +Configuration files allow users to save the VaSeBuilder run parameters and values in a text file. This makes rerunning +analyses on the same data, as well as sharing the run parameters much easier. Parameters and values can be specified as +PARAMETERNAME=value. Parameter names do not need to be capitalized, entries such as parametername = value are also +valid. Parameters templatefq1 and templatefq2 can have multiple values separated by a comma: +templatefq1 = testdata/fastqs/acceptor1_1.fq , testdata/fastqs/acceptor1_2.fq. (The space around the comma is optional) +Comments explaining or describing the use of the configuration file can be added by starting a line with #. + +
Valid configuration file parameter names: +* __RUNMODE:__ The run mode of VaSeBuilder to use +* __DONORVCF:__ Path to file with list of donor variant files to use +* __DONORBAM:__ Path to file with list of donor alignment files to use +* __ACCEPTORBAM:__ Path the alignment file to use as acceptor +* __TEMPLATEFQ1:__ Path(s) to R1 fastq file(s) to use as template for building the validation set +* __TEMPLATEFQ2:__ Path(s) to R2 fastq file(s) to use as template for building the validation set +* __REFERENCE:__ Path to the fasta genome reference +* __OUT:__ Path to directory to write output files to +* __LOG:__ Path to write log file to +* __DEBUG:__ Run debug mode (valid values: T, True, true, TRUE, 1). +* __VARIANTLIST:__ Path to file containing variants to use +* __VARCONIN:__ Path to a variant context file +* __FASTQOUT:__ Name prefix for validation fastq files +* __VARCON:__ Name prefix for variant context output file + +To run VaSeBuilder with a configuration file use: python vase.py –c path/to/config/file.cfg +Please see the example config files in VaSeBuilder/config/examples for examples per run mode diff --git a/docs/output_formats.md b/docs/output_formats.md new file mode 100644 index 0000000..8903cdf --- /dev/null +++ b/docs/output_formats.md @@ -0,0 +1,178 @@ +# VaSeBuilder output file formats + +## Standard output files +VaSeBuilder runs output several output files containing the essential data about the run. Some files are only outputted +when running specific modes.

+ + + +### Donor alignment files +A list of paths to donor alignment (BAM/CRAM) files used in building the variant contexts. A list could be: +

+_/path/to/donor1.bam
+/path/to/donor2.cram
+/path/to/donor3.bam_ +

+ + +### Donor insert positions +VaSeBuilder add donor reads at semi random positions in the resulting validation set. The recorded position is the +position of a full read (in a fastq a donor read with insert position 1 would be written to lines 5,6,7 and 8). +
+ +_\#VBUUID {vbuuid}
+ReadId  R1_InsertPos  FastqR1Out  R2_InsertPos  FastqR2Out
+dReadId1  15  /testdata/vase_R1.fastq   15  /testdata/vase_R2.fastq_ +

+ + +### Donor variant files +A list of paths to donor variant (VCF/BCF) files used in building the variant contexts. A list could be:

+_/path/to/donor1.vcf
+/path/to/donor2.bcf
+/path/to/donor3.vcf_ +

+ + +### Log file +Contains a log of steps and activities performed by VaSeBuilder. More info is added to the log when debug is activated. +The log file contains four fields separated by a tab: Date and time, name of the logger (VaSe_Logger), log level, +message. The first two lines of the log always display the issued command to run VaSeBuilder and the VaSeBuilder uuid +and creation date and time A VaSeBuilder log file therefore follows the format:
+ +_YYYY-MM-DD  HH:MM:SS,SSS  INFO  python vase.py -c test.cfg
+YYYY-MM-DD  HH:MM:SS,SSS  INFO  VaSeBuilder: {uuid} ; YYYY-MM-DD HH:MM:SS,SSS
+YYYY-MM-DD  HH:MM:SS,SSS  INFO  Running VaSeBuilder in F-mode_ +

+ + +### P-mode link file +When VaSeBuilder is run in P-mode, donor fastq files are outputted for each variant context. To link the variant +context file and the outputted donor fastq files, an extra file is created consisting of three columns: +

+_contextid_1  /path/to/fastq_R1.fq  /path/to/fastq_R1.fq
+contextid_2  /path/to/fastq_R2.fq  /path/to/fastq_R2.fq_ +
+ +Each line represents a variant context. The first line in the P-mode link file is the uuid of the VaSeBuilder run. This +is used to the variant context file and the fastq files for a certain P-mode run. +

+ + +### Validation fastq files +Validation fastq files are valid fastq files based on the provided template fastq files with acceptor and donor reads +exchanged based on the variant contexts. Currently (September 24th, 2019) donor reads are added at the end of the +validation fastq files. +

+ + +### Variant context file +The variant context file is a tab separated file containing the essential data of the variant contexts in 13 columns. +This file is one of the important output files as it contains the windows created to search reads and which acceptor +reads are exchanged for which donor reads. The context data is preceded by a header starting with a #. The default +output name for the variant context file is _varcon.txt_.
+ +Variant context file columns: +* __ContextId:__ The identifier of the context (consists of the chromosome name and variant positions connected with +an '_') +* __DonorSample:__ The sample name/identifier of the variant from which the context is constructed. +* __Chrom:__ The name of the chromosome the context is located on. +* __Origin:__ This is the genomic position of the donor variant the acceptor, donor and variant context +* __Start:__ The starting, or leftmost genomic, position of the variant context. +* __End:__ The ending, or rightmost genomic, position of the variant context. +* __AcceptorContextLength:__ The length of the acceptor context that constructed the variant context. +* __DonorContextLength:__ The length of the donor context that constructed the variant context. +* __AcceptorReads:__ The number of acceptor reads and mates overlapping with the variant context. +* __DonorReads:__ The number of donor reads and mates overlapping with the variant context. +* __ADratio:__ The ratio of acceptor reads over donor reads that will be exchanged. +* __AcceptorReadIds:__ The read identifiers (one per mate pair) overlapping with the variant context. +* __DonorReadsIds:__ The read identifiers (one per mate pair) overlapping with the variant context. + +The first line in each variant context file is the uuid of the VaSeBuilder run that constructed the variant contexts. A +variant context file looks like:
+ +_#VBUUID: {uuid}
+\#ContextId  DonorSample  Chrom  Origin  Start  End  AcceptorContextLength + DonorContextLength  AcceptorReads  DonorReads  ADratio  AcceptorReadsIds + DonorReadIds
+1_100  Sample1  1  100  50  150  75  75  2  2  1.0 + aReadId1,aReadId2  dReadId1,dReadId2_ +

+ + +### Variant context statistics file +The variant context statistics file contains some statistics about the reads and mates overlapping with the variant +contexts. Per variant context the read statistics consists, in order, of the average and median read length, q-score +and mapq values for acceptor and donor reads overlapping with the variant context. +

+_#ContextId  Avg_ALen  Avg_DLen  Med_ALen  Med_DLen  Avg_AQual  Avg_DQual  Med_AQual + Med_DQual  Avg_AMapQ  Avg_DMapQ  Med_AMapQ  Med_DMapQ
+1_100  151  151  151.0  151.0  36.5  35.6  37.0  38.0  54.6  55.7 + 60.0  60.0_ +

+ + +### Variant context bed file +A BED file with the variant context entries in four columns: Chromosome name, context start, context end, context +identifier. The output name for the BED file is _variantcontexts.bed_. The resulting file looks like:
+ +_1  100  1000  1_500
+2  200  2000  2_1000_ +

+ + +## Debug output files +When debug is set to True multiple extra files are outputted as well providing more in depth information. +

+ + +### Variant context acceptor/donor read positions file +Much like the acceptor/donor read positions the variant context acceptor positions and variant context donor positions +files save the leftmost R1 and rightmost R2 acceptor and donor read positions respectively. +

+ + +### Variant context unmapped acceptor/donor read mates file +Records the identifiers of reads that have an unmapped read mate in a tab separated manner. The file contains three +columns: Context identifier, sample name/identifier, read identifiers.

+ +_#ContextId  SampleId  ReadIds
+1_100 Sample1  uReadId1,uReadId2,uReadId3_ + + +### Acceptor/Donor contexts file +The acceptor/donor context file is essentially a variant context file but for acceptor or donor contexts respectively. +The acceptor/donor context file differs in the number of columns and only has two additional columns after ContextId - +End, namely NumOfReads and ReadIds. + +_VBUUID: {uuid}
+\#ContextId  DonorSample  Chrom  Origin  Start  End  AcceptorContextLength + DonorContextLength  NumOfReads  ReadIds
+1_100  Sample1  1  100  50  150  2  dReadId1,dReadId2_ +

+ + +### Acceptor/Donor context statistics file +Similar to the variant context statistics file, the acceptor/donor context statistics file contains basic statistics of +the reads overlapping with acceptor or donor contexts. +

+_#ContextId &empsp;Avg_ReadLen  MedReadLen  AvgReadQual  Med_ReadQual  AvgReadMapQ + MedReadMapQ
+1_100  151  151.0  37.9  39.0  57.1  60.0_ +

+ + +### Acceptor/Donor context read positions file +The read positions contains the leftmost genomic positions of all R1 reads and rightmost genomic positions of all R2 +reads in all acceptor or donor contexts. These positions can be plotted, if needed, to gain more insight into the outer +ranges of the reads associated with a variant context. The file contains three columns: context identifier, left +positions, right positions. Each leftmost and rightmost position is separated by a comma.

+_#ContextId  LeftPos  RightPos
+1_100  25,50,75  75,50,25_ +

+ + +### Acceptor/Donor context unmapped read mates file +Reads overlapping with the acceptor/donor contexts with unmapped read mates encountered during a VaSeBuilder run are +written the the acceptor_unmapped.txt or donor_unmapped.txt +

diff --git a/tests/TestDonorBamRead.py b/tests/TestDonorBamRead.py index 0c8a672..3a1f3f5 100644 --- a/tests/TestDonorBamRead.py +++ b/tests/TestDonorBamRead.py @@ -6,17 +6,25 @@ class TestDonorBamRead(unittest.TestCase): # Creates the answer variables and the DonorBamRead used for testing def setUp(self): self.read_id_answer = "HHKY2CCXX160108:1:2122:24160:2522" + self.read_flag_answer = "143" self.read_pn_answer = "1" self.read_chrom_answer = "21" self.read_pos_answer = 9411193 self.read_len_answer = 151 + self.read_end_answer = 9411331 + self.read_cigarstring_answer = "13S138M" + self.read_rnext_answer = "21" + self.read_pnext_answer = 9411300 + self.read_tlen_answer = 4398 self.read_seq_answer = "AGAAAAAGTCTTTAGATGGGATCTTCCTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGTCCAATCAGA" \ "CTGAAATGCCTTGAGGCTAGATTTCAGTCTTTGTGGCAGCTGGTGAATTTCTAGTTTGCCTTTTCA" self.read_quals_answer = "><=???>==<=====<====<=<==<==<=====<============<==<========<=====<=<==<==>>==>>>>=>" \ ">==>>=>>>>>>>>>=>>>>>>=>>>=>>>=>>>>>?????????>=>>???>??????@@@?><:8>" self.read_map_q_answer = 40 - self.dbam_read = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_len_answer, self.read_seq_answer, + self.dbam_read = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_len_answer, + self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_map_q_answer) # ====================PERFORM THE TESTS FOR THE GETTER METHODS==================== @@ -24,12 +32,16 @@ def test_get_bam_read_id(self): self.assertEqual(self.dbam_read.get_bam_read_id(), self.read_id_answer, "Both BAM read identifiers should have " f"been {self.read_id_answer}") + def tet_get_bam_read_flag(self): + self.assertEqual(self.dbam_read.get_bam_read_flag(), self.read_flag_answer, + f"Both read flag should have been {self.read_flag_answer}") + def test_get_bam_read_pair_number(self): - self.assertEquals(self.dbam_read.get_bam_read_pair_number(), self.read_pn_answer, "Both read pair numbers " + self.assertEqual(self.dbam_read.get_bam_read_pair_number(), self.read_pn_answer, "Both read pair numbers " f"should have been {self.read_pn_answer}") def test_get_bam_read_chrom(self): - self.assertEquals(self.dbam_read.get_bam_read_chrom(), self.read_chrom_answer, "Both read chromosomes should " + self.assertEqual(self.dbam_read.get_bam_read_chrom(), self.read_chrom_answer, "Both read chromosomes should " f"have been {self.read_chrom_answer}") def test_get_bam_read_ref_pos(self): @@ -41,9 +53,28 @@ def test_get_bam_read_length(self): f"been {self.read_len_answer}") def test_get_bam_read_ref_end(self): - read_end_answer = 9411344 - self.assertEqual(self.dbam_read.get_bam_read_ref_end(), read_end_answer, "Both read end positions should have " - f"been {read_end_answer}") + self.assertEqual(self.dbam_read.get_bam_read_ref_end(), self.read_end_answer, "Both read end positions should have " + f"been {self.read_end_answer}") + + def test_get_bam_read_cigar(self): + self.assertEqual(self.dbam_read.get_bam_read_cigar(), self.read_cigarstring_answer, + f"Both read CIGAR strings should have been {self.read_cigarstring_answer}") + + def test_get_bam_read_rnext(self): + self.assertEqual(self.dbam_read.get_bam_read_rnext(), self.read_rnext_answer, + f"Both read RNEXT values should have been {self.read_rnext_answer}") + + def test_get_bam_read_pnext(self): + self.assertEqual(self.dbam_read.get_bam_read_pnext(), self.read_pnext_answer, + f"Both read PNEXT values should have been {self.read_pnext_answer}") + + def test_get_bam_read_tlen(self): + self.assertEqual(self.dbam_read.get_bam_read_tlen(), self.read_tlen_answer, + f"Both read TLEN values should have been {self.read_tlen_answer}") + + def test_get_bam_read_sequence(self): + self.assertEqual(self.dbam_read.get_bam_read_sequence(), self.read_seq_answer, + f"Both read sequences should have been {self.read_seq_answer}") def test_get_bam_read_qual(self): self.assertEqual(self.dbam_read.get_bam_read_qual(), self.read_quals_answer, "Both read qualities should have " @@ -64,6 +95,10 @@ def test_get_bam_read_mapq(self): self.assertEqual(self.dbam_read.get_mapping_qual(), self.read_map_q_answer, "Both MapQ values should have been " f"{self.read_map_q_answer}") + def test_read_has_hardclipped_bases(self): + self.assertFalse(self.dbam_read.read_has_hardclipped_bases(), + f"Read with {self.read_cigarstring_answer} should not have hard clipped bases.") + # ====================PERFORM THE TESTS FOR THE STATISTICS METHODS==================== def test_get_average_q_score(self): avg_qscore_answer = 28.490066225165563 @@ -100,3 +135,11 @@ def test_get_as_fast_q_seq_nopairnum(self): fastq_answer = f"@{self.read_id_answer}\n{self.read_seq_answer}\n+\n{self.read_quals_answer}\n" self.assertEqual(self.dbam_read.get_as_fastq_seq(), fastq_answer, "Both answers should have been " f"{fastq_answer}") + + def test_get_as_bam_read(self): + bamcram_entry = f"{self.read_id_answer}\t{self.read_flag_answer}\t{self.read_chrom_answer}\t" \ + f"{self.read_pos_answer}\t{self.read_map_q_answer}\t{self.read_cigarstring_answer}\t" \ + f"{self.read_rnext_answer}\t{self.read_pnext_answer}\t{self.read_tlen_answer}\t{self.read_seq_answer}" \ + f"{self.read_quals_answer}" + self.assertEqual(self.dbam_read.get_as_bam_read(), bamcram_entry, + f"The returned entry shoud have been {bamcram_entry}") diff --git a/tests/TestOverlapContext.py b/tests/TestOverlapContext.py index 6f12d6f..371ca54 100644 --- a/tests/TestOverlapContext.py +++ b/tests/TestOverlapContext.py @@ -8,23 +8,35 @@ class TestOverlapContext(unittest.TestCase): def setUp(self): # Create three DonorBamRead objects to add to the overlap context self.read_id_answer = "HHKY2CCXX160108:1:2122:24160:2522" + self.read_flag_answer = "143" self.read_pn_answer = "1" self.read_chrom_answer = "21" self.read_pos_answer = 9411193 self.read_len_answer = 151 + self.read_end_answer = 9411331 + self.read_cigarstring_answer = "13S138M" + self.read_rnext_answer = "21" + self.read_pnext_answer = 9411300 + self.read_tlen_answer = 4398 self.read_seq_answer = "AGAAAAAGTCTTTAGATGGGATCTTCCTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGTCCAATCAGA" \ "CTGAAATGCCTTGAGGCTAGATTTCAGTCTTTGTGGCAGCTGGTGAATTTCTAGTTTGCCTTTTCA" self.read_quals_answer = "><=???>==<=====<====<=<==<==<=====<============<==<========<=====<=<==<==>>==>>>>=>" \ ">==>>=>>>>>>>>>=>>>>>>=>>>=>>>=>>>>>?????????>=>>???>??????@@@?><:8>" self.read_mapq_answer = 40 - self.con_read1 = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_len_answer, self.read_seq_answer, + self.con_read1 = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_len_answer, + self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_mapq_answer) - self.con_read2 = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_len_answer, self.read_seq_answer, + self.con_read2 = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_len_answer, + self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_mapq_answer) - self.con_read3 = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_len_answer, self.read_seq_answer, + self.con_read3 = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_len_answer, + self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_mapq_answer) # Create the overlap context to test @@ -36,7 +48,7 @@ def setUp(self): self.context_end_answer = 9411344 self.context_len_answer = 151 self.context_bam_reads_answer = [self.con_read1, self.con_read2, self.con_read3] - self.unmapped_answer = [] + self.unmapped_answer = ["uread_1", "uread_2", "uread_3"] self.overlap_context = OverlapContext(self.context_id_answer, self.context_sample_answer, self.context_chrom_answer, self.context_origin_answer, self.context_start_answer, self.context_end_answer, @@ -55,6 +67,12 @@ def setUp(self): self.avg_med_read_mapq_answer = [40, 40] # ====================PERFORM THE TESTS FOR THE GETTER METHODS==================== + def test_get_context(self): + context_answer = [self.context_chrom_answer, self.context_origin_answer, self.context_start_answer, + self.context_end_answer] + self.assertListEqual(self.overlap_context.get_context(), context_answer, + f"The returned context array should have been {context_answer}") + def test_get_context_id(self): self.assertEqual(self.overlap_context.get_context_id(), self.context_id_answer, "The context id should have " f"been {self.context_id_answer}") @@ -148,10 +166,10 @@ def test_get_context_bam_read_qualities(self): self.assertEqual(self.overlap_context.get_context_bam_read_qualities(), context_read_quals_answer, "The read " f"qualites should have been {context_read_quals_answer}") - #def test_getContextBamReadQScores(self): - #contextReadQScoresAnswer = self.qscoreAnswer + self.qscoreAnswer + self.qscoreAnswer - #self.assertListEqual(self.overlapContext.getContextBamReadQScores(), contextReadQScoresAnswer, - # f"The Q-scores should have been {contextReadQScoresAnswer}") + def test_get_context_bam_read_q_scores(self): + context_read_q_scores_answer = [self.qscore_answer] + [self.qscore_answer] + [self.qscore_answer] + self.assertListEqual(self.overlap_context.get_context_bam_read_q_scores(), context_read_q_scores_answer, + f"The Q-scores should have been {context_read_q_scores_answer}") def test_get_context_bam_read_map_qs(self): context_read_map_qs_answer = [40, 40, 40] @@ -167,6 +185,29 @@ def test_read_is_in_context_neg(self): self.assertFalse(self.overlap_context.read_is_in_context(read_in_context_answer_neg), "Read id " f"{read_in_context_answer_neg} should not have been in the context") + # ====================PERFORM THE TESTS FOR UNMAPPED MATE IDS==================== + def test_add_unmapped_mate_id(self): + read_id_to_add = "uReadId1" + self.overlap_context.add_unmapped_mate_id(read_id_to_add) + self.assertTrue(read_id_to_add in self.overlap_context.unmapped_read_mate_ids, + f"Read id {read_id_to_add} should have been in the list of reads with unmapped mates") + + def test_set_unmapped_mate_ids(self): + unmapped_read_ids = ["uReadId1", "uReadId2", "uReadId3"] + self.overlap_context.set_unmapped_mate_ids(unmapped_read_ids) + self.assertListEqual(self.overlap_context.unmapped_read_mate_ids, unmapped_read_ids, + f"The list of unmapped read mate ids should have been {unmapped_read_ids}") + + def test_read_has_unmapped_mate_true(self): + read_id_to_search = "uread_1" + self.assertTrue(self.overlap_context.read_has_unmapped_mate(read_id_to_search), + f"Read {read_id_to_search} should have an unmapped mate") + + def test_read_has_unmapped_mate_false(self): + read_id_to_search = "fReadId1" + self.assertFalse(self.overlap_context.read_has_unmapped_mate(read_id_to_search), + f"Read {read_id_to_search} should not have an unmapped mate") + # ====================PERFORM THE TESTS FOR THE OVERLAP CONTEXT STATISTICS==================== def test_get_average_and_median_read_length(self): self.assertListEqual(self.overlap_context.get_average_and_median_read_length(), self.avg_med_read_len_answer, diff --git a/tests/TestParamChecker.py b/tests/TestParamChecker.py index e4c42a8..bb520d9 100644 --- a/tests/TestParamChecker.py +++ b/tests/TestParamChecker.py @@ -355,3 +355,55 @@ def test_required_parameters_set_invalidxcmodeparam(self): set_parameters = {} self.assertFalse(self.param_check.required_parameters_set("XC", set_parameters), "The required parameters for XC mode should not have been ok and therefore return False") + + # =====TEST SOME OTHER METHODS===== + def test_get_required_runmode_parameters_acmode(self): + ac_params_answer = ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin", "out"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("AC"), ac_params_answer, + f"The list of required AC-mode parameters should have been {ac_params_answer}") + + def test_get_required_runmode_parameters_dmode(self): + d_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("D"), d_params_answer, + f"The list of required D-mode parameters should have been {d_params_answer}") + + def test_get_required_runmode_parameters_dcmode(self): + dc_params_answer = ["runmode", "donorvcf", "donorbam", "out", "reference", "varconin"] + self.assertListEqual(self.param_check.get_required_runmode_parameters(""), dc_params_answer, + f"The list of required DC-mode parameters should have been {dc_params_answer}") + + def test_get_required_runmode_parameters_fmode(self): + f_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "templatefq1", "templatefq2", "out", + "reference"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("F"), f_params_answer, + f"The list of required F-mode parameters should have been {f_params_answer}") + + def test_get_required_runmode_parameters_fcmode(self): + fc_params_answer = ["runmode", "donorvcf", "donorbam", "templatefq1", "templatefq2", "out", "reference", + "varconin"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("FC"), fc_params_answer, + f"the list of required FC-mode parameters should have been {fc_params_answer}") + + def test_get_required_runmode_parameters_pmode(self): + p_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("P"), p_params_answer, + f"The list of required parameters should have been {p_params_answer}") + + def test_get_required_runmode_parameters_pcmode(self): + pc_params_answer = ["runmode", "donorvcf", "donorbam", "out", "reference", "varconin"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("PC"), pc_params_answer, + f"The list of required parameters should have been {pc_params_answer}") + + def test_get_required_runmode_parameters_xmode(self): + x_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("X"), x_params_answer, + f"The list of required parameters should have been {x_params_answer}") + + def test_get_required_runmode_parameters_nonexistent(self): + self.assertListEqual(self.param_check.get_required_runmode_parameters("T"), [], + f"The list of required parameters for a non existent mode should have been empty.") + + def test_get_optional_parameters(self): + opt_params_answer = ["fastqout", "varcon", "variantlist", "seed"] + self.assertListEqual(self.param_check.get_optional_parameters(), opt_params_answer, + f"The list of optional parameters should have been {opt_params_answer}") diff --git a/tests/TestVaSeBuilder.py b/tests/TestVaSeBuilder.py index e2c977f..b30edcd 100644 --- a/tests/TestVaSeBuilder.py +++ b/tests/TestVaSeBuilder.py @@ -6,6 +6,7 @@ from unittest.mock import patch, mock_open, call import os import sys +import random import pysam # Import required class @@ -26,6 +27,7 @@ def setUp(self): self.filter_to_use = ["aap", "noot", "mies"] self.vcfvarlist = [] + # ====================PERFORM THE TESTS FOR THE FILTER METHOD==================== # Tests that a value is indeed in the inclusion filter def test_passes_filter_posincl(self): val_to_use = "noot" @@ -58,7 +60,7 @@ def test_get_sample_vcf_variants_pos(self): variant_list_answer.sort() obtained_variant_list = self.vs_builder.get_sample_vcf_variants(vcffile) - obtained_list = [x.to_strng() for x in obtained_variant_list] + obtained_list = [x.to_string() for x in obtained_variant_list] obtained_list.sort() self.assertListEqual(obtained_list, variant_list_answer, "The obtained list of reads should have been " @@ -83,7 +85,7 @@ def test_determine_variant_type_indel(self): # Tests that an indel is determined correctly def test_determine_variant_type_indel2(self): variant_ref = "C,CGATC" - variant_alts = ("C") + variant_alts = ("C",) variant_type_answer = "indel" self.assertEqual(self.vs_builder.determine_variant_type(variant_ref, variant_alts), variant_type_answer, "The determined variant type should have been an indel") @@ -177,24 +179,27 @@ def test_is_required_read_neg(self): "Should have been false for forward read") # Tests setting the fastq output path for the F (R1) file - def test_setFastqOutPath(self): - output_path = "/aap/noot/mies" - fastq_out_name = "" + def test_set_fastq_out_path_r1(self): + outprefix = "/test/outdata/VaSe" + lanenum = 3 + outpath_answer = f"{outprefix}_{datetime.now().date()}_L{lanenum}_R1.fastq" + self.assertEqual(self.vs_builder.set_fastq_out_path(outprefix, '1', lanenum), outpath_answer, + f"The returned output path for R1 should have been {outpath_answer}") + + def test_set_fastq_out_path_r2(self): + outprefix = "/test/outdata/VaSe" + lanenum = 3 + outpath_answer = f"{outprefix}_{datetime.now().date()}_L{lanenum}_R2.fastq" + self.assertEqual(self.vs_builder.set_fastq_out_path(outprefix, '2', lanenum), outpath_answer, + f"The returned output path for R2 should have been {outpath_answer}") # Tests that the creation id of the VaSeBuilder object is set correctly - def test_getCreationId(self): + def test_get_creation_id(self): creationid_answer = "piet" vasebuilder_obj = VaSeBuilder(creationid_answer) self.assertEqual(vasebuilder_obj.get_creation_id(), creationid_answer, f"The returned VaSeBuilder identifier should have been: {creationid_answer}") - # Tests that the creation date of the VaSeBuilder is set correctly. - def test_get_creation_date(self): - creation_data_answer = datetime.now().date() - vasebuilder_obj = VaSeBuilder("aap") - self.assertEqual(vasebuilder_obj.get_creation_date(), creation_data_answer, - f"The returned VaSeBuilder creation data should have been: {creation_data_answer}") - # Tests that a saved context for a specified variant is indeed returned. def test_get_variant_context_pos(self): self.vs_builder.variantContextMap = self.var_con_map @@ -211,22 +216,109 @@ def test_get_variant_context_neg(self): # = # Tests that a list of donor is divided without a remainder list - def test_divide_donorfastqs_over_acceptors_noremainder(self): + def test_divide_donorfastqs_over_acceptors_equal(self): donorfq_list = ['aR1.fq', 'bR1.fq', 'cR1.fq', 'dR1.fq', 'eR1.fq', 'fR1.fq'] - divlist_answer = [['aR1.fq', 'bR1.fq'], ['cR1.fq', 'dR1.fq'], ['eR1.fq', 'fR1.fq']] + divlist_answer = [['fR1.fq', 'cR1.fq'], ['eR1.fq', 'bR1.fq'], ['dR1.fq', 'aR1.fq']] self.assertListEqual(self.vs_builder.divide_donorfastqs_over_acceptors(donorfq_list, 3), divlist_answer, f"The returned divided donor fastq list should have been {divlist_answer}") # Tests that a list of donors is divided with a remainder list - def test_divide_donorfastqs_over_acceptors_withremainder(self): + def test_divide_donorfastqs_over_acceptors_unequal(self): donorfq_list = ['aR1.fq', 'bR1.fq', 'cR1.fq', 'dR1.fq', 'eR1.fq'] - divlist_answer = [['aR1.fq', 'bR1.fq'], ['cR1.fq', 'dR1.fq'], ['eR1.fq']] + divlist_answer = [['eR1.fq', 'cR1.fq', 'aR1.fq'], ['dR1.fq', 'bR1.fq']] self.assertListEqual(self.vs_builder.divide_donorfastqs_over_acceptors(donorfq_list, 2), divlist_answer, f"The returned divided donor fastq list should have been {divlist_answer}") - # Tests that the remainders are added to the proper lists. - def test_divide_remaining_donors(self): - divdonors = [['aR1.fq', 'bR1.fq'], ['cR1.fq', 'dR1.fq'], ['eR1.fq']] - cordivdonors = [['aR1.fq', 'bR1.fq', 'eR1.fq'], ['cR1.fq', 'dR1.fq']] - self.assertListEqual(self.vs_builder.divide_remaining_donors(divdonors), cordivdonors, - f"The correct divided donor fastq list should have been {cordivdonors}") + # ====================PERFORM TESTS FOR THE SHUFFLING METHODS==================== + # Tests that the shuffling of donor read works as expected + def test_shuffle_donor_read_identifiers(self): + read_ids = ["Read4", "Read3", "Read5", "Read1", "Read2"] + + shuffled_answer = ["Read3", "Read2", "Read4", "Read5", "Read1"] + shuffled_answer = read_ids.copy() + shuffled_answer.sort() + random.seed(2) + random.shuffle(shuffled_answer) + + self.assertListEqual(self.vs_builder.shuffle_donor_read_identifiers(read_ids, 2), shuffled_answer, + f"The correct shuffled donor read ids should have been {shuffled_answer}") + + def test_shuffle_donor_add_positions(self): + add_positions = [6, 8, 2, 0, 1] + shuffled_answer = add_positions.copy() + + def test_get_saved_insert_position_r1(self): + read_data = ('1', 100, '2', 100) + r1pos_answer = 100 + self.assertEqual(self.vs_builder.get_saved_insert_position('1', read_data), r1pos_answer, + f"The R1 read insert position should have been {r1pos_answer}") + + def test_get_saved_insert_position_r2(self): + read_data = ('1', 100, '2', 150) + r2pos_answer = 150 + self.assertEqual(self.vs_builder.get_saved_insert_position('2', read_data), r2pos_answer, + f"The R2 read insert position should have been {r2pos_answer}") + + def test_get_saved_insert_position_nor1(self): + read_data = ('2', 150) + r1pos_answer = 'NA' + self.assertEqual(self.vs_builder.get_saved_insert_position('1', read_data), r1pos_answer, + "The R1 read insert position should have been NA") + + def test_get_saved_insert_position_nor2(self): + read_data = ('1', 100) + r2pos_answer = 'NA' + self.assertEqual(self.vs_builder.get_saved_insert_position('2', read_data), r2pos_answer, + "The R2 read insert position should have been NA") + + # Adds a donor insert data position for a new fastq + def test_add_donor_insert_data_addnewfq(self): + donor_add_data = {} + fq_name = "ifq.fastq" + read_id = "iReadId1" + read_pairnum = "1" + insert_pos = 100 + donor_add_data_answer = {"ifq.fastq": [("1", 100)]} + self.vs_builder.add_donor_insert_data(fq_name, read_id, read_pairnum, insert_pos, donor_add_data) + self.assertDictEqual(donor_add_data, donor_add_data_answer, + f"Both donor read add data should have been {donor_add_data_answer}") + + # Tests adding a new donor insert position data for an existing fastq + def test_add_donor_insert_data_addnewread(self): + donor_add_data = {"ifq.fastq": []} + fq_name = "ifq.fastq" + read_id = "iReadId1" + read_pairnum = "1" + insert_pos = 100 + donor_add_data_answer = {"ifq.fastq": [("1", 100)]} + self.vs_builder.add_donor_insert_data(fq_name, read_id, read_pairnum, insert_pos, donor_add_data) + self.assertDictEqual(donor_add_data, donor_add_data_answer, + f"Both donor read add data should have been {donor_add_data_answer}") + + def test_add_donor_insert_data_addadditional(self): + donor_add_data = {"ifq.fastq": [("1", 100)]} + fq_name = "ifq.fastq" + read_id = "iReadId1" + read_pairnum = "2" + insert_pos = 100 + donor_add_data_answer = {"ifq.fastq": [("1", 100, "2", 100)]} + self.vs_builder.add_donor_insert_data(fq_name, read_id, read_pairnum, insert_pos, donor_add_data) + self.assertDictEqual(donor_add_data, donor_add_data_answer, + f"Both donor read add data should have been {donor_add_data_answer}") + + def test_link_donor_addpos_reads_v2(self): + addposlist = [6, 8] + readidlist = ['dRead1', 'dRead2'] + donor_reads = [('dRead1', '1', 'ACAT', '<<>>'), ('dRead1', '2', 'TACA', '>><<'), + ('dRead2', '1', 'CAAT', '<><>'), ('dRead2', '2', 'TAAC', '><><')] + donor_addpos_link_answer = {6: [('dRead1', '1', 'ACAT', '<<>>'), ('dRead1', '2', 'TACA', '>><<')], + 8: [('dRead2', '1', 'CAAT', '<><>'), ('dRead2', '2', 'TAAC', '><><')]} + self.assertDictEqual(self.vs_builder.link_donor_addpos_reads_v2(addposlist, readidlist, donor_reads), + donor_addpos_link_answer, + f"Both donor read/add position link maps should have been {donor_addpos_link_answer}") + + def test_check_template_size(self): + num_of_template_reads = 56732 + obtained_template_size = self.vs_builder.check_template_size("testdata/fqDir/SRR1039513_1.fastq.gz") + self.assertEqual(obtained_template_size, num_of_template_reads, + f"The number of template reads was expected to be {num_of_template_reads}") diff --git a/tests/TestVariantContext.py b/tests/TestVariantContext.py index c24d87f..265e332 100644 --- a/tests/TestVariantContext.py +++ b/tests/TestVariantContext.py @@ -9,22 +9,33 @@ def setUp(self): # Create the variables saving the context BAM reads self.read_id_answer = "HHKY2CCXX160108:1:2122:24160:2522" self.read_donor_id_answer = "HHKY2CCXX160108:1:2122:24160:2555" + self.read_flag_answer = "143" self.read_pn_answer = "1" self.read_donor_pn_answer = "2" self.read_chrom_answer = "21" self.read_pos_answer = 9411193 self.read_len_answer = 151 self.read_donor_len_answer = 108 + self.read_end_answer = 9411334 + self.read_cigarstring_answer = "13S138M" + self.read_donor_cigarstring_answer = "43H108M" + self.read_rnext_answer = "21" + self.read_pnext_answer = 9511300 + self.read_tlen_answer = 400 self.read_seq_answer = "AGAAAAAGTCTTTAGATGGGATCTTCCTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGTCCAATCAGA" \ "CTGAAATGCCTTGAGGCTAGATTTCAGTCTTTGTGGCAGCTGGTGAATTTCTAGTTTGCCTTTTCA" self.read_quals_answer = "><=???>==<=====<====<=<==<==<=====<============<==<========<=====<=<==<==>>==>>>>=>" \ ">==>>=>>>>>>>>>=>>>>>>=>>>=>>>=>>>>>?????????>=>>???>??????@@@?><:8>" self.read_map_q_answer = 40 - self.acceptor_read = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_len_answer, self.read_seq_answer, + self.acceptor_read = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_len_answer, + self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_map_q_answer) - self.donor_read = DonorBamRead(self.read_donor_id_answer, self.read_donor_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_donor_len_answer, self.read_seq_answer, + self.donor_read = DonorBamRead(self.read_donor_id_answer, self.read_flag_answer, self.read_donor_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_donor_len_answer, + self.read_end_answer, self.read_donor_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_map_q_answer) self.acceptor_read_ids_answer = ["HHKY2CCXX160108:1:2122:24160:2522"] diff --git a/tests/TestVcfVariant.py b/tests/TestVcfVariant.py index 911ea80..768f5e2 100644 --- a/tests/TestVcfVariant.py +++ b/tests/TestVcfVariant.py @@ -8,7 +8,7 @@ def setUp(self): self.variant_pos_answer = 100 self.variant_ref_answer = "C" self.variant_alts_answer = ("A", "G") - self.variant_filter_answer = ["PASS"] + self.variant_filter_answer = "PASS" self.variant_type_answer = "snp" self.variant_id_answer = f"{self.variant_chrom_answer}_{self.variant_pos_answer}" self.vcfvariant_obj_answer = VcfVariant(self.variant_chrom_answer, self.variant_pos_answer, @@ -58,14 +58,14 @@ def test_set_variant_alt_alleles(self): f"alleles should have been: {varalts_toset}") def test_get_variant_filter(self): - self.assertListEqual(self.vcfvariant_obj_answer.get_variant_filter(), self.variant_filter_answer, "Both variant" - f" filters should have been: {self.variant_filter_answer}") + self.assertEqual(self.vcfvariant_obj_answer.get_variant_filter(), self.variant_filter_answer, + f"Both variant filters should have been: {self.variant_filter_answer}") def test_set_variant_filter(self): - varfilter_touse = ['q10'] + varfilter_touse = "q10" self.vcfvariant_obj_answer.set_variant_filter(varfilter_touse) - self.assertListEqual(self.vcfvariant_obj_answer.get_variant_filter(), varfilter_touse, "The set variant filter " - f"should have been: {varfilter_touse}") + self.assertEqual(self.vcfvariant_obj_answer.get_variant_filter(), varfilter_touse, "The set variant filter " + f"should have been: {varfilter_touse}") def test_get_variant_type(self): self.assertEqual(self.vcfvariant_obj_answer.get_variant_type(), self.variant_type_answer, "Both variant types " diff --git a/vase.py b/vase.py index 05d328a..95d70e0 100644 --- a/vase.py +++ b/vase.py @@ -12,36 +12,53 @@ from ParamChecker import ParamChecker from VcfBamScanner import VcfBamScanner from VaSeBuilder import VaSeBuilder +from VariantContextFile import VariantContextFile class VaSe: - # Performs the check that VaSe is run with Python 3.x def __init__(self): + """Checks both the python and pysam version. + + If the python version is not at least 3.6 or higher the program will not run as f-strings, that are used in this + program were not available in older versions. The program also requires pysam 0.15 or higher as some pysma + functions used in VaseBuilder are only available since pysam 0.15. + + Attributes + ---------- + valid_runmodes : list of str + Valid VaSeBuilder run modes + """ assert (sys.version_info[0] >= 3 and sys.version_info[1] >= 6), "Please run this program in Python 3.6 or " \ "higher" assert (int(pysam.version.__version__.split(".")[0]) >= 0 and int(pysam.version.__version__.split(".")[1]) >= 15), "Please run this program with Pysam 0.15 or higher" - self.valid_runmodes = ["A", "AC", "D", "DC", "F", "FC", "P", "PC", "X", "XC"] + self.valid_runmodes = ["AC", "D", "DC", "F", "FC", "P", "PC", "X"] # Runs the program. def main(self): + """Runs VaSeBuilder and performs all the work. + """ # Parse the command line parameters and check their validity. vase_arg_list = self.get_vase_parameters() pmc = ParamChecker() - # Start the logger and initialize this run with an ID number. - self.vaselogger = self.start_logger(pmc, vase_arg_list["log"], vase_arg_list["debug"]) - - vase_called_command = " ".join(sys.argv) - self.vaselogger.info(f"python {vase_called_command}") - vase_b = VaSeBuilder(uuid.uuid4().hex) # Check whether a configuration file was supplied than all required command line parameters. if vase_arg_list["configfile"] is not None: configfileloc = vase_arg_list["configfile"] vase_arg_list = self.read_config_file(configfileloc) # Set the read config file as the parameter list + # Check whether the DEBUG parameters has been set or not + if "debug" not in vase_arg_list: + vase_arg_list["debug"] = False # Check optional parameters and set those missing to None vase_arg_list = pmc.optional_parameters_set(vase_arg_list) + # Start the logger and initialize this run with an ID number. + self.vaselogger = self.start_logger(pmc, vase_arg_list["log"], vase_arg_list["debug"]) + + vase_called_command = " ".join(sys.argv) + self.vaselogger.info(f"python {vase_called_command}") + vase_b = VaSeBuilder(uuid.uuid4().hex) + # Exit if not all of the required parameters have been set if not pmc.required_parameters_set(vase_arg_list["runmode"], vase_arg_list): self.vaselogger.critical("Not all required parameters have been set") @@ -52,48 +69,13 @@ def main(self): self.vaselogger.critical("Not all required parameters are correct. Please check log for more info.") exit() - if "A" in pmc.runmode: - donor_fastq_files = self.read_donor_fastq_list_file(pmc.get_donorfqlist()) - r1_dfqs = [dfq[0] for dfq in donor_fastq_files] - r2_dfqs = [dfq[1] for dfq in donor_fastq_files] - vase_b.build_validation_from_donor_fastqs(pmc.get_first_fastq_in_location(), - pmc.get_second_fastq_in_location(), - r1_dfqs, r2_dfqs, pmc.varconin, pmc.get_fastq_out_location()) - else: - # Scan the variant and alignment files in the provided lists. - vbscan = VcfBamScanner() - vcf_file_map = vbscan.scan_vcf_files(vase_arg_list["donorvcf"]) - bam_file_map = vbscan.scan_bamcram_files(vase_arg_list["donorbam"]) - sample_id_list = vbscan.get_complete_sample_ids() + # Check if a variantfilter has been set. + variantfilter = None + if pmc.get_variant_list_location() != "": + variantfilter = self.read_variant_list(pmc.get_variant_list_location()) - # Exit if no valid pairs of variant/alignment files are found. - if not sample_id_list: - self.vaselogger.critical("No valid samples available to " - "create new validation set") - exit() - - variantfilter = None - if pmc.get_variant_list_location() != "": - variantfilter = self.read_variant_list(pmc.get_variant_list_location()) - - if "C" not in pmc.runmode: - vase_b.build_varcon_set(sample_id_list, - vcf_file_map, bam_file_map, - pmc.get_acceptor_bam(), - pmc.get_out_dir_location(), - pmc.get_reference_file_location(), - pmc.get_variant_context_out_location(), - variantfilter) - elif "C" in pmc.runmode: - vase_b.build_donor_from_varcon(pmc.varconin, - bam_file_map, - pmc.get_reference_file_location()) - if "X" not in pmc.runmode: - vase_b.build_validation_set(pmc.runmode, - pmc.get_acceptor_bam(), - pmc.get_first_fastq_in_location(), - pmc.get_second_fastq_in_location(), - pmc.get_fastq_out_location()) + # Run the selected mode. + self.run_selected_mode(pmc.get_runmode(), vase_b, pmc, variantfilter, pmc.get_random_seed_value()) self.vaselogger.info("VaSeBuilder run completed succesfully.") elapsed = time.strftime( @@ -102,9 +84,16 @@ def main(self): ) self.vaselogger.info(f"Elapsed time: {elapsed}.") - # Method that creates the logger that will write the log to stdout - # and a log file. def start_logger(self, paramcheck, logloc, debug_mode=False): + """Starts and returns the logger VaSe_Logger. + + The logger writes both to stdout and the specified logfile. + + Returns + ------- + vaselogger : Logger + Logging utility to log VaSeBuilder activity + """ vaselogger = logging.getLogger("VaSe_Logger") if debug_mode: vaselogger.setLevel(logging.DEBUG) @@ -135,31 +124,68 @@ def start_logger(self, paramcheck, logloc, debug_mode=False): vaselogger.addHandler(vase_file_handler) return vaselogger - # Returns the vase Parameters. def get_vase_parameters(self): + """Creates a command line argument parser and returns the parameter values. + + Returns + ------- + vase_args : dict + Command line parameters and set values + """ # Set the VaSe parameters for the program. vase_argpars = argparse.ArgumentParser() - vase_argpars.add_argument("-m", "--runmode", dest="runmode", default="F", choices=self.valid_runmodes, help="RUNMODE HELP") - vase_argpars.add_argument("-v", "--donorvcf", dest="donorvcf", help="File containing a list of VCF/VCF.GZ/BCF files.") + vase_argpars.add_argument("-m", "--runmode", dest="runmode", default="F", choices=self.valid_runmodes, + help="RUNMODE HELP") + vase_argpars.add_argument("-v", "--donorvcf", dest="donorvcf", + help="File containing a list of VCF/VCF.GZ/BCF files.") vase_argpars.add_argument("-b", "--donorbam", dest="donorbam", help="File containing a list of BAM/CRAM files.") - vase_argpars.add_argument("-a", "--acceptorbam", dest="acceptorbam", help="BAM file for identifying acceptor reads to exclude.") - vase_argpars.add_argument("-1", "--templatefq1", dest="templatefq1", nargs="*", help="Location and name of the first fastq in file.") - vase_argpars.add_argument("-2", "--templatefq2", dest="templatefq2", nargs="*", help="Location and name of the second fastq in file.") + vase_argpars.add_argument("-a", "--acceptorbam", dest="acceptorbam", + help="BAM file for identifying acceptor reads to exclude.") + vase_argpars.add_argument("-1", "--templatefq1", dest="templatefq1", nargs="*", + help="Location and name of the first fastq in file.") + vase_argpars.add_argument("-2", "--templatefq2", dest="templatefq2", nargs="*", + help="Location and name of the second fastq in file.") vase_argpars.add_argument("-o", "--out", dest="out", help="Directory to write output files to.") - vase_argpars.add_argument("-r", "--reference", dest="reference", help="Location of the reference genome. This reference genome should be used by all VCF/BCF and BAM/CRAM files.") - vase_argpars.add_argument("-of", "--fastqout", dest="fastqout", help="Name for the two FastQ files to be produced.") - vase_argpars.add_argument("-ov", "--varcon", dest="varcon", help="File name to write variants and their contexts to.") - vase_argpars.add_argument("-l", "--log", dest="log", help="Location to write log files to (will write to working directory if not used).") - vase_argpars.add_argument("-!", "--debug", dest="debug", action="store_true", help="Run the program in debug mode") - vase_argpars.add_argument("-vl", "--variantlist", dest="variantlist", help="File containing a list of variants to use. Will only use these variants if provided. Will use all variants if no list is provided.") - vase_argpars.add_argument("-iv", "--varconin", dest="varconin", help="Provide a Vasebuilder output variant context file to build a validation set.") - vase_argpars.add_argument("-dq", "--donorfastqs", dest="donorfastqs", help="Location to donor fastq list file") + vase_argpars.add_argument("-r", "--reference", dest="reference", + help="Location of the reference genome. This reference genome should be used by all" + "VCF/BCF and BAM/CRAM files.") + vase_argpars.add_argument("-of", "--fastqout", dest="fastqout", + help="Name for the two FastQ files to be produced.") + vase_argpars.add_argument("-ov", "--varcon", dest="varcon", + help="File name to write variants and their contexts to.") + vase_argpars.add_argument("-l", "--log", dest="log", + help="Location to write log files to (will write to working directory if not used).") + vase_argpars.add_argument("-!", "--debug", dest="debug", action="store_true", + help="Run the program in debug mode") + vase_argpars.add_argument("-vl", "--variantlist", dest="variantlist", + help="File containing a list of variants to use. Will only use these variants if " + "provided. Will use all variants if no list is provided.") + vase_argpars.add_argument("-iv", "--varconin", dest="varconin", + help="Provide a Vasebuilder output variant context file to build a validation set.") + vase_argpars.add_argument("-dq", "--donorfastqs", dest="donorfastqs", + help="Location to donor fastq list file") vase_argpars.add_argument("-c", "--config", dest="configfile", help="Supply a config file") + vase_argpars.add_argument("-s", "--seed", dest="seed", default=2, type=int, + help="Set seed for semi randomly distributing donor reads") vase_args = vars(vase_argpars.parse_args()) return vase_args - # Reads the variant list. Assumes that sampleId, chromosome and startpos are columns 1,2 and 3 def read_variant_list(self, variantlistloc): + """Reads a file containing genomic variants and returns them in a dictionary. + + The file containing the variant is expected to have at least three columns separated by tabs. These should be, + in order: sample name, chromosome name, chromosomal position. + + Parameters + ---------- + variantlistloc : str + The location of the file containing variants + + Returns + ------- + dict + Read variants per sample name + """ variant_filter_list = {} try: with open(variantlistloc) as variantlistfile: @@ -174,8 +200,20 @@ def read_variant_list(self, variantlistloc): finally: return variant_filter_list - # Reads the config file with the settings def read_config_file(self, configfileloc): + """Reads a VaSeBuilder configuration file and returns the parameter values. + + Parameters + ---------- + configfileloc : str + Path to the VaSeBuilder configuration file + + Returns + ------- + configdata : dict + Read parameters and values + """ + debug_param_vals = ["True", "1", "T"] configdata = {} try: with open(configfileloc, "r") as configfile: @@ -191,6 +229,11 @@ def read_config_file(self, configfileloc): if parameter_name == "templatefq1" or parameter_name == "templatefq2": template_files = parameter_value.split(",") configdata[parameter_name] = [tmplfile.strip() for tmplfile in template_files] + # Check if the parameter is the debug parameter + elif parameter_name == "debug": + configdata[parameter_name] = parameter_value.title() in debug_param_vals + elif parameter_name == "seed": + configdata[parameter_name] = int(parameter_value) else: configdata[parameter_name] = parameter_value.strip() except IOError: @@ -199,6 +242,19 @@ def read_config_file(self, configfileloc): # Reads a list of donor fastq files in the format (R1.fq\tR2.fq) def read_donor_fastq_list_file(self, donorfq_listfileloc): + """Reads a file with a list of donor fastq files + + The donor fastq list file is expected to have two columns. The first column should contain the paths to R1 files + and the second column paths to R2 files. For each sample two fastq files are expected, + + Parameters + ---------- + donorfq_listfileloc : str + Path to the donor fastq list file + + Returns + ------- + """ donor_fastqs = [] try: with open(donorfq_listfileloc) as donorfqlistfile: @@ -209,6 +265,72 @@ def read_donor_fastq_list_file(self, donorfq_listfileloc): finally: return donor_fastqs + def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter, randomseed): + """Selects and runs the selected run mode. + + Depending on the specified run mode either a full validation set of fastq files or fastq files containing only + donor data are produced. If the run mode contains a 'C' an existing variant context file will be read, + otherwise it will be build. + + Parameters + ---------- + runmode : str + + vaseb : VaSeBuilder + VaSeBuilder object that will perform the actions + paramcheck : ParamChecker + Utility tha checks whether the parameters are ok + variantfilter + """ + vbscan = VcfBamScanner() + varconfile = None # Declare the varconfile variable so we can use it in C and no-C. + + if "X" in runmode: + vcf_file_map = vbscan.scan_vcf_files(paramcheck.get_valid_vcf_filelist()) + bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist()) + sample_id_list = vbscan.get_complete_sample_ids() + vaseb.run_x_mode(sample_id_list, vcf_file_map, bam_file_map, paramcheck.get_acceptor_bam(), + paramcheck.get_reference_file_location(), paramcheck.get_out_dir_location(), + paramcheck.get_variant_context_out_location(), variantfilter) + else: + if "C" in runmode: + # Read an existing variant context file + varconfile = VariantContextFile(paramcheck.get_variantcontext_infile()) + if "A" in runmode: + donor_fastq_files = self.read_donor_fastq_list_file(paramcheck.get_donorfqlist()) + vaseb.run_ac_mode_v2(paramcheck.get_first_fastq_in_location(), + paramcheck.get_second_fastq_in_location(), + donor_fastq_files, varconfile, randomseed, paramcheck.get_fastq_out_location()) + return + # Refetch the donor reads required when runmode (D,F,P) contains a 'C' + bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist()) + vaseb.refetch_donor_reads(varconfile, bam_file_map, paramcheck.get_reference_file_location()) + else: + # Scan the variant and alignment files in the provided lists. + vcf_file_map = vbscan.scan_vcf_files(paramcheck.get_valid_vcf_filelist()) + bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist()) + sample_id_list = vbscan.get_complete_sample_ids() + # varconfile = vaseb.build_varcon_set(sample_id_list, vcf_file_map, bam_file_map, + # paramcheck.get_acceptor_bam(), paramcheck.get_out_dir_location(), + # paramcheck.get_reference_file_location(), + # paramcheck.get_variant_context_out_location(), variantfilter) + varconfile = vaseb.bvcs(sample_id_list, vcf_file_map, bam_file_map, paramcheck.get_acceptor_bam(), + paramcheck.get_out_dir_location(), paramcheck.get_reference_file_location(), + paramcheck.get_variant_context_out_location(), variantfilter) + + # Check whether contexts were created + if varconfile is None: + return + + # Check for modes D,F,P + if "D" in runmode: + vaseb.run_d_mode(varconfile, paramcheck.get_fastq_out_location()) + if "F" in runmode: + vaseb.run_f_mode(varconfile, paramcheck.get_first_fastq_in_location(), + paramcheck.get_second_fastq_in_location(), paramcheck.get_fastq_out_location()) + if "P" in runmode: + vaseb.run_p_mode(varconfile, paramcheck.get_out_dir_location(), paramcheck.get_fastq_out_location()) + # Run the program. vaseb = VaSe()