From 094659b44d03a7114484387ad03ac0b247666ad8 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 20 Aug 2019 10:59:00 +0200 Subject: [PATCH 01/90] Start restructuring code to easier understand different runmodes --- ParamChecker.py | 4 ++++ VaSeBuilder.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++-- vase.py | 38 +++++++++++++++++++++++++----- 3 files changed, 96 insertions(+), 8 deletions(-) diff --git a/ParamChecker.py b/ParamChecker.py index 4750253..4338f82 100644 --- a/ParamChecker.py +++ b/ParamChecker.py @@ -312,3 +312,7 @@ def get_variant_list_location(self): # Returns the location of the list file containing donor fastq files def get_donorfqlist(self): return self.donorfqlist + + # Returns the specified runmmode + def get_runmode(self): + return self.runmode diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 841e13b..6600072 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1108,15 +1108,18 @@ def divide_remaining_donors(self, split_donorfqlist): add_to_index = 0 return split_donorfqlist - # BUILDS A SET OF R1/R2 VALIDATION FASTQS WITH ALREADY EXISTING + # BUILDS A SET OF R1/R2 VALIDATION FASTQS WITH ALREADY EXISTING DONOR FASTQS def build_fastqs_from_donors(self, acceptor_fqin, donor_fqin, acceptor_reads_toexclude, forward_reverse, outpath): donor_sets = self.divide_donorfastqs_over_acceptors(donor_fqin, len(acceptor_fqin)) donorids = set() + + self.vaselogger.info(f"Start building validation R{forward_reverse} fastq files") for x in range(len(acceptor_fqin)): fqoutname = self.set_fastq_out_path(outpath, forward_reverse, x+1) fqoutfile = open(fqoutname, "w") # Start writing the filtered acceptor file to a new output file + self.vaselogger.info(f"Start building fastq file {fqoutname}") try: acceptorfastq = open(acceptor_fqin[x], "r") for fileline in acceptorfastq: @@ -1139,7 +1142,62 @@ def build_fastqs_from_donors(self, acceptor_fqin, donor_fqin, acceptor_reads_toe except IOError: self.vaselogger.critical(f"Acceptor fastq {acceptor_fqin[x]} could not be opened") - # Add the donor fastq files + # Add the donor fastq files. + self.vaselogger.info(f"Add {len(donor_sets[x])} donor fastq files to {fqoutname}") for donorfastqfile in donor_sets[x]: + self.vaselogger.debug(f"Adding {donorfastqfile} to {fqoutname}") self.add_donor_fastq3(fqoutfile, donorfastqfile, donorids) fqoutfile.close() + + # Runs one or more specified VaSeBuilder modes + def run_mode(self, runmode, runmode_parameters): + if "A" in runmode: + self.run_a_mode(runmode_parameters) + if "D" in runmode: + self.run_d_mode(runmode, runmode_parameters) + if "F" in runmode: + self.run_f_mode(runmode, runmode_parameters) + if "P" in runmode: + self.run_p_mode(runmode, runmode_parameters) + if "X" in runmode: + self.run_x_mode(runmode_parameters) + + # A-mode: Filters acceptors and adds already existing donor fastq files + def run_a_mode(self, afq1_in, afq2_in, dfq1_in, dfq2_in, varconfileloc, outpath): + self.vaselogger.info("Running VaeBuilder A-mode") + varconfile = VariantContextFile(varconfileloc) + skip_list = varconfile.get_all_variant_context_acceptor_read_ids() + skip_list.sort() + for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [dfq1_in, dfq2_in]): + self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) + + # D-mode: Create/Read variant contexts and output only donor fastq files + def run_d_mode(self, runmode, runmode_parameters): + # Checks whether + if "C" not in runmode: + print("Building the variant context set") + else: + print("Read the variant context set from an existing varcon file") + print("Write the donor fastq files") + + # F-mode: Create/Read variant contexts and produce validation fastq files + def run_f_mode(self, runmode, runmode_parameters): + # Checks whether + if "C" not in runmode: + print("Building the variant context set") + else: + print("Read the variant context set from an existing varcon file") + print("Write the new validation set") + + # P-mode: Create/Read variant contexts and produce donor fastq files for each variant context + def run_p_mode(self, runmode, runmode_parameters): + # Checks whether + if "C" not in runmode: + print("Building the variant context set") + else: + print("Read the variant context set from an existing varcon file") + print("Write the donor fastqs for each variant context") + + # X-mode: Only create variant contexts. + def run_x_mode(self, runmode_parameters): + print("Call/Implement the code for X-mode") diff --git a/vase.py b/vase.py index 439cbd9..2c8887a 100644 --- a/vase.py +++ b/vase.py @@ -28,20 +28,24 @@ def main(self): # Parse the command line parameters and check their validity. vase_arg_list = self.get_vase_parameters() pmc = ParamChecker() - # Start the logger and initialize this run with an ID number. - self.vaselogger = self.start_logger(pmc, vase_arg_list["log"], vase_arg_list["debug"]) - - vase_called_command = " ".join(sys.argv) - self.vaselogger.info(f"python {vase_called_command}") - vase_b = VaSeBuilder(uuid.uuid4().hex) # Check whether a configuration file was supplied than all required command line parameters. if vase_arg_list["configfile"] is not None: configfileloc = vase_arg_list["configfile"] vase_arg_list = self.read_config_file(configfileloc) # Set the read config file as the parameter list + # Check whether the DEBUG parameters has been set or not + if "debug" not in vase_arg_list: + vase_arg_list["debug"] = False # Check optional parameters and set those missing to None vase_arg_list = pmc.optional_parameters_set(vase_arg_list) + # Start the logger and initialize this run with an ID number. + self.vaselogger = self.start_logger(pmc, vase_arg_list["log"], vase_arg_list["debug"]) + + vase_called_command = " ".join(sys.argv) + self.vaselogger.info(f"python {vase_called_command}") + vase_b = VaSeBuilder(uuid.uuid4().hex) + # Exit if not all of the required parameters have been set if not pmc.required_parameters_set(vase_arg_list["runmode"], vase_arg_list): self.vaselogger.critical("Not all required parameters have been set") @@ -52,6 +56,9 @@ def main(self): self.vaselogger.critical("Not all required parameters are correct. Please check log for more info.") exit() + # Run the selected mode. + self.run_mode(vase_b, pmc) + if "A" in pmc.runmode: donor_fastq_files = self.read_donor_fastq_list_file(pmc.get_donorfqlist()) r1_dfqs = [dfq[0] for dfq in donor_fastq_files] @@ -176,6 +183,8 @@ def read_variant_list(self, variantlistloc): # Reads the config file with the settings def read_config_file(self, configfileloc): + multival_parameters = ["runmode", "templatefq1", "templatefq2"] + debug_param_vals = ["True", "1", "T"] configdata = {} try: with open(configfileloc, "r") as configfile: @@ -191,6 +200,9 @@ def read_config_file(self, configfileloc): if parameter_name == "templatefq1" or parameter_name == "templatefq2": template_files = parameter_value.split(",") configdata[parameter_name] = [tmplfile.strip() for tmplfile in template_files] + # Check if the parameter is the debug parameter + elif parameter_name == "debug": + configdata[parameter_name] = parameter_value.title() in debug_param_vals else: configdata[parameter_name] = parameter_value.strip() except IOError: @@ -209,6 +221,20 @@ def read_donor_fastq_list_file(self, donorfq_listfileloc): finally: return donor_fastqs + # Runs one or more specified VaSeBuilder modes + def run_mode(self, vaseb, paramcheck): + if "A" in paramcheck.get_runmode(): + vaseb.run_a_mode(paramcheck.get_first_fastq_in_location(), paramcheck.get_second_fastq_in_location(), + paramcheck) + if "D" in paramcheck.get_runmode(): + vaseb.run_d_mode() + if "F" in paramcheck.get_runmode(): + vaseb.run_f_mode() + if "P" in paramcheck.get_runmode(): + vaseb.run_p_mode() + if "X" in paramcheck.get_runmode(): + vaseb.run_x_mode() + # Run the program. vaseb = VaSe() From e591baf8b0100a48f54a5ca8d61a0e24cf683cb7 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 22 Aug 2019 10:48:04 +0200 Subject: [PATCH 02/90] Make separate methods for each VaSeBuilder run mode --- ParamChecker.py | 30 ++++++++++++++++++----- VaSeBuilder.py | 63 ++++++++++++++++++++++++++++--------------------- vase.py | 40 +++++++++++++++++++++++++++++-- 3 files changed, 98 insertions(+), 35 deletions(-) diff --git a/ParamChecker.py b/ParamChecker.py index c35a81b..811eec9 100644 --- a/ParamChecker.py +++ b/ParamChecker.py @@ -21,9 +21,7 @@ def __init__(self): self.runmode = "" self.varconin = "" self.donorfqlist = "" - self.required_mode_parameters = {"A": ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin", - "out"], - "AC": ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin", + self.required_mode_parameters = {"AC": ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin", "out"], "D": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"], "DC": ["runmode", "donorvcf", "donorbam", "out", "reference", @@ -35,9 +33,7 @@ def __init__(self): "P": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"], "PC": ["runmode", "donorvcf", "donorbam", "out", "reference", "varconin"], - "X": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"], - "XC": ["runmode", "donorvcf", "donorbam", "out", "reference", - "varconin"]} + "X": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"]} self.optional_parameters = ["fastqout", "varcon", "variantlist"] # Checks whether the required parameters for a run mode have been set. @@ -316,3 +312,25 @@ def get_donorfqlist(self): # Returns the specified runmmode def get_runmode(self): return self.runmode + + # Returns the location of the variant context in file + def get_variantcontext_infile(self): + return self.varconin + + # Returns the list of required parameters for a specified runmode + def get_required_runmode_parameters(self, runmode): + if runmode.upper() in self.required_mode_parameters: + return self.required_mode_parameters[runmode.upper()] + return [] + + # Returns the optional parameters + def get_optional_parameters(self): + return self.optional_parameters + + # Returns the + def filter_provided_parameters(self, runmode, vase_parameters): + filtered_parameter_set = {} + runmode_reqparams = self.get_optional_parameters() + if runmode in self.required_mode_parameters: + runmode_reqparams.extend(self.required_mode_parameters[runmode]) + return filtered_parameter_set diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 1c45275..1224d68 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1082,33 +1082,42 @@ def run_a_mode(self, afq1_in, afq2_in, dfq1_in, dfq2_in, varconfileloc, outpath) for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [dfq1_in, dfq2_in]): self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) + # A-mode: Filters acceptors and adds already existing donor fastq files + def run_a_mode2(self, afq1_in, afq2_in, dfqs, varconfile, outpath, fastqoutpath): + self.vaselogger.info("Running VaSeBuilder A-mode") + # Split the donor fastqs into an R1 and R2 group + r1_dfqs = [dfq[0] for dfq in dfqs] + r2_dfqs = [dfq[1] for dfq in dfqs] + + skip_list = varconfile.get_all_variant_context_acceptor_read_ids() + skip_list.sort() + + for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [r1_dfqs, r2_dfqs]): + self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) + # D-mode: Create/Read variant contexts and output only donor fastq files - def run_d_mode(self, runmode, runmode_parameters): - # Checks whether - if "C" not in runmode: - print("Building the variant context set") - else: - print("Read the variant context set from an existing varcon file") - print("Write the donor fastq files") - - # F-mode: Create/Read variant contexts and produce validation fastq files - def run_f_mode(self, runmode, runmode_parameters): - # Checks whether - if "C" not in runmode: - print("Building the variant context set") - else: - print("Read the variant context set from an existing varcon file") - print("Write the new validation set") - - # P-mode: Create/Read variant contexts and produce donor fastq files for each variant context - def run_p_mode(self, runmode, runmode_parameters): - # Checks whether - if "C" not in runmode: - print("Building the variant context set") - else: - print("Read the variant context set from an existing varcon file") - print("Write the donor fastqs for each variant context") + def run_d_mode(self, variantcontxtfile): + self.vaselogger.info("Running VaSeBuilder D-mode") + + # F-mode: Produce complete validation fastq files + def run_f_mode(self, variantcontexts, acceptorfq1, acceptorfq2, fastqoutpath): + self.vaselogger.info("Running VaSeBuilder F-mode") + print("Do all the F-mode work") + + # P-mode: Produce donor fastq files for each variant context + def run_p_mode(self, variantcontexts, fq_out): + self.vaselogger.info("Running VaSeBuilder P-mode") + self.vaselogger.info("Begin writing variant FastQ files.") + for context in variantcontexts.values(): + add_list = context.get_donor_read_strings() + self.vaselogger.debug(f"Writing variant FastQs for variant {context.context_id}.") + self.build_donor_fq(add_list, "1", fq_out + context.context_id) + self.build_donor_fq(add_list, "2", fq_out + context.context_id) + self.vaselogger.info("Finished writing variant FastQ files.") # X-mode: Only create variant contexts. - def run_x_mode(self, runmode_parameters): - print("Call/Implement the code for X-mode") + def run_x_mode(self, sampleidlist, donorvcfs, donorbams, acceptorbam, genomereference, outdir, varconout, + variantlist): + self.vaselogger.info("Running VaSeBuilder X-mode") + self.build_varcon_set(sampleidlist, donorvcfs, donorbams, acceptorbam, outdir, genomereference, varconout, + variantlist) diff --git a/vase.py b/vase.py index d811788..5b5ea5f 100644 --- a/vase.py +++ b/vase.py @@ -224,17 +224,53 @@ def read_donor_fastq_list_file(self, donorfq_listfileloc): # Runs one or more specified VaSeBuilder modes def run_mode(self, vaseb, paramcheck): if "A" in paramcheck.get_runmode(): + donor_fastq_files = self.read_donor_fastq_list_file(paramcheck.get_donorfqlist()) + r1_dfqs = [dfq[0] for dfq in donor_fastq_files] + r2_dfqs = [dfq[1] for dfq in donor_fastq_files] + vaseb.run_a_mode(paramcheck.get_first_fastq_in_location(), paramcheck.get_second_fastq_in_location(), - paramcheck) + r1_dfqs, r2_dfqs, paramcheck.get_variantcontext_infile(), + paramcheck.get_fastq_out_location()) if "D" in paramcheck.get_runmode(): vaseb.run_d_mode() if "F" in paramcheck.get_runmode(): - vaseb.run_f_mode() + vcf_file_map = {} + bam_file_map = {} + vaseb.run_f_mode(paramcheck.get_runmode(), vcf_file_map, bam_file_map, paramcheck.get_acceptor_bam(), + paramcheck.get_first_fastq_in_location(), paramcheck.get_second_fastq_in_location(), + paramcheck.get_fastq_out_location()) if "P" in paramcheck.get_runmode(): vaseb.run_p_mode() if "X" in paramcheck.get_runmode(): vaseb.run_x_mode() + def run_selected_mode(self, runmode, vaseb, paramcheck): + if "X" in runmode: + vaseb.run_x_mode() + else: + varconfile = None # Declare the varconfile variable so we can use it in C and no-C. + if "C" in runmode: + varconfile = VariantContextFile("") + if "A" in runmode: + donor_fastq_files = self.read_donor_fastq_list_file(paramcheck.get_donorfqlist()) + r1_dfqs = [dfq[0] for dfq in donor_fastq_files] + r2_dfqs = [dfq[1] for dfq in donor_fastq_files] + + vaseb.run_a_mode(paramcheck.get_first_fastq_in_location(), + paramcheck.get_second_fastq_in_location(), + r1_dfqs, r2_dfqs, paramcheck.get_variantcontext_infile(), + paramcheck.get_fastq_out_location()) + else: + varconfile = vaseb.build_varcon_set() + + # Check for modes D,F,P + if "D" in runmode: + vaseb.run_d_mode(varconfile, paramcheck) + if "F" in runmode: + vaseb.run_f_mode(varconfile) + if "P" in runmode: + vaseb.run_p_mode(varconfile) + # Run the program. vaseb = VaSe() From 52b068394255752ba336ac378978c58d62f782ac Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 22 Aug 2019 13:33:48 +0200 Subject: [PATCH 03/90] Implement the different run modes (not tested yet) --- VaSeBuilder.py | 30 +++++++++++++++++++++++++++--- vase.py | 11 +++++++++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 1224d68..690646f 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1096,13 +1096,37 @@ def run_a_mode2(self, afq1_in, afq2_in, dfqs, varconfile, outpath, fastqoutpath) self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) # D-mode: Create/Read variant contexts and output only donor fastq files - def run_d_mode(self, variantcontxtfile): + def run_d_mode(self, variantcontextfile, fq_out): self.vaselogger.info("Running VaSeBuilder D-mode") + # Combine all donor reads from all variant contexts + add_list = variantcontextfile.get_all_variant_context_donor_reads() + self.vaselogger.info("Writing FastQ files.") + # Write the donor fastq files. + for i in ["1", "2"]: + self.vaselogger.info(f"Start writing the R{i} FastQ files.") + fq_starttime = time.time() + self.build_donor_fq(add_list, i, fq_out) + self.vaselogger.info(f"Wrote all R{i} FastQ files.") + self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.") + self.vaselogger.info("Finished writing donor FastQ files.") # F-mode: Produce complete validation fastq files - def run_f_mode(self, variantcontexts, acceptorfq1, acceptorfq2, fastqoutpath): + def run_f_mode(self, variantcontexts, fq1_in, fq2_in, fq_out): self.vaselogger.info("Running VaSeBuilder F-mode") - print("Do all the F-mode work") + + # Combine all donor reads from all variant contexts + add_list = self.contexts.get_all_variant_context_donor_reads() + self.vaselogger.info("Writing FastQ files.") + skip_list = set(self.contexts.get_all_variant_context_acceptor_read_ids()) + + for i, fq_i in zip(["1", "2"], [fq1_in, fq2_in]): + # Write the fastq files. + self.vaselogger.info(f"Start writing the R{i} FastQ files.") + fq_starttime = time.time() + self.build_fastq(fq_i, skip_list, add_list, i, fq_out) + self.vaselogger.info(f"Wrote all R{i} FastQ files.") + self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.") + self.vaselogger.info("Finished writing FastQ files.") # P-mode: Produce donor fastq files for each variant context def run_p_mode(self, variantcontexts, fq_out): diff --git a/vase.py b/vase.py index 5b5ea5f..c80d323 100644 --- a/vase.py +++ b/vase.py @@ -244,7 +244,7 @@ def run_mode(self, vaseb, paramcheck): if "X" in paramcheck.get_runmode(): vaseb.run_x_mode() - def run_selected_mode(self, runmode, vaseb, paramcheck): + def run_selected_mode(self, runmode, vaseb, paramcheck, variantlist_filter): if "X" in runmode: vaseb.run_x_mode() else: @@ -261,7 +261,14 @@ def run_selected_mode(self, runmode, vaseb, paramcheck): r1_dfqs, r2_dfqs, paramcheck.get_variantcontext_infile(), paramcheck.get_fastq_out_location()) else: - varconfile = vaseb.build_varcon_set() + # Scan the variant and alignment files in the provided lists. + vbscan = VcfBamScanner() + vcf_file_map = vbscan.scan_vcf_files(paramcheck.get_valid_vcf_filelist()) + bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist()) + sample_id_list = vbscan.get_complete_sample_ids() + varconfile = vaseb.build_varcon_set(sample_id_list, vcf_file_map, bam_file_map, + paramcheck.get_acceptor_bam(), paramcheck.get_out_dir_location(), + paramcheck.get_reference_file_location(), ) # Check for modes D,F,P if "D" in runmode: From 50932d99d13b3f6efd69b65b24b8206f8637e616 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 23 Aug 2019 16:03:39 +0200 Subject: [PATCH 04/90] Updates to run mode implementation, add temporary methods to split build_varcon_set in smaller methods --- VaSeBuilder.py | 106 ++++++++++++++++++++++++++++++++------------- vase.py | 115 +++++++++++++------------------------------------ 2 files changed, 104 insertions(+), 117 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 690646f..aac5433 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -25,8 +25,7 @@ def __init__(self, vaseid): f"VaSeBuilder: {self.creation_id} ; {self.creation_time}" ) - # VariantContextFile that saves the acceptor, donor, - # and variant contexts with their associated data. + # VariantContextFile that saves the acceptor, donor, and variant contexts with their associated data. self.contexts = VariantContextFile() # Dictionary used for debug messages. @@ -135,34 +134,25 @@ def build_varcon_set(self, sampleidlist, vchr = vcfvar.get_variant_chrom() vpos = vcfvar.get_variant_pos() - # Determine whether the variant is SNP or indel, - # based on the length of its alleles. + # Determine whether the variant is SNP or indel, based on the length of its alleles. self.debug_msg("vw", variantid) vtype = vcfvar.get_variant_type() - - self.vaselogger.debug(f"Variant {variantid} " - f"determined to be a {vtype}.") + self.vaselogger.debug(f"Variant {variantid} determined to be a {vtype}.") # Determine the length of the search window # based on the length of the variant. searchwindow = self.determine_read_search_window(vtype, vcfvar) - self.vaselogger.debug( - "Search window determined to be " - f"{vchr}:{searchwindow[0]+1}-{searchwindow[1]}." - ) + self.vaselogger.debug(f"Search window determined to be {vchr}:{searchwindow[0]+1}-{searchwindow[1]}.") - # Check if this window overlaps any already in-use contexts; - # If so, skip this variant. + # Check if this window overlaps any already in-use contexts; If so, skip this variant. if self.contexts.variant_is_in_context(vtype, vchr, *searchwindow): - self.vaselogger.debug(f"Variant {variantid} is located in " - "an existing variant context.") + self.vaselogger.debug(f"Variant {variantid} is located in an existing variant context.") continue # Begin fetching reads and determining contexts. try: # === DONOR ===================================== - # Obtain all donor BAM reads at the variant - # position, as well as their mates. + # Obtain all donor BAM reads at the variant position, as well as their mates. self.debug_msg("dr", variantid) t0 = time.time() donor_context_reads = ( @@ -1060,19 +1050,6 @@ def build_fastqs_from_donors(self, acceptor_fqin, donor_fqin, acceptor_reads_toe self.add_donor_fastq3(fqoutfile, donorfastqfile, donorids) fqoutfile.close() - # Runs one or more specified VaSeBuilder modes - def run_mode(self, runmode, runmode_parameters): - if "A" in runmode: - self.run_a_mode(runmode_parameters) - if "D" in runmode: - self.run_d_mode(runmode, runmode_parameters) - if "F" in runmode: - self.run_f_mode(runmode, runmode_parameters) - if "P" in runmode: - self.run_p_mode(runmode, runmode_parameters) - if "X" in runmode: - self.run_x_mode(runmode_parameters) - # A-mode: Filters acceptors and adds already existing donor fastq files def run_a_mode(self, afq1_in, afq2_in, dfq1_in, dfq2_in, varconfileloc, outpath): self.vaselogger.info("Running VaeBuilder A-mode") @@ -1083,7 +1060,7 @@ def run_a_mode(self, afq1_in, afq2_in, dfq1_in, dfq2_in, varconfileloc, outpath) self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) # A-mode: Filters acceptors and adds already existing donor fastq files - def run_a_mode2(self, afq1_in, afq2_in, dfqs, varconfile, outpath, fastqoutpath): + def run_ac_mode(self, afq1_in, afq2_in, dfqs, varconfile, outpath): self.vaselogger.info("Running VaSeBuilder A-mode") # Split the donor fastqs into an R1 and R2 group r1_dfqs = [dfq[0] for dfq in dfqs] @@ -1145,3 +1122,70 @@ def run_x_mode(self, sampleidlist, donorvcfs, donorbams, acceptorbam, genomerefe self.vaselogger.info("Running VaSeBuilder X-mode") self.build_varcon_set(sampleidlist, donorvcfs, donorbams, acceptorbam, outdir, genomereference, varconout, variantlist) + + # =====SPLITTING THE BUILD_VARCON_SET INTO MULTIPLE SMALLER METHODS===== + def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath, reference_loc, varcon_outpath, + variant_list): + self.vaselogger.info("Begin building the variant context file.") + donor_vcfs_used = [] + donor_bams_used = [] + + # Open the acceptor bam, or exit if this fails. + try: + acceptorbamfile = pysam.AlignmentFile(acceptorbamloc, reference_filename=reference_loc) + except IOError: + self.vaselogger.critical("Could not open acceptor BAM file. Exitting.") + exit() + + # Iterate over the provided samples. + for sampleid in sampleidlist: + self.vaselogger.debug(f"Processing data for sample {sampleid}.") + + # Only include variants from VCF files if they are included in the provided (optional) variant list. + sample_variant_filter = self.bvcs_set_variant_filter(sampleid, variant_list) + samplevariants = self.get_sample_vcf_variants(vcfsamplemap[sampleid], sample_variant_filter) + + # Skip this sample if all of its variants got filtered out. + if not samplevariants: + self.vaselogger.warning(f"No variants obtained for sample {sampleid}. Skipping sample") + continue + + # Process all variants for the current sample + self.bvcs_process_variants() + + # Sets the variant list for a sample if applicable, otherwise return None + def bvcs_set_variant_filter(self, sampleid, variant_list): + sample_variant_filter = None + if variant_list is not None: + if sampleid in variant_list: + sample_variant_filter = variant_list[sampleid] + return sample_variant_filter + + # Establishes an acceptor/donor context by fetching reads (and their mates) overlapping directly with the variant + def bvcs_establish_context(self, sampleid, variantid, variantchrom, variantpos, searchwindow, bamfile, + write_unm=False, unmappedlist=None): + # Obtain all donor BAM reads at the variant position, as well as their mates. + self.vaselogger.debug(f"Searching donor reads for variant {variantid}") + t0 = time.time() + context_reads = (self.get_variant_reads(variantid, variantchrom, *searchwindow, bamfile, write_unm, unmappedlist)) + self.vaselogger.debug("Searching donor reads for {variantid} took {time.time() - t0} seconds") + + # If no donor reads were found at this position, then skip this variant. + if not context_reads: + self.vaselogger.info(f"No reads found for variant {variantid} in donor {sampleid}; Skipping.") + return [] + + # Determine the context based on the reads overlapping the variant. + self.vaselogger.debug("Determining the context for variant {variantid}") + t0 = time.time() + adcontext = (self.determine_context(context_reads, variantpos, variantchrom)) + self.vaselogger.debug("Determining the context for {variantid} took {time.time() - t0} seconds") + return adcontext + + # Establishes a variant context from an acceptor and donor context and fetches acceptor and donor reads again. + def bvcs_estabish_variant_context(self): + + + # def bvcs_process_variant(self): + # def bvcs_process_variants(self): + # def bvcs_write_output_files(self): diff --git a/vase.py b/vase.py index c80d323..37b1031 100644 --- a/vase.py +++ b/vase.py @@ -12,6 +12,7 @@ from ParamChecker import ParamChecker from VcfBamScanner import VcfBamScanner from VaSeBuilder import VaSeBuilder +from VariantContextFile import VariantContextFile class VaSe: @@ -56,51 +57,13 @@ def main(self): self.vaselogger.critical("Not all required parameters are correct. Please check log for more info.") exit() - # Run the selected mode. - self.run_mode(vase_b, pmc) - - if "A" in pmc.runmode: - donor_fastq_files = self.read_donor_fastq_list_file(pmc.get_donorfqlist()) - r1_dfqs = [dfq[0] for dfq in donor_fastq_files] - r2_dfqs = [dfq[1] for dfq in donor_fastq_files] - vase_b.build_validation_from_donor_fastqs(pmc.get_first_fastq_in_location(), - pmc.get_second_fastq_in_location(), - r1_dfqs, r2_dfqs, pmc.varconin, pmc.get_fastq_out_location()) - else: - # Scan the variant and alignment files in the provided lists. - vbscan = VcfBamScanner() - vcf_file_map = vbscan.scan_vcf_files(vase_arg_list["donorvcf"]) - bam_file_map = vbscan.scan_bamcram_files(vase_arg_list["donorbam"]) - sample_id_list = vbscan.get_complete_sample_ids() - - # Exit if no valid pairs of variant/alignment files are found. - if not sample_id_list: - self.vaselogger.critical("No valid samples available to " - "create new validation set") - exit() - - variantfilter = None - if pmc.get_variant_list_location() != "": - variantfilter = self.read_variant_list(pmc.get_variant_list_location()) + # Check if a variantfilter has been set. + variantfilter = None + if pmc.get_variant_list_location() != "": + variantfilter = self.read_variant_list(pmc.get_variant_list_location()) - if "C" not in pmc.runmode: - vase_b.build_varcon_set(sample_id_list, - vcf_file_map, bam_file_map, - pmc.get_acceptor_bam(), - pmc.get_out_dir_location(), - pmc.get_reference_file_location(), - pmc.get_variant_context_out_location(), - variantfilter) - elif "C" in pmc.runmode: - vase_b.build_donor_from_varcon(pmc.varconin, - bam_file_map, - pmc.get_reference_file_location()) - if "X" not in pmc.runmode: - vase_b.build_validation_set(pmc.runmode, - pmc.get_acceptor_bam(), - pmc.get_first_fastq_in_location(), - pmc.get_second_fastq_in_location(), - pmc.get_fastq_out_location()) + # Run the selected mode. + self.run_selected_mode(pmc.get_runmode(), vase_b, pmc, variantfilter) self.vaselogger.info("VaSeBuilder run completed succesfully.") elapsed = time.strftime( @@ -109,8 +72,7 @@ def main(self): ) self.vaselogger.info(f"Elapsed time: {elapsed}.") - # Method that creates the logger that will write the log to stdout - # and a log file. + # Method that creates the logger that will write the log to stdout and a log file. def start_logger(self, paramcheck, logloc, debug_mode=False): vaselogger = logging.getLogger("VaSe_Logger") if debug_mode: @@ -183,7 +145,6 @@ def read_variant_list(self, variantlistloc): # Reads the config file with the settings def read_config_file(self, configfileloc): - multival_parameters = ["runmode", "templatefq1", "templatefq2"] debug_param_vals = ["True", "1", "T"] configdata = {} try: @@ -221,62 +182,44 @@ def read_donor_fastq_list_file(self, donorfq_listfileloc): finally: return donor_fastqs - # Runs one or more specified VaSeBuilder modes - def run_mode(self, vaseb, paramcheck): - if "A" in paramcheck.get_runmode(): - donor_fastq_files = self.read_donor_fastq_list_file(paramcheck.get_donorfqlist()) - r1_dfqs = [dfq[0] for dfq in donor_fastq_files] - r2_dfqs = [dfq[1] for dfq in donor_fastq_files] - - vaseb.run_a_mode(paramcheck.get_first_fastq_in_location(), paramcheck.get_second_fastq_in_location(), - r1_dfqs, r2_dfqs, paramcheck.get_variantcontext_infile(), - paramcheck.get_fastq_out_location()) - if "D" in paramcheck.get_runmode(): - vaseb.run_d_mode() - if "F" in paramcheck.get_runmode(): - vcf_file_map = {} - bam_file_map = {} - vaseb.run_f_mode(paramcheck.get_runmode(), vcf_file_map, bam_file_map, paramcheck.get_acceptor_bam(), - paramcheck.get_first_fastq_in_location(), paramcheck.get_second_fastq_in_location(), - paramcheck.get_fastq_out_location()) - if "P" in paramcheck.get_runmode(): - vaseb.run_p_mode() - if "X" in paramcheck.get_runmode(): - vaseb.run_x_mode() + def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): + vbscan = VcfBamScanner() + varconfile = None # Declare the varconfile variable so we can use it in C and no-C. - def run_selected_mode(self, runmode, vaseb, paramcheck, variantlist_filter): if "X" in runmode: - vaseb.run_x_mode() + vcf_file_map = vbscan.scan_vcf_files(paramcheck.get_valid_vcf_filelist()) + bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist()) + sample_id_list = vbscan.get_complete_sample_ids() + vaseb.run_x_mode(sample_id_list, vcf_file_map, bam_file_map, paramcheck.get_acceptor_bam(), + paramcheck.get_reference_file_location(), paramcheck.get_out_dir_location(), + paramcheck.get_variant_context_out_location(), variantfilter) else: - varconfile = None # Declare the varconfile variable so we can use it in C and no-C. if "C" in runmode: - varconfile = VariantContextFile("") + varconfile = VariantContextFile(paramcheck.get_variantcontext_infile()) if "A" in runmode: donor_fastq_files = self.read_donor_fastq_list_file(paramcheck.get_donorfqlist()) - r1_dfqs = [dfq[0] for dfq in donor_fastq_files] - r2_dfqs = [dfq[1] for dfq in donor_fastq_files] - - vaseb.run_a_mode(paramcheck.get_first_fastq_in_location(), - paramcheck.get_second_fastq_in_location(), - r1_dfqs, r2_dfqs, paramcheck.get_variantcontext_infile(), - paramcheck.get_fastq_out_location()) + vaseb.run_ac_mode(paramcheck.get_first_fastq_in_location(), + paramcheck.get_second_fastq_in_location(), + donor_fastq_files, varconfile, paramcheck.get_out_dir_location()) else: # Scan the variant and alignment files in the provided lists. - vbscan = VcfBamScanner() vcf_file_map = vbscan.scan_vcf_files(paramcheck.get_valid_vcf_filelist()) bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist()) sample_id_list = vbscan.get_complete_sample_ids() varconfile = vaseb.build_varcon_set(sample_id_list, vcf_file_map, bam_file_map, paramcheck.get_acceptor_bam(), paramcheck.get_out_dir_location(), - paramcheck.get_reference_file_location(), ) - + paramcheck.get_reference_file_location(), + paramcheck.get_variant_context_out_location(), variantfilter) # Check for modes D,F,P if "D" in runmode: - vaseb.run_d_mode(varconfile, paramcheck) + vaseb.run_d_mode(varconfile, paramcheck.get_fastq_out_location()) if "F" in runmode: - vaseb.run_f_mode(varconfile) + vaseb.run_f_mode(varconfile, paramcheck.get_first_fastq_in_location(), + paramcheck.get_second_fastq_in_location(), paramcheck.get_fastq_out_location()) if "P" in runmode: - vaseb.run_p_mode(varconfile) + vaseb.run_p_mode(varconfile, paramcheck.get_fastq_out_location()) + + def check_ # Run the program. From d9f62d1d7048994de26a7147bc55b19dad472d48 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 26 Aug 2019 11:16:55 +0200 Subject: [PATCH 05/90] Split the get_variant_reads methode into three separate methods --- VaSeBuilder.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index aac5433..fd94686 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1123,6 +1123,44 @@ def run_x_mode(self, sampleidlist, donorvcfs, donorbams, acceptorbam, genomerefe self.build_varcon_set(sampleidlist, donorvcfs, donorbams, acceptorbam, outdir, genomereference, varconout, variantlist) + # =====SPLITTING THE GET_VARIANT_READS() INTO THREE SMALLER METHODS===== + def get_variant_reads_alt(self, variantchrom, variantstart, variantend, bamfile, write_unm=False, umatelist=None): + variantreads = [] + rpnext = {} + for vread in bamfile.fetch(variantchrom, variantstart, variantend): + variantreads.append(DonorBamRead(vread.query_name, self.get_read_pair_num(vread), vread.reference_name, + vread.reference_start, vread.infer_read_length(), + vread.get_forward_sequence(), + "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), + vread.mapping_quality)) + rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] + self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) + variantreads = self.uniqify_variant_reads(variantreads) + return variantreads + + # Fetches the read mates for a set of reads from an BAM/CRAM alignment file. + def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umatelist=None): + for read1 in rpnextmap.values(): + materead = self.fetch_mate_read(*read1, bamfile) + if materead is not None: + variantreadlist.append(materead) + else: + if write_unm: + self.vaselogger.debug(f"Could not find mate for read {read1[2]} ; mate is likely unmapped.") + umatelist.append(read1[2]) + return variantreadlist + + # Uniqifies a list of variant reads to ensure each read only occurs once + def uniqify_variant_reads(self, variantreads): + unique_variantreads = [] + checklist = [] + for fetched in variantreads: + id_pair = (fetched.get_bam_read_id(), fetched.get_bam_read_pair_number()) + if id_pair not in checklist: + unique_variantreads.append(fetched) + checklist.append(id_pair) + return unique_variantreads + # =====SPLITTING THE BUILD_VARCON_SET INTO MULTIPLE SMALLER METHODS===== def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath, reference_loc, varcon_outpath, variant_list): From 8beb17717801e120afdf36fc255483202bbfb63f Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 26 Aug 2019 13:05:49 +0200 Subject: [PATCH 06/90] remove incorrect function declaration from vase, add methods to VaSeBuilder --- VaSeBuilder.py | 38 +++++++++++++++++++++++++++++++++++--- vase.py | 2 -- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index fd94686..8941e12 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1134,7 +1134,7 @@ def get_variant_reads_alt(self, variantchrom, variantstart, variantend, bamfile, "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), vread.mapping_quality)) rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] - self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) + variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) variantreads = self.uniqify_variant_reads(variantreads) return variantreads @@ -1161,7 +1161,7 @@ def uniqify_variant_reads(self, variantreads): checklist.append(id_pair) return unique_variantreads - # =====SPLITTING THE BUILD_VARCON_SET INTO MULTIPLE SMALLER METHODS===== + # =====SPLITTING THE BUILD_VARCON_SET() INTO MULTIPLE SMALLER METHODS===== def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath, reference_loc, varcon_outpath, variant_list): self.vaselogger.info("Begin building the variant context file.") @@ -1221,9 +1221,41 @@ def bvcs_establish_context(self, sampleid, variantid, variantchrom, variantpos, return adcontext # Establishes a variant context from an acceptor and donor context and fetches acceptor and donor reads again. - def bvcs_estabish_variant_context(self): + def bvcs_estabish_variant_context(self, variantid, variantchrom, variantpos, acontext, dcontext, abamfile, + dbamfile, write_unm=False, unmappedlist=None): + # Determine the combined variant context based on the widest window from both the donor and acceptor positions. + self.vaselogger.debug("Establishing combined variant context for variant {variantid}") + t0 = time.time() + variant_context = self.determine_largest_context(variantpos, acontext, dcontext) + self.vaselogger.debug(f"Establishing the combined variant context for variant {variantid} took " + f"{time.time() - t0} seconds") + + # If this widest context overlaps an existing variant context, skip it. + if self.contexts.context_collision(variant_context): + self.vaselogger.debug(f"Variant context {variantid} overlaps with an already existing context; Skipping.") + return [] + + # Obtain all donor reads overlapping the combined variant context, as well as their mates. + self.debug_msg("cdr", variantid) + t0 = time.time() + variant_context_donor_reads = ( + self.get_variant_reads(variantid, variant_context[0], variant_context[2], variant_context[3], dbamfile, + True, varcon_unmapped_d)) + self.debug_msg("cdr", variantid, t0) + # Obtain all acceptor reads overlapping the combined variant context, as well as their mates. + self.debug_msg("car", variantid) + t0 = time.time() + variant_context_acceptor_reads = ( + self.get_variant_reads(variantid, variant_context[0], variant_context[2], variant_context[3], + abamfile, True, varcon_unmapped_a)) + self.debug_msg("car", variantid, t0) + # Processes a VCF/BCF variant by establishing contexts # def bvcs_process_variant(self): + + # Processes multiple VCF/BCF variants for one sample # def bvcs_process_variants(self): + + # Writes the variant context output files to a specified location # def bvcs_write_output_files(self): diff --git a/vase.py b/vase.py index 37b1031..f9edc68 100644 --- a/vase.py +++ b/vase.py @@ -219,8 +219,6 @@ def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): if "P" in runmode: vaseb.run_p_mode(varconfile, paramcheck.get_fastq_out_location()) - def check_ - # Run the program. vaseb = VaSe() From d60ce89e4b75a08e92994e73f5df2a0b2c79771d Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 26 Aug 2019 16:13:03 +0200 Subject: [PATCH 07/90] Add fix for running A and F modes --- VaSeBuilder.py | 12 ++++++------ vase.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 8941e12..4c8ab4c 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -378,7 +378,7 @@ def build_varcon_set(self, sampleidlist, if self.vaselogger.getEffectiveLevel() == 10: self.write_optional_output_files(outpath, self.contexts) acceptorbamfile.close() - return + return self.contexts def build_donor_from_varcon(self, varc_file, bamsamplemap, reference_loc): """Reads an existing variant context file and fetches donor reads from @@ -1088,13 +1088,13 @@ def run_d_mode(self, variantcontextfile, fq_out): self.vaselogger.info("Finished writing donor FastQ files.") # F-mode: Produce complete validation fastq files - def run_f_mode(self, variantcontexts, fq1_in, fq2_in, fq_out): + def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out): self.vaselogger.info("Running VaSeBuilder F-mode") # Combine all donor reads from all variant contexts - add_list = self.contexts.get_all_variant_context_donor_reads() + add_list = variantcontextfile.get_all_variant_context_donor_reads() self.vaselogger.info("Writing FastQ files.") - skip_list = set(self.contexts.get_all_variant_context_acceptor_read_ids()) + skip_list = set(variantcontextfile.get_all_variant_context_acceptor_read_ids()) for i, fq_i in zip(["1", "2"], [fq1_in, fq2_in]): # Write the fastq files. @@ -1106,10 +1106,10 @@ def run_f_mode(self, variantcontexts, fq1_in, fq2_in, fq_out): self.vaselogger.info("Finished writing FastQ files.") # P-mode: Produce donor fastq files for each variant context - def run_p_mode(self, variantcontexts, fq_out): + def run_p_mode(self, variantcontextfile, fq_out): self.vaselogger.info("Running VaSeBuilder P-mode") self.vaselogger.info("Begin writing variant FastQ files.") - for context in variantcontexts.values(): + for context in variantcontextfile.values(): add_list = context.get_donor_read_strings() self.vaselogger.debug(f"Writing variant FastQs for variant {context.context_id}.") self.build_donor_fq(add_list, "1", fq_out + context.context_id) diff --git a/vase.py b/vase.py index f9edc68..c310e77 100644 --- a/vase.py +++ b/vase.py @@ -200,7 +200,7 @@ def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): donor_fastq_files = self.read_donor_fastq_list_file(paramcheck.get_donorfqlist()) vaseb.run_ac_mode(paramcheck.get_first_fastq_in_location(), paramcheck.get_second_fastq_in_location(), - donor_fastq_files, varconfile, paramcheck.get_out_dir_location()) + donor_fastq_files, varconfile, paramcheck.get_fastq_out_location()) else: # Scan the variant and alignment files in the provided lists. vcf_file_map = vbscan.scan_vcf_files(paramcheck.get_valid_vcf_filelist()) From 6279d1a35f44e4b7eba9334fa16386b657e3f8b7 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 27 Aug 2019 15:04:31 +0200 Subject: [PATCH 08/90] Fix C mode --- ReadIdObject.py | 12 +++++++++++- VaSeBuilder.py | 30 +++++++++++++++++++++++++++++- vase.py | 5 +++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/ReadIdObject.py b/ReadIdObject.py index b057958..cc79f7e 100644 --- a/ReadIdObject.py +++ b/ReadIdObject.py @@ -1,10 +1,20 @@ class ReadIdObject: def __init__(self, readid): self.read_id = readid + self.pair_number = "" - # Returns the name of the + # Returns the name/identifier of the read id object def get_bam_read_id(self): return self.read_id + # Sets the name/identifier of the read id object def set_bam_read_id(self, readid): self.read_id = readid + + # Return the pair number of the read id object + def get_bam_read_pair_number(self): + return self.pair_number + + # Sets the pair number of the read id object + def set_bam_read_pair_number(self, pairnum): + self.pair_number = pairnum diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 4c8ab4c..a60a809 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1109,7 +1109,8 @@ def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out): def run_p_mode(self, variantcontextfile, fq_out): self.vaselogger.info("Running VaSeBuilder P-mode") self.vaselogger.info("Begin writing variant FastQ files.") - for context in variantcontextfile.values(): + variantcontexts = variantcontextfile.get_variant_contexts() + for context in variantcontexts: add_list = context.get_donor_read_strings() self.vaselogger.debug(f"Writing variant FastQs for variant {context.context_id}.") self.build_donor_fq(add_list, "1", fq_out + context.context_id) @@ -1259,3 +1260,30 @@ def bvcs_estabish_variant_context(self, variantid, variantchrom, variantpos, aco # Writes the variant context output files to a specified location # def bvcs_write_output_files(self): + + # =====REFETCH VARIANT CONTEXT DONOR READS FROM A READ VARIANT CONTEXT FILE + def refetch_donor_reads(self, variantcontextfile, bamsamplemap, reference_loc): + # Get variant contexts per sample id. + sample_list = variantcontextfile.get_variant_contexts_by_sampleid() + + # Iterate through the sample IDs, opening each associated BAM file, or skipping it if it cannot be opened. + for sampleid in sample_list: + try: + bamfile = pysam.AlignmentFile(bamsamplemap[sampleid], reference_filename=reference_loc) + self.vaselogger.debug(f"Opened BAM file {bamsamplemap[sampleid]}") + except IOError: + self.vaselogger.warning(f"Could not open data files for sample {sampleid}. Skipping sample.") + continue + + # For each variant context of the current sample, refetch reads from the context. + for context in sample_list[sampleid]: + context.variant_context_dreads = ( + self.get_variant_reads(context.context_id, + context.variant_context_chrom, + context.variant_context_start, + context.variant_context_end, + bamfile) + ) + # Close the sample's BAM file when done. + bamfile.close() + return diff --git a/vase.py b/vase.py index c310e77..1334f9a 100644 --- a/vase.py +++ b/vase.py @@ -195,12 +195,17 @@ def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): paramcheck.get_variant_context_out_location(), variantfilter) else: if "C" in runmode: + # Read an existing variant context file varconfile = VariantContextFile(paramcheck.get_variantcontext_infile()) if "A" in runmode: donor_fastq_files = self.read_donor_fastq_list_file(paramcheck.get_donorfqlist()) vaseb.run_ac_mode(paramcheck.get_first_fastq_in_location(), paramcheck.get_second_fastq_in_location(), donor_fastq_files, varconfile, paramcheck.get_fastq_out_location()) + return + # Refetch the donor reads required when runmode (D,F,P) contains a 'C' + bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist()) + vaseb.refetch_donor_reads(varconfile, bam_file_map, paramcheck.get_reference_file_location()) else: # Scan the variant and alignment files in the provided lists. vcf_file_map = vbscan.scan_vcf_files(paramcheck.get_valid_vcf_filelist()) From 25166a406a57d11dcbb9b922ee4a4a6ac275d887 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 28 Aug 2019 09:59:05 +0200 Subject: [PATCH 09/90] Start implementation bvcs_process_variant --- VaSeBuilder.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index a60a809..43f81eb 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1253,7 +1253,15 @@ def bvcs_estabish_variant_context(self, variantid, variantchrom, variantpos, aco self.debug_msg("car", variantid, t0) # Processes a VCF/BCF variant by establishing contexts - # def bvcs_process_variant(self): + def bvcs_process_variant(self): + acontext_unmapped = [] + dcontext_unmapped = [] + varcontext_acceptor_unmapped = [] + varcontext_donor_unmapped = [] + + # Establish the acceptor, donor and variant contexts + # self.bvcs_establish_context() + # self.bvcs_estabish_variant_context() # Processes multiple VCF/BCF variants for one sample # def bvcs_process_variants(self): From ab4451c8a02e68c37e4527a42fadffaa43e0dd4a Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 28 Aug 2019 12:59:17 +0200 Subject: [PATCH 10/90] Implement get_variant_reads() into three methods in VaSeBuilder --- VaSeBuilder.py | 99 ++++++++++++-------------------------------------- 1 file changed, 23 insertions(+), 76 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 43f81eb..2242304 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -522,8 +522,7 @@ def determine_variant_type(self, vcfvariantref, vcfvariantalts): return "indel" return "?" - # Determines the start and end positions to use for searching reads - # overlapping with the variant. + # Determines the start and end positions to use for searching reads overlapping with the variant. def determine_read_search_window(self, varianttype, vcfvariant): if varianttype == "snp": return [vcfvariant.get_variant_pos() - 1, vcfvariant.get_variant_pos() + 1] @@ -533,8 +532,7 @@ def determine_read_search_window(self, varianttype, vcfvariant): vcfvariant.get_variant_alt_alleles()) return [-1, -1] - # Returns the search start and stop to use for searching BAM reads - # overlapping with the range of the indel. + # Returns the search start and stop to use for searching BAM reads overlapping with the range of the indel. def determine_indel_read_range(self, variantpos, variantref, variantalts): searchstart = variantpos searchstop = variantpos + max( @@ -543,60 +541,47 @@ def determine_indel_read_range(self, variantpos, variantref, variantalts): ) return [searchstart, searchstop] - # Returns the BAM reads containing the specific vcf variant as well - # as their read mate. + # Returns the BAM reads containing the specific vcf variant as well as their read mate. def get_variant_reads(self, contextid, variantchrom, variantstart, variantend, bamfile, write_unm=False, umatelist=None): - # Obtain all the variant reads overlapping with the variant and - # their mate reads. + # Fetch variantreads = [] rpnext = {} - for vread in bamfile.fetch(variantchrom, variantstart, variantend): - variantreads.append(DonorBamRead( - vread.query_name, - self.get_read_pair_num(vread), - vread.reference_name, - vread.reference_start, - vread.infer_read_length(), - vread.get_forward_sequence(), - "".join([chr((x + 33)) - for x in vread.get_forward_qualities()]), - vread.mapping_quality - )) + variantreads.append(DonorBamRead(vread.query_name, self.get_read_pair_num(vread), vread.reference_name, + vread.reference_start, vread.infer_read_length(), + vread.get_forward_sequence(), + "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), + vread.mapping_quality)) rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] + variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) + variantreads = self.uniqify_variant_reads(variantreads) + return variantreads - # Obtain the read mate (this must be done after the initial fetch not during!) - for read1 in rpnext.values(): + # Fetches the read mates for a set of reads from an BAM/CRAM alignment file. + def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umatelist=None): + for read1 in rpnextmap.values(): materead = self.fetch_mate_read(*read1, bamfile) if materead is not None: - variantreads.append(materead) + variantreadlist.append(materead) else: if write_unm: - self.vaselogger.debug("Could not find mate for " - f"{read1[2]} ; " - "mate is likely unmapped.") + self.vaselogger.debug(f"Could not find mate for read {read1[2]} ; mate is likely unmapped.") umatelist.append(read1[2]) + return variantreadlist - # Make sure the list only contains each BAM read once (if a read - # and mate both overlap with a variant, they have been added - # twice to the list). - uniq_variantreads = [] + # Uniqifies a list of variant reads to ensure each read only occurs once + def uniqify_variant_reads(self, variantreads): + unique_variantreads = [] checklist = [] for fetched in variantreads: id_pair = (fetched.get_bam_read_id(), fetched.get_bam_read_pair_number()) if id_pair not in checklist: - uniq_variantreads.append(fetched) + unique_variantreads.append(fetched) checklist.append(id_pair) - variantreads = uniq_variantreads - - # Filter to keep only read pairs. - variantreads = self.filter_variant_reads(variantreads) - self.vaselogger.debug(f"Found a total of {len(variantreads)} " - "BAM reads.") - return variantreads + return unique_variantreads # Returns the mate read using the fetch() method def fetch_mate_read(self, rnext, pnext, readid, bamfile): @@ -1124,44 +1109,6 @@ def run_x_mode(self, sampleidlist, donorvcfs, donorbams, acceptorbam, genomerefe self.build_varcon_set(sampleidlist, donorvcfs, donorbams, acceptorbam, outdir, genomereference, varconout, variantlist) - # =====SPLITTING THE GET_VARIANT_READS() INTO THREE SMALLER METHODS===== - def get_variant_reads_alt(self, variantchrom, variantstart, variantend, bamfile, write_unm=False, umatelist=None): - variantreads = [] - rpnext = {} - for vread in bamfile.fetch(variantchrom, variantstart, variantend): - variantreads.append(DonorBamRead(vread.query_name, self.get_read_pair_num(vread), vread.reference_name, - vread.reference_start, vread.infer_read_length(), - vread.get_forward_sequence(), - "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), - vread.mapping_quality)) - rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] - variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) - variantreads = self.uniqify_variant_reads(variantreads) - return variantreads - - # Fetches the read mates for a set of reads from an BAM/CRAM alignment file. - def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umatelist=None): - for read1 in rpnextmap.values(): - materead = self.fetch_mate_read(*read1, bamfile) - if materead is not None: - variantreadlist.append(materead) - else: - if write_unm: - self.vaselogger.debug(f"Could not find mate for read {read1[2]} ; mate is likely unmapped.") - umatelist.append(read1[2]) - return variantreadlist - - # Uniqifies a list of variant reads to ensure each read only occurs once - def uniqify_variant_reads(self, variantreads): - unique_variantreads = [] - checklist = [] - for fetched in variantreads: - id_pair = (fetched.get_bam_read_id(), fetched.get_bam_read_pair_number()) - if id_pair not in checklist: - unique_variantreads.append(fetched) - checklist.append(id_pair) - return unique_variantreads - # =====SPLITTING THE BUILD_VARCON_SET() INTO MULTIPLE SMALLER METHODS===== def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath, reference_loc, varcon_outpath, variant_list): From cc159ebeb54dc08831883b7fbafb9e5a9bee127f Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 28 Aug 2019 13:29:10 +0200 Subject: [PATCH 11/90] Turn most two line comments into single line comments --- VariantContext.py | 111 ++++++++++++++---------------------------- VariantContextFile.py | 75 ++++++++++------------------ 2 files changed, 62 insertions(+), 124 deletions(-) diff --git a/VariantContext.py b/VariantContext.py index 467350c..1c1a3e0 100644 --- a/VariantContext.py +++ b/VariantContext.py @@ -15,17 +15,13 @@ def __init__(self, varconid, sampleid, self.variant_context_origin = int(varconorigin) self.variant_context_start = int(varconstart) self.variant_context_end = int(varconend) - # Saves the acceptor reads that overlap with the entire variant - # context. + # Saves the acceptor reads that overlap with the entire variant context. self.variant_context_areads = acceptorreads - # Saves the donor reads that overlap with the entire variant - # context. + # Saves the donor reads that overlap with the entire variant context. self.variant_context_dreads = donorreads - # Saves the acceptor context (this is the context from acceptor - # reads overlapping with the variant itself). + # Saves the acceptor context (this is the context from acceptor reads overlapping with the variant itself). self.variant_acceptor_context = acceptor_context - # Saves the donor context (this is the context from donor reads - # overlapping with the variant itself). + # Saves the donor context (this is the context from donor reads overlapping with the variant itself). self.variant_donor_context = donor_context self.unmapped_acceptor_mate_ids = [] self.unmapped_donor_mate_ids = [] @@ -73,39 +69,32 @@ def get_donor_read_strings(self): dbr.get_bam_read_qual())) return list(set(donorreads)) - # Returns the acceptor context (the context from acceptor reads - # overlapping the variant itself). + # Returns the acceptor context (the context from acceptor reads overlapping the variant itself). def get_acceptor_context(self): return self.variant_acceptor_context - # Returns the donor context (the context from donor reads - # overlapping the variant itself). + # Returns the donor context (the context from donor reads overlapping the variant itself). def get_donor_context(self): return self.variant_donor_context - # Returns a list of acceptor reads overlapping with the variant - # context. + # Returns a list of acceptor reads overlapping with the variant context. def get_unmapped_acceptor_mate_ids(self): return self.unmapped_acceptor_mate_ids - # Returns a list of donor reads overlapping with the variant - # context. + # Returns a list of donor reads overlapping with the variant context. def get_unmapped_donor_mate_ids(self): return self.unmapped_donor_mate_ids - # ===METHODS TO GET DATA (REQUIRING SOME CALCULATING) OF THE=============== - # ===VARIANT CONTEXT=== + # ===METHODS TO GET DATA (REQUIRING SOME CALCULATING) OF THE VARIANT CONTEXT=============== # Returns the variant context length. def get_variant_context_length(self): return abs(self.variant_context_end - self.variant_context_start) - # Returns the distance of the variant context start position from - # the variant context origin. + # Returns the distance of the variant context start position from the variant context origin. def get_start_distance_from_origin(self): return abs(self.variant_context_origin - self.variant_context_start) - # Returns the distance of the variant context end position from the - # variant context origin. + # Returns the distance of the variant context end position from the variant context origin. def get_end_distance_from_origin(self): return abs(self.variant_context_end - self.variant_context_origin) @@ -116,8 +105,7 @@ def get_number_of_acceptor_reads(self): return 0 return len(self.variant_context_areads) - # Returns the identifiers of acceptor reads overlapping with the - # variant context. + # Returns the identifiers of acceptor reads overlapping with the variant context. def get_acceptor_read_ids(self): if self.variant_context_areads is None: return [None] @@ -129,23 +117,20 @@ def get_acceptor_read_starts(self): return [None] return [x.get_bam_read_ref_pos() for x in self.variant_context_areads] - # Returns a list of the left most positions of the R1 variant - # context accecptor BAM reads. + # Returns a list of the left most positions of the R1 variant context accecptor BAM reads. def get_acceptor_read_left_positions(self): if self.variant_context_areads is None: return [None] return [x.get_bam_read_ref_pos() for x in self.variant_context_areads if x.is_read1()] - # Returns the list of all end positions for all variant context - # acceptor reads. + # Returns the list of all end positions for all variant context acceptor reads. def get_acceptor_read_ends(self): if self.variant_context_areads is None: return [None] return [x.get_bam_read_ref_end() for x in self.variant_context_areads] - # Returns a list of the right most positions ofr the R2 variant - # context acceptor BAM read. + # Returns a list of the right most positions ofr the R2 variant context acceptor BAM read. def get_acceptor_read_right_positions(self): if self.variant_context_areads is None: return [None] @@ -157,8 +142,7 @@ def get_acceptor_read_right_positions(self): def get_number_of_donor_reads(self): return len(self.variant_context_dreads) - # Returns the identifiers of donor reads overlapping with the - # variant context. + # Returns the identifiers of donor reads overlapping with the variant context. def get_donor_read_ids(self): return list(set([x.get_bam_read_id() for x in self.variant_context_dreads])) @@ -166,8 +150,7 @@ def get_donor_read_ids(self): def get_donor_read_starts(self): return [x.get_bam_read_ref_pos() for x in self.variant_context_dreads] - # Returns the list of variant context donor read pairs left most - # positions (start pos of read 1). + # Returns the list of variant context donor read pairs left most positions (start pos of read 1). def get_donor_read_left_positions(self): return [x.get_bam_read_ref_pos() for x in self.variant_context_dreads if (x.is_read1())] @@ -176,8 +159,7 @@ def get_donor_read_left_positions(self): def get_donor_read_ends(self): return [x.get_bam_read_ref_end() for x in self.variant_context_dreads] - # Returns a list of all variant context donor reads right most - # positions (end pos of read 2). + # Returns a list of all variant context donor reads right most positions (end pos of read 2). def get_donor_read_right_positions(self): return [x.get_bam_read_ref_end() for x in self.variant_context_dreads if (x.is_read2())] @@ -191,8 +173,7 @@ def set_acceptor_context(self, acceptor_context): def set_donor_context(self, donor_context): self.variant_donor_context = donor_context - # Adds an acceptor context to the variant context by creating it - # from data values. + # Adds an acceptor context to the variant context by creating it from data values. def add_acceptor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, @@ -204,8 +185,7 @@ def add_acceptor_context(self, contextid, sampleid, acceptorreads ) - # Adds a donor context to the variant context by creating it from - # data values. + # Adds a donor context to the variant context by creating it from data values. def add_donor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, @@ -218,13 +198,11 @@ def add_donor_context(self, contextid, sampleid, ) # ===METHODS TO OBTAIN VARIANT CONTEXT UNMAPPED MATE READ DATA============= - # Returns the variant context acceptor read ids that have an - # unmapped mate. + # Returns the variant context acceptor read ids that have an unmapped mate. def get_unmapped_acceptor_read_ids(self): return self.unmapped_acceptor_mate_ids - # Returns the variant context donor read ids that have an unmapped - # mate. + # Returns the variant context donor read ids that have an unmapped mate. def get_unmapped_donor_read_ids(self): return self.unmapped_donor_mate_ids @@ -244,23 +222,19 @@ def set_unmapped_acceptor_mate_ids(self, mateids): def set_unmapped_donor_mate_ids(self, mateids): self.unmapped_donor_mate_ids = mateids - # Returns whether a specified variant context acceptor read has an - # unmapped mate. + # Returns whether a specified variant context acceptor read has an unmapped mate. def acceptor_read_has_unmapped_mate(self, readid): return readid in self.unmapped_acceptor_mate_ids - # Returns whether a specified variant context donor read has an - # unmapped mate. + # Returns whether a specified variant context donor read has an unmapped mate. def donor_read_has_unmapped_mate(self, readid): return readid in self.unmapped_donor_mate_ids - # Returns the number of variant context acceptor reads with an - # unmapped mate. + # Returns the number of variant context acceptor reads with an unmapped mate. def get_number_of_unmapped_acceptor_mates(self): return len(self.unmapped_acceptor_mate_ids) - # Returns the number of variant context donor reads with an - # unmapped mate. + # Returns the number of variant context donor reads with an unmapped mate. def get_number_of_unmapped_donor_mates(self): return len(self.unmapped_donor_mate_ids) @@ -282,13 +256,11 @@ def add_donor_context_unmapped_mate_id(self, ureadid): self.variant_donor_context.setUnmappedMateId(ureadid) # ===METHODS TO OBTAIN STATISTICS OF THE VARIANT CONTEXT=================== - # Returns the average and median quality of the acceptor reads - # associated with + # Returns the average and median quality of the acceptor reads associated with def get_average_and_median_acceptor_read_qual(self): return self.get_average_and_median_read_qual(self.variant_context_areads) - # Returns the average and median quality of the donor reads - # associated with this variant context. + # Returns the average and median quality of the donor reads associated with this variant context. def get_average_and_median_donor_read_qual(self): return self.get_average_and_median_read_qual(self.variant_context_dreads) @@ -302,13 +274,11 @@ def get_average_and_median_read_qual(self, contextreads): statistics.median(avgmedqual)]) return [None, None] - # Returns the average and median mapq values of the acceptor reads - # associated with this variant context. + # Returns the average and median mapq values of the acceptor reads associated with this variant context. def get_average_and_median_acceptor_read_mapq(self): return self.get_average_and_median_read_mapq(self.variant_context_areads) - # Returns the average and median mapq values of the donor reads - # associated with this variant context. + # Returns the average and median mapq values of the donor reads associated with this variant context. def get_average_and_median_donor_read_mapq(self): return self.get_average_and_median_read_mapq(self.variant_context_dreads) @@ -322,13 +292,11 @@ def get_average_and_median_read_mapq(self, contextreads): statistics.median(avgmedmapq)]) return [None, None] - # Returns the average and median length of the acceptor reads - # associated with this variant context. + # Returns the average and median length of the acceptor reads associated with this variant context. def get_average_and_median_acceptor_read_length(self): return self.get_average_and_median_read_length(self.variant_context_areads) - # Returns the average and median length of the donor reads - # associated with this variant context. + # Returns the average and median length of the donor reads associated with this variant context. def get_average_and_median_donor_read_length(self): return self.get_average_and_median_read_length(self.variant_context_dreads) @@ -347,13 +315,11 @@ def get_average_and_median_read_length(self, contextreads): def has_acceptor_context(self): return self.variant_acceptor_context is not None - # Returns the acceptor context identifier (should be the same as the - # variant context id). + # Returns the acceptor context identifier (should be the same as the variant context id). def get_acceptor_context_id(self): return self.variant_acceptor_context.get_context_id() - # Returns the acceptor context sample id (should be the same as the - # variant context sample id). + # Returns the acceptor context sample id (should be the same as the variant context sample id). def get_acceptor_context_sample_id(self): return self.variant_acceptor_context.get_sample_id() @@ -414,13 +380,11 @@ def get_acceptor_context_unmapped_mate_ids(self): def has_donor_context(self): return self.variant_donor_context is not None - # Returns the donor context identifier (should be the same as the - # variant context id). + # Returns the donor context identifier (should be the same as the variant context id). def get_donor_context_id(self): return self.variant_donor_context.get_context_id() - # Returns the acceptor context sample id (should be the same as the - # variant context sample id). + # Returns the acceptor context sample id (should be the same as the variant context sample id). def get_donor_context_sample_id(self): return self.variant_donor_context.get_sample_id() @@ -509,8 +473,7 @@ def to_string(self): + str(list_areads) + "\t" + str(list_dreads)) - # Returns a varconstats.txt string representation of the variant - # context. + # Returns a varconstats.txt string representation of the variant context. def to_statistics_string(self): areadlen = self.get_average_and_median_acceptor_read_length() dreadlen = self.get_average_and_median_donor_read_length() diff --git a/VariantContextFile.py b/VariantContextFile.py index 9052e8d..bfd20f9 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -151,8 +151,7 @@ def get_donor_context_reads(self, contextid): return [] # ===BASIC VARIANTCONTEXTFILE METHODS====================================== - # Reads a provided variant context file and saves data according to - # set filters. + # Reads a provided variant context file and saves data according to set filters. def read_variant_context_file(self, fileloc, samplefilter=None, IDfilter=None, chromfilter=None): try: @@ -234,29 +233,25 @@ def passes_filter(self, valtocheck, filterlist): return True # ===VARIANT CONTEXT SET OPERATIONS (UNION, INTERSECT, DIFFERENCE)========= - # Returns the list of variant context identifiers from both variant - # context files. + # Returns the list of variant context identifiers from both variant context files. def get_variant_contexts_union(self, other_varcon_file): own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) | set(other_varcon_ids)) - # Returns the list of variant context identifiers present in both - # variant context files. + # Returns the list of variant context identifiers present in both variant context files. def get_variant_contexts_intersect(self, other_varcon_file): own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) & set(other_varcon_ids)) - # Returns the list of variant context identifiers in this file but - # not present in the other variant context file. + # Returns the list of variant context identifiers in this file but not present in the other variant context file. def get_variant_contexts_difference(self, other_varcon_file): own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) - set(other_varcon_ids)) - # Returns the list of variant context identifiers only present in - # one of the variant context file but not the other. + # Returns the list of variant context identifiers only present in one of the variant context file but not the other. def get_variant_contexts_symmetric_difference(self, other_varcon_file): own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() @@ -296,8 +291,7 @@ def variant_is_in_context(self, varianttype, searchchrom, searchstop) return None - # Determines whether an SNP variant is located in an already - # existing variant context. + # Determines whether an SNP variant is located in an already existing variant context. def snp_variant_is_in_context(self, varchrom, vcfvarpos): for varcon in self.variant_contexts.values(): if varchrom == varcon.get_variant_context_chrom(): @@ -306,9 +300,8 @@ def snp_variant_is_in_context(self, varchrom, vcfvarpos): return True return False - # Determines whether an indel variant is located within an existing - # variant context (indelLeftPos and indelRightPos can be the used - # search window). + # Determines whether an indel variant is located within an existing variant context (indelLeftPos and indelRightPos + # can be the used search window). def indel_variant_is_in_context(self, indelchrom, indelleftpos, indelrightpos): for varcon in self.variant_contexts.values(): if (indelchrom == varcon.get_variant_context_chrom()): @@ -358,8 +351,7 @@ def set_acceptor_context(self, varconid, acceptor_context): if varconid in self.variant_contexts: self.variant_contexts[varconid].set_acceptor_context(acceptor_context) - # Add a newly created acceptor context to an existing variant - # context. + # Add a newly created acceptor context to an existing variant context. def add_acceptor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, @@ -388,8 +380,7 @@ def add_donor_context(self, contextid, sampleid, contextstart, contextend, donorreads) - # ===METHODS TO ADD UNMAPPED MATE IDS TO ACCEPTOR, DONOR AND VARIANT======= - # ===CONTEXT=== + # ===METHODS TO ADD UNMAPPED MATE IDS TO ACCEPTOR, DONOR AND VARIANT CONTEXT======= # Sets the unmapped mate read id for a specified acceptor context. def set_acceptor_context_unmapped_mate_ids(self, contextid, mateids): if contextid in self.variant_contexts: @@ -400,8 +391,7 @@ def set_donor_context_unmapped_mate_ids(self, contextid, mateids): if contextid in self.variant_contexts: self.variant_contexts[contextid].set_donor_context_unmapped_mates(mateids) - # Sets the acceptor unmapped mate ids for a specified variant - # context. + # Sets the acceptor unmapped mate ids for a specified variant context. def set_unmapped_acceptor_mate_ids(self, contextid, mateids): if contextid in self.variant_contexts: self.variant_contexts[contextid].set_unmapped_acceptor_mate_ids(mateids) @@ -411,45 +401,38 @@ def set_unmapped_donor_mate_ids(self, contextid, mateids): if contextid in self.variant_contexts: self.variant_contexts[contextid].set_unmapped_donor_mate_ids(mateids) - # ===METHODS TO GET UNMAPPED MATE IDS OF AN ACCEPTOR, DONOR AND============ - # ===VARIANT CONTEXT=== - # Return the unmapped read mate identifiers of a specified acceptor - # context. + # ===METHODS TO GET UNMAPPED MATE IDS OF AN ACCEPTOR, DONOR AND VARIANT CONTEXT============ + # Return the unmapped read mate identifiers of a specified acceptor context. def get_acceptor_context_unmapped_mate_ids(self, contextid): if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_acceptor_context_unmapped_mate_ids() return [] - # Returns the unmapped read mate identifiers of a specified donor - # context. + # Returns the unmapped read mate identifiers of a specified donor context. def get_donor_context_unmapped_mate_ids(self, contextid): if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_donor_context_unmapped_mate_ids() return [] - # Returns the unmapped acceptor read mate identifiers of a specified - # variant context. + # Returns the unmapped acceptor read mate identifiers of a specified variant context. def get_unmapped_acceptor_mate_ids(self, contextid): if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_unmapped_acceptor_read_ids() return [] - # Returns the unmapped donor read mate identifiers of a specified - # variant context. + # Returns the unmapped donor read mate identifiers of a specified variant context. def get_unmapped_donor_mate_ids(self, contextid): if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_unmapped_donor_read_ids() return [] # ===METHODS TO OBTAIN SOME STATISTICS ABOUT ALL THE CONTEXTS============== - # Returns the average variant context length within this variant - # context file. + # Returns the average variant context length within this variant context file. def get_average_variant_context_length(self): return statistics.mean([varcon.get_variant_context_length() for varcon in self.variant_contexts.values()]) - # Returns the median variant context length within this variant - # context file. + # Returns the median variant context length within this variant context file. def get_median_variant_context_length(self): return statistics.median([varcon.get_variant_context_length() for varcon in self.variant_contexts.values()]) @@ -475,7 +458,6 @@ def get_median_variant_context_donor_reads(self): def write_variant_context_file(self, outfileloc, samplefilter=None, varconfilter=None, chromfilter=None): try: - with open(outfileloc, "w") as varcon_outfile: varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" "Start\tEnd\tAcceptorContextLength\t" @@ -529,8 +511,7 @@ def write_acceptor_context_file(self, outfileloc, samplefilter=None, self.vaselogger.warning("Could not write acceptor contexts to " f"{ioe.filename}") - # Writes the acceptor cotnexts used to construct the variant - # contexts. + # Writes the acceptor cotnexts used to construct the variant contexts. def write_donor_context_file(self, outfileloc, samplefilter=None, contextfilter=None, chromfilter=None): try: @@ -558,8 +539,7 @@ def write_donor_context_file(self, outfileloc, samplefilter=None, self.vaselogger.warning("Could not write donor contexts to " f"{ioe.filename}") - # Writes some statistics about the acceptor and donor reads - # identified for each variant context. + # Writes some statistics about the acceptor and donor reads identified for each variant context. def write_variant_context_stats(self, statsoutloc): try: with open(statsoutloc, "w") as varcon_statsfile: @@ -574,8 +554,7 @@ def write_variant_context_stats(self, statsoutloc): self.vaselogger.critical("Could not write variant context " f"statistics to {statsoutloc}") - # Writes some statistics about the acceptor and donor reads - # identified for each variant context. + # Writes some statistics about the acceptor and donor reads identified for each variant context. def write_acceptor_context_stats(self, statsoutloc): try: with open(statsoutloc, "w") as varcon_statsfile: @@ -591,8 +570,7 @@ def write_acceptor_context_stats(self, statsoutloc): self.vaselogger.critical("Could not write acceptor context " f"statistics to {statsoutloc}") - # Writes some statistics about the acceptor and donor reads - # identified for each variant context. + # Writes some statistics about the acceptor and donor reads identified for each variant context. def write_donor_context_stats(self, statsoutloc): try: with open(statsoutloc, "w") as varcon_statsfile: @@ -608,8 +586,7 @@ def write_donor_context_stats(self, statsoutloc): self.vaselogger.critical("Coud not write donor context statistics " f"to {statsoutloc}") - # Writes the left and right positions to the output file. Left pos - # for R1 and right pos for R2. + # Writes the left and right positions to the output file. Left pos for R1 and right pos for R2. def write_left_right_positions(self, typetowrite, outfileloc): try: with open(outfileloc, "w") as lrpof: @@ -641,8 +618,7 @@ def write_left_right_positions(self, typetowrite, outfileloc): self.vaselogger.warning("Could not write read left positions to " f"output file {outfileloc}") - # Writes the acceptor context left and right positions to the output - # file. Left pos for R1 and right pos for R2. + # Writes the acceptor context left and right positions to the output file. Left pos for R1 and right pos for R2. def write_acceptor_left_right_positions(self, outfileloc): try: with open(outfileloc, "w") as lrpof: @@ -663,8 +639,7 @@ def write_acceptor_left_right_positions(self, outfileloc): self.vaselogger.warning("Could not write read left positions to " f"output file {outfileloc}") - # Writes the left and right positions to the output file. Left pos - # for R1 and right pos for R2. + # Writes the left and right positions to the output file. Left pos for R1 and right pos for R2. def write_donor_left_right_positions(self, outfileloc): try: with open(outfileloc, "w") as lrpof: From c86163fddc31297fa9829d7c94f7d26bfa1ab71a Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 2 Sep 2019 10:16:22 +0200 Subject: [PATCH 12/90] Small update to OverlapContext and VariantContextFile.py --- OverlapContext.py | 4 ++++ VariantContextFile.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/OverlapContext.py b/OverlapContext.py index a65e106..787400f 100644 --- a/OverlapContext.py +++ b/OverlapContext.py @@ -16,6 +16,10 @@ def __init__(self, variantid, sampleid, ovconchrom, ovconorigin, self.unmapped_read_mate_ids = [] # ===METHODS TO GET DATA OF THE OVERLAP CONTEXT============================ + # Returns the context data (chrom, pos, start, end) in an array + def get_context(self): + return [self.context_chrom, self.context_origin, self.context_start, self.context_end] + # Returns the context identifier. def get_context_id(self): return self.context_id diff --git a/VariantContextFile.py b/VariantContextFile.py index bfd20f9..b95e848 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -346,6 +346,11 @@ def add_variant_context(self, varconid, sampleid, acceptor_context, donor_context) self.variant_contexts[varconid] = varcon_obj + # Adds an existing variant context to the variant context file + def add_existing_variant_context(self, varconid, varconobj): + if varconobj is not None: + self.variant_contexts[varconid] = varconobj + # Adds an acceptor context object to a variant context. def set_acceptor_context(self, varconid, acceptor_context): if varconid in self.variant_contexts: From 81645d3d8fb79e9256db278d7e4b9797672bfe4a Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 2 Sep 2019 13:58:27 +0200 Subject: [PATCH 13/90] Add set of bvcs_ methods that splits build_varcon_set into multiple smaller methods --- VaSeBuilder.py | 236 +++++++++++++++++++++++++++---------------------- 1 file changed, 128 insertions(+), 108 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 2242304..8d2bed8 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -12,6 +12,8 @@ from DonorBamRead import DonorBamRead from VariantContextFile import VariantContextFile from VcfVariant import VcfVariant +from VariantContext import VariantContext +from OverlapContext import OverlapContext class VaSeBuilder: @@ -1111,33 +1113,124 @@ def run_x_mode(self, sampleidlist, donorvcfs, donorbams, acceptorbam, genomerefe # =====SPLITTING THE BUILD_VARCON_SET() INTO MULTIPLE SMALLER METHODS===== def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath, reference_loc, varcon_outpath, - variant_list): - self.vaselogger.info("Begin building the variant context file.") + variantlist): donor_vcfs_used = [] donor_bams_used = [] + variantcontexts = VariantContextFile() - # Open the acceptor bam, or exit if this fails. try: - acceptorbamfile = pysam.AlignmentFile(acceptorbamloc, reference_filename=reference_loc) + acceptorbamfile = pysam.AlignmentFile(acceptorbamloc, reference_name=reference_loc) except IOError: - self.vaselogger.critical("Could not open acceptor BAM file. Exitting.") + self.vaselogger.critical("Could not open Acceptor BAM/CRAM") exit() - # Iterate over the provided samples. - for sampleid in sampleidlist: - self.vaselogger.debug(f"Processing data for sample {sampleid}.") + # Start iterating over the samples + for sampleid in sampleidlist: + self.vaselogger.debug(f"Start processing sample {sampleid}") + sample_variant_filter = self.bvcs_set_variant_filter(sampleid, variantlist) + samplevariants = self.get_sample_vcf_variants(vcfsamplemap[sampleid], sample_variant_filter) - # Only include variants from VCF files if they are included in the provided (optional) variant list. - sample_variant_filter = self.bvcs_set_variant_filter(sampleid, variant_list) - samplevariants = self.get_sample_vcf_variants(vcfsamplemap[sampleid], sample_variant_filter) + if not samplevariants: + self.vaselogger.warning(f"No variants obtained for sample {sampleid}. Skipping sample") + continue - # Skip this sample if all of its variants got filtered out. - if not samplevariants: - self.vaselogger.warning(f"No variants obtained for sample {sampleid}. Skipping sample") - continue + # Call the method that will process the sample + self.bvcs_process_sample(sampleid, variantcontexts, acceptorbamfile, bamsamplemap[sampleid], + reference_loc, samplevariants) + + # Add the used donor VCF and BAM to the lists of used VCF and BAM files + donor_bams_used.append(vcfsamplemap[sampleid]) + donor_vcfs_used.append(bamsamplemap[sampleid]) + + # Write the output variant context output data + self.bvcs_write_output_files(outpath, varcon_outpath, variantcontexts, vcfsamplemap, bamsamplemap, + donor_vcfs_used, donor_bams_used) + + # Checks whether the program is running on debug and, if so, write some extra output files. + if self.vaselogger.getEffectiveLevel() == 10: + self.write_optional_output_files(outpath, variantcontexts) + + # Processes a sample + def bvcs_process_sample(self, sampleid, variantcontextfile, abamfile, dbamfileloc, referenceloc, samplevariants): + try: + donorbamfile = pysam.AlignmentFile(dbamfileloc, reference_name=referenceloc) + except IOError: + self.vaselogger.warning(f"Could not open {dbamfileloc} ; Skipping {sampleid}") + return + + # Iterate over the sample variants + for samplevariant in samplevariants: + variantcontext = self.bvcs_process_variant(sampleid, variantcontextfile, samplevariant, abamfile, + donorbamfile, True) + if not variantcontext: + self.vaselogger.info(f"Could not establish variant context ; Skipping.") + continue + variantcontextfile.add_existing_variant_context(variantcontext) + + # Processes a sample variant by establishing the contexts. + def bvcs_process_variant(self, sampleid, variantcontextfile, samplevariant, abamfile, dbamfile, write_unm=False): + # Establish the donor context + variantid = samplevariant.get_variant_id() + variantchrom = samplevariant.get_variant_chrom() + variantpos = samplevariant.get_variant_pos() + varianttype = samplevariant.get_variant_type() + + searchwindow = self.determine_read_search_window(varianttype, samplevariant) + + dcontext = self.bvcs_establish_context(sampleid, variantid, variantchrom, variantpos, searchwindow, dbamfile, + write_unm) + if not dcontext: + self.vaselogger.info(f"Could not establish donor context ; Skipping variant {variantid}") + return None + + acontext = self.bvcs_establish_context(sampleid, variantid, variantchrom, variantpos, searchwindow, abamfile, + write_unm, dcontext.get_context()) + vcontext = self.bvcs_establish_variant_context(sampleid, variantcontextfile, variantid, variantchrom, + variantpos, acontext, dcontext, abamfile, dbamfile, write_unm) + return vcontext + + # Establishes a variant context from an acceptor and donor context and fetches acceptor and donor reads again. + def bvcs_establish_variant_context(self, sampleid, variantcontextfile, variantid, variantchrom, variantpos, + acontext, dcontext, abamfile, dbamfile, write_unm=False): + unmapped_alist = [] + unmapped_dlist = [] + + # Determine the variant context from the acceptor and donor context + vcontext_window = self.determine_largest_context(variantpos, acontext.get_context(), dcontext.get_context()) + if variantcontextfile.context_collision(vcontext_window): + self.vaselogger.debug(f"Variant context {variantid} overlaps with an already existing context; Skipping.") + return None + + vcontext_dreads = self.get_variant_reads(variantid, variantchrom, *vcontext_window, dbamfile, write_unm, + unmapped_dlist) + vcontext_areads = self.get_variant_reads(variantid, variantchrom, *vcontext_window, abamfile, write_unm, + unmapped_alist) + variant_context = VariantContext(variantid, sampleid, *vcontext_window, vcontext_areads, vcontext_dreads, + acontext, + dcontext) + return variant_context + + # Establishes an acceptor/donor context by fetching reads (and their mates) overlapping directly with the variant + def bvcs_establish_context(self, sampleid, variantid, variantchrom, variantpos, searchwindow, bamfile, + write_unm=False, + fallback_window=None): + unmappedlist = [] + + # Fetch context reads and establish the context window + context_reads = self.get_variant_reads(variantid, variantchrom, *searchwindow, bamfile, write_unm, unmappedlist) + context_window = self.determine_context(context_reads, variantpos, variantchrom) + + # Check whether the context_window is valid and, if not, whether a fallback window has been set + if not context_window: + if fallback_window: + context_window = fallback_window + else: + return None - # Process all variants for the current sample - self.bvcs_process_variants() + # Create the context object containing all context data + adcontext = OverlapContext(variantid, sampleid, *context_window, context_reads) + adcontext.set_unmapped_mate_ids(unmappedlist) + return adcontext # Sets the variant list for a sample if applicable, otherwise return None def bvcs_set_variant_filter(self, sampleid, variant_list): @@ -1147,98 +1240,25 @@ def bvcs_set_variant_filter(self, sampleid, variant_list): sample_variant_filter = variant_list[sampleid] return sample_variant_filter - # Establishes an acceptor/donor context by fetching reads (and their mates) overlapping directly with the variant - def bvcs_establish_context(self, sampleid, variantid, variantchrom, variantpos, searchwindow, bamfile, - write_unm=False, unmappedlist=None): - # Obtain all donor BAM reads at the variant position, as well as their mates. - self.vaselogger.debug(f"Searching donor reads for variant {variantid}") - t0 = time.time() - context_reads = (self.get_variant_reads(variantid, variantchrom, *searchwindow, bamfile, write_unm, unmappedlist)) - self.vaselogger.debug("Searching donor reads for {variantid} took {time.time() - t0} seconds") - - # If no donor reads were found at this position, then skip this variant. - if not context_reads: - self.vaselogger.info(f"No reads found for variant {variantid} in donor {sampleid}; Skipping.") - return [] + # Writes the output files + def bvcs_write_output_files(self, outpath, varcon_outpath, variantcontextfile, vcfsamplemap, bamsamplemap, + used_donor_vcfs, used_donor_bams): + # Write the variant context file itself + self.vaselogger.info(f"Writing variant contexts to {varcon_outpath}") + variantcontextfile.write_variant_context_file(varcon_outpath) - # Determine the context based on the reads overlapping the variant. - self.vaselogger.debug("Determining the context for variant {variantid}") - t0 = time.time() - adcontext = (self.determine_context(context_reads, variantpos, variantchrom)) - self.vaselogger.debug("Determining the context for {variantid} took {time.time() - t0} seconds") - return adcontext + # Write the variant context statistics file + self.vaselogger.info(f"Writing variant context statistics to {outpath}varconstats.txt") + variantcontextfile.write_variant_context_stats(f"{outpath}varconstats.txt") - # Establishes a variant context from an acceptor and donor context and fetches acceptor and donor reads again. - def bvcs_estabish_variant_context(self, variantid, variantchrom, variantpos, acontext, dcontext, abamfile, - dbamfile, write_unm=False, unmappedlist=None): - # Determine the combined variant context based on the widest window from both the donor and acceptor positions. - self.vaselogger.debug("Establishing combined variant context for variant {variantid}") - t0 = time.time() - variant_context = self.determine_largest_context(variantpos, acontext, dcontext) - self.vaselogger.debug(f"Establishing the combined variant context for variant {variantid} took " - f"{time.time() - t0} seconds") - - # If this widest context overlaps an existing variant context, skip it. - if self.contexts.context_collision(variant_context): - self.vaselogger.debug(f"Variant context {variantid} overlaps with an already existing context; Skipping.") - return [] + # Write the variant contexts as a BED file + self.vaselogger.info(f"Writing the variant contexts to a BED file") + self.write_bed_file(variantcontextfile.get_variant_contexts()) - # Obtain all donor reads overlapping the combined variant context, as well as their mates. - self.debug_msg("cdr", variantid) - t0 = time.time() - variant_context_donor_reads = ( - self.get_variant_reads(variantid, variant_context[0], variant_context[2], variant_context[3], dbamfile, - True, varcon_unmapped_d)) - self.debug_msg("cdr", variantid, t0) - - # Obtain all acceptor reads overlapping the combined variant context, as well as their mates. - self.debug_msg("car", variantid) - t0 = time.time() - variant_context_acceptor_reads = ( - self.get_variant_reads(variantid, variant_context[0], variant_context[2], variant_context[3], - abamfile, True, varcon_unmapped_a)) - self.debug_msg("car", variantid, t0) - - # Processes a VCF/BCF variant by establishing contexts - def bvcs_process_variant(self): - acontext_unmapped = [] - dcontext_unmapped = [] - varcontext_acceptor_unmapped = [] - varcontext_donor_unmapped = [] - - # Establish the acceptor, donor and variant contexts - # self.bvcs_establish_context() - # self.bvcs_estabish_variant_context() - - # Processes multiple VCF/BCF variants for one sample - # def bvcs_process_variants(self): - - # Writes the variant context output files to a specified location - # def bvcs_write_output_files(self): - - # =====REFETCH VARIANT CONTEXT DONOR READS FROM A READ VARIANT CONTEXT FILE - def refetch_donor_reads(self, variantcontextfile, bamsamplemap, reference_loc): - # Get variant contexts per sample id. - sample_list = variantcontextfile.get_variant_contexts_by_sampleid() - - # Iterate through the sample IDs, opening each associated BAM file, or skipping it if it cannot be opened. - for sampleid in sample_list: - try: - bamfile = pysam.AlignmentFile(bamsamplemap[sampleid], reference_filename=reference_loc) - self.vaselogger.debug(f"Opened BAM file {bamsamplemap[sampleid]}") - except IOError: - self.vaselogger.warning(f"Could not open data files for sample {sampleid}. Skipping sample.") - continue + # Write a listfile of the used variant (VCF/BCF) files + self.vaselogger.info(f"Writing the used donor variant files per sample to {outpath}donor_variant_files.txt") + self.write_used_donor_files(f"{outpath}donorvcfs.txt", vcfsamplemap, used_donor_vcfs) - # For each variant context of the current sample, refetch reads from the context. - for context in sample_list[sampleid]: - context.variant_context_dreads = ( - self.get_variant_reads(context.context_id, - context.variant_context_chrom, - context.variant_context_start, - context.variant_context_end, - bamfile) - ) - # Close the sample's BAM file when done. - bamfile.close() - return + # Write a listfile of the used alignment (BAM/CRAM) files + self.vaselogger.info(f"Writing the used donor alignment files per sample to {outpath}donor_alignment_files.txt") + self.write_used_donor_files(f"{outpath}donor_alignment_files.txt", bamsamplemap, used_donor_bams) From d656fde09de0cba247d08bd56c08e65345fc184e Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 2 Sep 2019 14:04:08 +0200 Subject: [PATCH 14/90] Remove A and XC from valid_runmodes --- vase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vase.py b/vase.py index 1334f9a..c623bbc 100644 --- a/vase.py +++ b/vase.py @@ -22,7 +22,7 @@ def __init__(self): "higher" assert (int(pysam.version.__version__.split(".")[0]) >= 0 and int(pysam.version.__version__.split(".")[1]) >= 15), "Please run this program with Pysam 0.15 or higher" - self.valid_runmodes = ["A", "AC", "D", "DC", "F", "FC", "P", "PC", "X", "XC"] + self.valid_runmodes = ["AC", "D", "DC", "F", "FC", "P", "PC", "X"] # Runs the program. def main(self): From 6ce2ed7758ec79c73f6ff8a88b540aef99f46f79 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 2 Sep 2019 14:08:41 +0200 Subject: [PATCH 15/90] Put get_vase_parameters in PEP8 format --- vase.py | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/vase.py b/vase.py index c623bbc..753aa6a 100644 --- a/vase.py +++ b/vase.py @@ -108,21 +108,36 @@ def start_logger(self, paramcheck, logloc, debug_mode=False): def get_vase_parameters(self): # Set the VaSe parameters for the program. vase_argpars = argparse.ArgumentParser() - vase_argpars.add_argument("-m", "--runmode", dest="runmode", default="F", choices=self.valid_runmodes, help="RUNMODE HELP") - vase_argpars.add_argument("-v", "--donorvcf", dest="donorvcf", help="File containing a list of VCF/VCF.GZ/BCF files.") + vase_argpars.add_argument("-m", "--runmode", dest="runmode", default="F", choices=self.valid_runmodes, + help="RUNMODE HELP") + vase_argpars.add_argument("-v", "--donorvcf", dest="donorvcf", + help="File containing a list of VCF/VCF.GZ/BCF files.") vase_argpars.add_argument("-b", "--donorbam", dest="donorbam", help="File containing a list of BAM/CRAM files.") - vase_argpars.add_argument("-a", "--acceptorbam", dest="acceptorbam", help="BAM file for identifying acceptor reads to exclude.") - vase_argpars.add_argument("-1", "--templatefq1", dest="templatefq1", nargs="*", help="Location and name of the first fastq in file.") - vase_argpars.add_argument("-2", "--templatefq2", dest="templatefq2", nargs="*", help="Location and name of the second fastq in file.") + vase_argpars.add_argument("-a", "--acceptorbam", dest="acceptorbam", + help="BAM file for identifying acceptor reads to exclude.") + vase_argpars.add_argument("-1", "--templatefq1", dest="templatefq1", nargs="*", + help="Location and name of the first fastq in file.") + vase_argpars.add_argument("-2", "--templatefq2", dest="templatefq2", nargs="*", + help="Location and name of the second fastq in file.") vase_argpars.add_argument("-o", "--out", dest="out", help="Directory to write output files to.") - vase_argpars.add_argument("-r", "--reference", dest="reference", help="Location of the reference genome. This reference genome should be used by all VCF/BCF and BAM/CRAM files.") - vase_argpars.add_argument("-of", "--fastqout", dest="fastqout", help="Name for the two FastQ files to be produced.") - vase_argpars.add_argument("-ov", "--varcon", dest="varcon", help="File name to write variants and their contexts to.") - vase_argpars.add_argument("-l", "--log", dest="log", help="Location to write log files to (will write to working directory if not used).") - vase_argpars.add_argument("-!", "--debug", dest="debug", action="store_true", help="Run the program in debug mode") - vase_argpars.add_argument("-vl", "--variantlist", dest="variantlist", help="File containing a list of variants to use. Will only use these variants if provided. Will use all variants if no list is provided.") - vase_argpars.add_argument("-iv", "--varconin", dest="varconin", help="Provide a Vasebuilder output variant context file to build a validation set.") - vase_argpars.add_argument("-dq", "--donorfastqs", dest="donorfastqs", help="Location to donor fastq list file") + vase_argpars.add_argument("-r", "--reference", dest="reference", + help="Location of the reference genome. This reference genome should be used by all" + "VCF/BCF and BAM/CRAM files.") + vase_argpars.add_argument("-of", "--fastqout", dest="fastqout", + help="Name for the two FastQ files to be produced.") + vase_argpars.add_argument("-ov", "--varcon", dest="varcon", + help="File name to write variants and their contexts to.") + vase_argpars.add_argument("-l", "--log", dest="log", + help="Location to write log files to (will write to working directory if not used).") + vase_argpars.add_argument("-!", "--debug", dest="debug", action="store_true", + help="Run the program in debug mode") + vase_argpars.add_argument("-vl", "--variantlist", dest="variantlist", + help="File containing a list of variants to use. Will only use these variants if " + "provided. Will use all variants if no list is provided.") + vase_argpars.add_argument("-iv", "--varconin", dest="varconin", + help="Provide a Vasebuilder output variant context file to build a validation set.") + vase_argpars.add_argument("-dq", "--donorfastqs", dest="donorfastqs", + help="Location to donor fastq list file") vase_argpars.add_argument("-c", "--config", dest="configfile", help="Supply a config file") vase_args = vars(vase_argpars.parse_args()) return vase_args From a460db2120edece374464dbee89473abe9ef7b2c Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 4 Sep 2019 13:05:06 +0200 Subject: [PATCH 16/90] Start adding docstrings to code for documentation purposes --- DonorBamRead.py | 28 +++++++++++++++++++-- OverlapContext.py | 64 +++++++++++++++++++++++++++++++++++++++-------- ParamChecker.py | 7 ++++++ ReadIdObject.py | 16 ++++++++++++ VcfBamScanner.py | 55 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 155 insertions(+), 15 deletions(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index 5a9d38d..3252a17 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -2,6 +2,9 @@ class DonorBamRead: + """The DonorBamRead is used to save data from an aligned read extracted from a BAM or CRAM file. + + DonorBamRead can be used to save the data from aligned reads that were extracted from a BAM or CRAM file.""" # Saves the required BAM read data. def __init__(self, readid, readpn, readchrom, readstart, readlen, readseq, readquals, mapqual): @@ -17,41 +20,52 @@ def __init__(self, readid, readpn, readchrom, readstart, # ===METHODS TO GET SAVED DATA FROM THE DONORBAMREAD======================= # Returns the BAM read identifier. def get_bam_read_id(self): + """Returns the identifier of the read.""" return self.bam_read_id # Returns the BAM read pair number (1 or 2). def get_bam_read_pair_number(self): + """Returns the pair number of the read. + This is 1 for forward, or R1, reads and 2 for reverse, or R2, reads""" return self.bam_read_pairnum # Returns the BAM read chromosome. def get_bam_read_chrom(self): + """Returns the name of the chromosome where the read has been mapped.""" return self.bam_read_chrom # Returns the BAM read starting position on the reference sequence. def get_bam_read_ref_pos(self): + """Returns the genomic mapping position (the left most position) of the read.""" return self.bam_read_ref_pos # Returns the BAM read length. def get_bam_read_length(self): + """Returns the length of the read. In VaSeBuilder this length is calculated by means of the CIGAR string by + using the pysam method infer_read_length()""" return self.bam_read_length - # Returns the BAM read ending position on the reference (calculated - # as starting position + the length of the read). + # Returns the BAM read ending position on the reference (calculated as starting position + the length of the read). def get_bam_read_ref_end(self): + """Returns the """ if self.bam_read_length is not None: return self.bam_read_ref_pos + self.bam_read_length return -1 # Returns the BAM read sequence. def get_bam_read_sequence(self): + """Returns the read sequence as a String.""" return self.bam_read_seq # Returns the BAM read quality scores. def get_bam_read_qual(self): + """Returns the """ return self.bam_read_qual # Returns the BAM read quality as an array of Q-Scores. def get_bam_read_q_scores(self): + """Converts the String of ASCII quality scores to a list Q-Scores. The Q-Score for each ASCII quality character + are calculated by obtaining the unicode code point and subtracting 33.""" qscores = [] for qualSymbol in self.bam_read_qual: qscores.append(ord(qualSymbol)-33) @@ -59,31 +73,37 @@ def get_bam_read_q_scores(self): # Returns the maping quality of the BAM read. def get_mapping_qual(self): + """Returns the mapping quality that was assigned to the read.""" return self.bam_read_map_qual # ===METHOD TO GET STATISTICS DATA FROM THE DONORBAMREAD=================== # Returns the average Q-Score. def get_average_qscore(self): + """Calculates and returns the mean Q-Score of the read.""" qscores = self.get_bam_read_q_scores() return statistics.mean(qscores) # Returns the median Q-Score. def get_median_qscore(self): + """Calculates and returns the median Q-Score of the read.""" qscores = self.get_bam_read_q_scores() return statistics.median(qscores) # ===METHODS TO CHECK WHETHER THE BAM READ IS R1 OR R2===================== # Returns if the BAM read is the first (forward) read. def is_read1(self): + """Returns whether the read is the forward/R1 read by checking if the pair number is set to '1'.""" return self.bam_read_pairnum == "1" # Returns if the BAM read is the second (reverse) read. def is_read2(self): + """Returns whether the read is the reverse/R2 read by checking if the pair number is set to '2'.""" return self.bam_read_pairnum == "2" # ===METHODS TO RETURN A STRING REPRESENTATION OF THE DONORBAMREAD OBJECT== # Returns a String representation. def to_string(self): + """Returns a String with all the saved data, separated by tabs, of the read.""" return (str(self.bam_read_id) + "\t" + str(self.bam_read_pairnum) + "\t" + str(self.bam_read_chrom) + "\t" @@ -95,6 +115,10 @@ def to_string(self): # Returns the BAM read as a fastq sequence. def get_as_fastq_seq(self, addpairnum=False): + """Returns the read as a FastQ entry. + + If the 'addpairnum' argument is set to True the read pair number will be added at the end of the read + identifier as '/1' or '/2', depending on whether the read is the forward/R1 or reverse/R2 read.""" if addpairnum: return ("@" + str(self.bam_read_id) + "/" + str(self.bam_read_pairnum) + "\n" + str(self.bam_read_seq) + "\n" diff --git a/OverlapContext.py b/OverlapContext.py index 787400f..18583b0 100644 --- a/OverlapContext.py +++ b/OverlapContext.py @@ -2,8 +2,11 @@ class OverlapContext: - # Saves the data associated with the context overlapping with the - # variant. + """The OverlapContext saves an acceptor or donor context. + + The OverlapContext can be used to save an acceptor or donor context with reads that overlap with the variant + directly.""" + # Saves the data associated with the context overlapping with the variant. def __init__(self, variantid, sampleid, ovconchrom, ovconorigin, ovconstart, ovconend, bamreads): self.context_id = variantid @@ -18,76 +21,96 @@ def __init__(self, variantid, sampleid, ovconchrom, ovconorigin, # ===METHODS TO GET DATA OF THE OVERLAP CONTEXT============================ # Returns the context data (chrom, pos, start, end) in an array def get_context(self): + """Returns the essential data of the context. + + This data consists of the chromosome name the context is located on, the variant position the context is based + on and the leftmost (start) and rightmost (end) genomic position of the context.""" return [self.context_chrom, self.context_origin, self.context_start, self.context_end] # Returns the context identifier. def get_context_id(self): + """Returns the context identifier.""" return self.context_id # Returns the sample identifier the context was based on. def get_sample_id(self): + """Returns the identifier of the sample the context is """ return self.sample_id # Returns the context chromosome name the context is located on. def get_context_chrom(self): + """Returns the chromosome name of the context.""" return self.context_chrom # Returns the origin position of the context. def get_context_origin(self): + """Returns the the position of the variant that was used to construct the context.""" return self.context_origin # Returns the context start position. def get_context_start(self): + """Returns the left most genomic position of the context.""" return self.context_start # Returns the end position of the context. def get_context_end(self): + """Returns the right most genomic position of the context.""" return self.context_end - # Returns the bam reads associated with the context as a list of - # BamRead objects. + # Returns the bam reads associated with the context as a list of BamRead objects. def get_context_bam_reads(self): + """Returns the list of reads and their mates overlapping with the context. + + Mates of reads overlapping with the context might not overlap with the context themselves. Each read is returned + as a DonorBamRead object.""" return self.context_bam_reads # Returns the list of BAM read ids that have an unmapped mate. def get_unmapped_read_mate_ids(self): + """Returns a list with identifiers of reads that have an unmapped mate.""" return self.unmapped_read_mate_ids - # ===METHODS TO GET DATA (REQUIRING SOME CALCULATION) FROM THE============= - # ===OVERLAP CONTEXT=== - # Returns the lengt of the context. + # ===METHODS TO GET DATA (REQUIRING SOME CALCULATION) FROM THE OVERLAP CONTEXT============= + # Returns the length of the context. def get_context_length(self): + """Subtracts the left most genomic position of the context from the right most and returns the difference.""" return abs(self.context_end - self.context_start) # Returns the distance of the context start from the context origin. def get_start_distance_from_origin(self): + """Subtracts the left most position of the context from the variant position and returns the difference.""" return abs(self.context_origin - self.context_start) # Returns the distance of the context end from the context origin. def get_end_distance_from_origin(self): + """Subtracts the variant position from the rightmost position of the context and returns the difference.""" return abs(self.context_end - self.context_origin) # ===METHODS TO OBTAIN CONTEXT READ INFORMATION============================ # Returns the number of saved context reads. def get_number_of_context_reads(self): + """Returns the number of reads associated with the context.""" if self.context_bam_reads is None: return 0 return len(self.context_bam_reads) # Returns a list of BAM read identifiers in the current context. def get_context_bam_read_ids(self): + """Returns a list of the identifiers of reads associated with the context.""" if self.context_bam_reads is None: return [None] return list(set([x.get_bam_read_id() for x in self.context_bam_reads])) # Returns a list of all left positions for all BAM reads. def get_context_bam_read_starts(self): + """Returns a list of all let most position of all reads associated with the context.""" if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_pos() for x in self.context_bam_reads] # Returns a list of all left positions for all R1 BAM reads. def get_context_bam_read_left_positions(self): + """Returns a list with the left most positions for all forward/R1 reads associated with the context.""" if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_pos() @@ -95,12 +118,14 @@ def get_context_bam_read_left_positions(self): # Returns a list of BAM read ending positions for all BAM reads. def get_context_bam_read_ends(self): + """Returns a list with the rightmost genomic positions of all reads associated with the context.""" if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_end() for x in self.context_bam_reads] # Returns a list of all right positions for all R2 BAM reads. def get_context_bam_read_right_positions(self): + """Returns a list with the rightmost genomic positions for all reverse/R2 reads associated with the context.""" if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_end() @@ -108,10 +133,12 @@ def get_context_bam_read_right_positions(self): # Returns a list of all lengths for all BAM reads. def get_context_bam_read_lengths(self): + """Returns a list with the lengths of all reads associated with the context.""" return [x.get_bam_read_length() for x in self.context_bam_reads] # Returns a list of BAM read sequences in the current context. def get_context_bam_read_seqs(self): + """Returns a list with the sequences of all reads associated with the context.""" return [x.get_bam_read_sequence() for x in self.context_bam_reads] # Returns a list of qualities of all BAM reads. @@ -120,15 +147,18 @@ def get_context_bam_read_qualities(self): # Returns a list of Q-scores of all BAM reads. def get_context_bam_read_q_scores(self): + """Returns a list with the Q-Scores of all reads associated with the context. + + The Q-Scores for each read is an array of integers. The returned list is therefore a list with lists.""" return [x.get_bam_read_q_scores() for x in self.context_bam_reads] # Returns a list of all BAM read MapQ values. def get_context_bam_read_map_qs(self): return [x.get_mapping_qual() for x in self.context_bam_reads] - # Returns whether a BAM read is in the context based on the provided - # read identifier. + # Returns whether a BAM read is in the context based on the provided read identifier. def read_is_in_context(self, readid): + """Checks whether a read specified by an id is associated with the context.""" if self.context_bam_reads is None: return False return readid in self.get_context_bam_read_ids() @@ -136,19 +166,23 @@ def read_is_in_context(self, readid): # ===METHODS TO ADD/SET CONTEXT DATA======================================= # Adds the read id of a BAM read with an unmapped mate. def add_unmapped_mate_id(self, ureadid): + """Adds the identifier of a read with an unmapped mate to the context unmapped mates list.""" self.unmapped_read_mate_ids.append(ureadid) # Sets the list of unmapped read mate ids. def set_unmapped_mate_ids(self, mateids): + """Sets the provided read id list as the unmapped mate id list associated with the context.""" self.unmapped_read_mate_ids = mateids # Returns whether a BAM read in the context has an unmapped mate. def read_has_unmapped_mate(self, readid): + """Checks if a read specified by a read identifier has an unmapped mate and returns True or False.""" return readid in self.unmapped_read_mate_ids # ===STATISTICS METHODS FOR A VARIANT CONTEXT============================== # Returns the average and median read length. def get_average_and_median_read_length(self): + """Calculates and return the mean and median red length of all reads associated with the context.""" if self.context_bam_reads is None: return [None, None] avgmedlen = [] @@ -159,6 +193,7 @@ def get_average_and_median_read_length(self): # Returns the average and median read quality. def get_average_and_median_read_qual(self): + """Calculates and returns the mean and median Q-Score of all reads associated with the context.""" if self.context_bam_reads is None: return [None, None] avgmedqual = [] @@ -168,6 +203,7 @@ def get_average_and_median_read_qual(self): # Returns the average and median read MapQ of this variant context. def get_average_and_median_read_map_q(self): + """Calculates the mean and median MAPQ value of all reads associated witht the context.""" if self.context_bam_reads is None: return [None, None] avgmedmapq = [] @@ -178,6 +214,7 @@ def get_average_and_median_read_map_q(self): # ===SOME OTHER METHODS==================================================== # Returns a string representation of the overlap context. def to_string(self): + """Returns a String representation of the context.""" if self.context_bam_reads is None: bamids = None countbamreads = 0 @@ -195,6 +232,7 @@ def to_string(self): # Returns a statistics string representation of the overlap context. def to_statistics_string(self): + """Calculates some basic statistics of the context in a tab separated String.""" avgmedlens = self.get_average_and_median_read_length() avgmedquals = self.get_average_and_median_read_qual() avgmedmapq = self.get_average_and_median_read_map_q() @@ -202,9 +240,13 @@ def to_statistics_string(self): f"{avgmedquals[0]}\t{avgmedquals[1]}\t{avgmedmapq[0]}\t" f"{avgmedmapq[1]}") - # Compares the current OverlapContext to another OverlapContext and - # returns the differences. + # Compares the current OverlapContext to another OverlapContext and returns the differences. def compare(self, other_overlap_context): + """Compares the current context to a provided context and returns the differences. + + The context is compared to another context on each aspect. If they differ, the difference is saved in a + dictionary. The key is a numeric value, the difference an array. The first entry is the value of the current + context, the second entry is the value of the other context.""" differences = {} if self.context_id != other_overlap_context.get_context_id(): differences[1] = [self.context_id, diff --git a/ParamChecker.py b/ParamChecker.py index 811eec9..d59ca71 100644 --- a/ParamChecker.py +++ b/ParamChecker.py @@ -257,10 +257,12 @@ def get_valid_vcf_filelist(self): # Returns the list of valid BAM folders. def get_valid_bam_filelist(self): + """Returns the location of the text file containing a list of donor alignment files to use.""" return self.bam_filelist # Returns the location of the acceptor BAM file location. def get_acceptor_bam(self): + """Returns the location of the BAM/CRAM file that will be used as the acceptor.""" return self.acceptorbam # Returns the location and name of the first (R1) fastq input file location. @@ -277,12 +279,14 @@ def get_fastq_in_locations(self): # Returns the location to write the output to. def get_out_dir_location(self): + """Returns the directory to write the output files. Adds '/' if not present in the ouput directory location.""" if self.outdir.endswith("/"): return self.outdir return self.outdir + "/" # Returns the location of the reference fasta file def get_reference_file_location(self): + """Returns the location of the genomic reference file.""" return self.reference_file # Returns the location of the FastQ file that will be produced by VaSeBuilder. @@ -299,6 +303,7 @@ def get_variant_context_out_location(self): # Retuns the location to write the log file(s) to. def get_log_file_location(self): + """Returns the location the log file will be written to.""" return self.log_location # Returns the variant list location @@ -311,10 +316,12 @@ def get_donorfqlist(self): # Returns the specified runmmode def get_runmode(self): + """Returns the value of the runmode parameter.""" return self.runmode # Returns the location of the variant context in file def get_variantcontext_infile(self): + """Returns the saved value of the varconin CLI parameter.""" return self.varconin # Returns the list of required parameters for a specified runmode diff --git a/ReadIdObject.py b/ReadIdObject.py index cc79f7e..b935070 100644 --- a/ReadIdObject.py +++ b/ReadIdObject.py @@ -1,20 +1,36 @@ class ReadIdObject: + """The ReadIdObject is used to save the read identifiers. + + This class is used when a VariantContextFile is read. DonorBamRead can't be used as most data is missing.""" def __init__(self, readid): + """Constructor saves the provided read identifier and sets the pairnumber to a default empty value.""" self.read_id = readid self.pair_number = "" # Returns the name/identifier of the read id object def get_bam_read_id(self): + """Returns the identifier of the read.""" return self.read_id # Sets the name/identifier of the read id object def set_bam_read_id(self, readid): + """Sets the identifier of a read.""" self.read_id = readid # Return the pair number of the read id object def get_bam_read_pair_number(self): + """Returns the read pair number of the read.""" return self.pair_number # Sets the pair number of the read id object def set_bam_read_pair_number(self, pairnum): + """Sets the read pair number of the read.""" self.pair_number = pairnum + + def is_read1(self): + """Returns whether the read is the forward/R1 read.""" + return self.pair_number == "1" + + def is_read2(self): + """Returns whether the read is the reverse/R2 read.""" + return self.pair_number == "2" diff --git a/VcfBamScanner.py b/VcfBamScanner.py index e47bb90..85bff65 100644 --- a/VcfBamScanner.py +++ b/VcfBamScanner.py @@ -6,8 +6,14 @@ class VcfBamScanner: + """VcfBamScanner offers functionality to scan variant (VCF/BCF) and alignment (BAM/CRAM) files. + + The VcfBamScanner can be used to scan folders for variant and alignments files, check whether variant and alignment + files have sample names and extract them. A list file containing paths to either variant or alignment files can + also be.scanned. Files within the list files will be a saved per sample name in an identifier.""" # Constructor that creates two empty hashmaps (dicts). def __init__(self): + """Obtains the vase logger and creates two empty dictionaries to save the variant and alignment files.""" self.vaselogger = logging.getLogger("VaSe_Logger") self.vcf_sample_map = {} self.bam_sample_map = {} @@ -16,14 +22,20 @@ def __init__(self): # ===METHODS TO GET SAVED DATA FROM THE VCFBAMSCANNER====================== # Returns the map that links VCF files and samples. def get_vcf_sample_map(self): + """Returns a dictionary with VCF/BCF files saved per sample.""" return self.vcf_sample_map # Returns the map that links BAM files and samples. def get_bam_sample_map(self): + """Returns a dictionary with BAM/CRAM files saved per sample.""" return self.bam_sample_map # Returns a map that links VCF files to BAM files. def get_vcf_to_bam_map(self): + """Returns a dictionary with a variant file linked to an alignment file. + + For each sample, the associated variant file is linked to the associated.alignment file are linked via a + dictionary. The variant and alignment file are saved as key and value respectively.""" vcf_to_bam_map = {} for sampleid in self.vcf_sample_map: if sampleid in self.bam_sample_map: @@ -34,6 +46,11 @@ def get_vcf_to_bam_map(self): # Scans the folders containing VCF files and returns a map that # links sample ids with vcf files. def scan_vcf_folders(self, vcffolders): + """Scans a list of folders containing variant files and returns a dictionary with valid files per sample name. + + Iterates over a list of paths to folders containing VCF/BCF files. A non-existing folder is skipped. For each + variant file in a valid folder, the sample name is extracted.and the file is saved under its sample name in a + dictionary.""" self.vaselogger.info("Start scannig VCF files") for vcffolder in vcffolders: @@ -68,9 +85,13 @@ def scan_vcf_folders(self, vcffolders): self.vaselogger.info("Finished scanning VCF files") return self.vcf_sample_map - # Scans the folders containing BAM files and returns a map that - # links sample ids with bam files. + # Scans the folders containing BAM files and returns a map that links sample ids with bam files. def scan_bam_folders(self, bamfolders): + """Scans a list of folders containing alignment files and returns a dictionary with valid files per sample name. + + Iterates over a list of paths to folders containing BAM/CRAM files. Non-existing folders are skipped. For each + alignment file in a valid folder, the sample name is extracted.and the file is saved under its sample name in a + dictionary.""" self.vaselogger.info("Start scanning BAM files") # Scan BAM files in all provided folders. @@ -107,11 +128,13 @@ def scan_bam_folders(self, bamfolders): # Scans the provided BAM/CRAM files from a provided donor list file def scan_bamcram_files(self, bamcramlistfile): + """Reads a list file containing the locations of BAM/CRAM files and saves them per sample.""" self.bam_sample_map = self.read_donorlistfile(bamcramlistfile) return self.bam_sample_map # Scan the provided VCF files from a provided donor list file def scan_vcf_files(self, vcflistfile): + """Reads a list file containing the locations of VCF/BCF files and saves them per sample name.""" self.vcf_sample_map = self.read_donorlistfile(vcflistfile, "v") return self.vcf_sample_map @@ -137,6 +160,7 @@ def read_donorlistfile(self, listfileloc, listtype="a"): # Checks whether the BAM file contains a sample name. def bam_has_sample_name(self, bamfile): + """Checks and returns whether a BAM/CRAM file has a sample name/identifier.""" if "RG" in bamfile.header: if len(bamfile.header["RG"]) > 0: if "SM" in bamfile.header["RG"][0]: @@ -145,12 +169,14 @@ def bam_has_sample_name(self, bamfile): # Checks whether the VCF file has a sample name def vcf_has_sample_name(self, vcffile): + """Checks and returns whether the VCF/BCF file has a sample name/identifier.""" if vcffile.header.samples[0] != "": return True return False # Returns the first sample identifier of the VCF file (as for now we only work with one sample per file def get_vcf_sample_name(self, vcffileloc): + """Extracts and returns the list of sample names from a specified VCF file.""" vcffile = pysam.VariantFile(vcffileloc, "r") if self.vcf_has_sample_name(vcffile): return list(vcffile.header.samples) @@ -158,6 +184,11 @@ def get_vcf_sample_name(self, vcffileloc): # Returns the BAM sample def get_bam_sample_name(self, bamfileloc): + """Extracts the sample name/identifier from a BAM file. + + The sample name is extracted by first checking whether the BAM file contains a sample name via the + VcfBamScanner method bam_has_sample_name(). If so, the sample name is extracted from the BAM file by returning + the value of the 'SM' field from the header line starting with '@RG'.""" try: bamfile = pysam.AlignmentFile(bamfileloc, "rb") if self.bam_has_sample_name(bamfile): @@ -170,6 +201,12 @@ def get_bam_sample_name(self, bamfileloc): # Returns the CRAM sample name def get_cram_sample_name(self, cramfileloc): + """Extracts the sample name/identifier from a specified CRAM file. + + The sample name is extracted by first checking whether the CRAM file contains a sample name via the + VcfBamScanner method cram_has_sample_name(). If so, the sample name is extracted from the CRAM file by + returning the value of the 'SM' field from the header line starting with '@RG'. None is returned if the CRAM + file has no sample name.""" try: cramfile = pysam.AlignmentFile(cramfileloc, "rc") if self.bam_has_sample_name(cramfile): @@ -181,6 +218,7 @@ def get_cram_sample_name(self, cramfileloc): # General methods that returns one or more sample ids (in case of a VCF) def get_sample_id(self, donorfileloc, donorlisttype): + """Returns the sample name/identifier for a specified alignemt or variant file.""" donor_file_type = self.get_donor_file_type(donorfileloc) if donorlisttype == "a": if donor_file_type[1] == "BAM": @@ -193,16 +231,25 @@ def get_sample_id(self, donorfileloc, donorlisttype): # Returns the filetype of a donor file def get_donor_file_type(self, donorfileloc): + """Determines and returns the file type of a specified donor file. + + To determine the file type, the linux command 'file' is used which returns a String containing the file type + info. The received String is split on spaces and the resulting list is returned.""" filetype_proc = subprocess.Popen(["file", "-b", "-z", donorfileloc], stdout=subprocess.PIPE) filetype_data, filetype_err = filetype_proc.communicate() return filetype_data.decode().split(" ") # Returns a list of sample identifiers that have both a VCF and a BAM/CRAM def get_complete_sample_ids(self): + """Determines which samples have a variant and alignment file and the list of identifiers.""" return set(self.vcf_sample_map.keys()) & set(self.bam_sample_map.keys()) # Extracts the sequence names from an already opened BAM/CRAM file def get_alignment_sequence_names(self, alignment_file): + """Extracts and returns the chromosome names from an alignment file. + + Chromosome names are extracted from a specified BAM or CRAM file by checking the headers for the 'SQ' tag. If + present the SQ entry is checked for the 'SN' field which contains the specific chromosome names.""" sequence_names = set() if "SQ" in alignment_file.header: for sn_entry in alignment_file.header["SQ"]: @@ -211,6 +258,10 @@ def get_alignment_sequence_names(self, alignment_file): # Extracts the sequence names from an already opened VCF/BCF file def get_variant_sequence_names(self, variant_file): + """Extracts and returns the chromosome names from a variant file. + + Chromosome names are extracted from a specified VCF or BCF file by obtaining the header contig names via the + pysam method .header.contigs().""" sequence_names = set() if len(variant_file.header.contigs) > 0: for seqname in list(variant_file.header.contigs): From 8ccc78255d9afa5f32bd01531d29d47a043fa2e9 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 4 Sep 2019 17:07:54 +0200 Subject: [PATCH 17/90] Add more docstrings, start changing existing docstrings to numpy style --- DonorBamRead.py | 77 +++++++++++++++++++++++++++++++++++++++++++----- VcfBamScanner.py | 3 +- vase.py | 30 +++++++++++++++++++ 3 files changed, 102 insertions(+), 8 deletions(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index 3252a17..c481957 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -8,6 +8,27 @@ class DonorBamRead: # Saves the required BAM read data. def __init__(self, readid, readpn, readchrom, readstart, readlen, readseq, readquals, mapqual): + """Saves all required data of an aligned read from a BAM or CRAM file. + + Parameters + ---------- + readid : str + The read identifier + readpn : str + The read pair number + readchrom : str + The chromosome name the read is located on + readstart : int + The leftmost genomic position of the mapped read + readlen : int + The total length of the read + readseq : str + The read sequence + reqdquals : str + The qualities of the read in ascii + mapqual : int + The MAPQ mapping quality + """ self.bam_read_id = readid self.bam_read_pairnum = readpn self.bam_read_chrom = readchrom @@ -20,29 +41,59 @@ def __init__(self, readid, readpn, readchrom, readstart, # ===METHODS TO GET SAVED DATA FROM THE DONORBAMREAD======================= # Returns the BAM read identifier. def get_bam_read_id(self): - """Returns the identifier of the read.""" + """Returns the identifier of the read. + + Returns + ------- + str + The read identifier + """ return self.bam_read_id # Returns the BAM read pair number (1 or 2). def get_bam_read_pair_number(self): """Returns the pair number of the read. - This is 1 for forward, or R1, reads and 2 for reverse, or R2, reads""" + This is 1 for forward, or R1, reads and 2 for reverse, or R2, reads. + + Returns + ------- + str + The read pair number + """ return self.bam_read_pairnum # Returns the BAM read chromosome. def get_bam_read_chrom(self): - """Returns the name of the chromosome where the read has been mapped.""" + """Returns the name of the chromosome where the read has been mapped. + + Returns + ------- + str + The chromosome name where the read is mapped + """ return self.bam_read_chrom # Returns the BAM read starting position on the reference sequence. def get_bam_read_ref_pos(self): - """Returns the genomic mapping position (the left most position) of the read.""" + """Returns the genomic mapping position (the left most position) of the read. + + Returns + ------- + int + The read leftmost genomic position + """ return self.bam_read_ref_pos # Returns the BAM read length. def get_bam_read_length(self): """Returns the length of the read. In VaSeBuilder this length is calculated by means of the CIGAR string by - using the pysam method infer_read_length()""" + using the pysam method infer_read_length() + + Returns + ------- + int + The length of the read + """ return self.bam_read_length # Returns the BAM read ending position on the reference (calculated as starting position + the length of the read). @@ -54,7 +105,13 @@ def get_bam_read_ref_end(self): # Returns the BAM read sequence. def get_bam_read_sequence(self): - """Returns the read sequence as a String.""" + """Returns the read sequence as a String. + + Returns + ------- + str + The sequence of the read + """ return self.bam_read_seq # Returns the BAM read quality scores. @@ -118,7 +175,13 @@ def get_as_fastq_seq(self, addpairnum=False): """Returns the read as a FastQ entry. If the 'addpairnum' argument is set to True the read pair number will be added at the end of the read - identifier as '/1' or '/2', depending on whether the read is the forward/R1 or reverse/R2 read.""" + identifier as '/1' or '/2', depending on whether the read is the forward/R1 or reverse/R2 read. + + Returns + ------- + str + The + """ if addpairnum: return ("@" + str(self.bam_read_id) + "/" + str(self.bam_read_pairnum) + "\n" + str(self.bam_read_seq) + "\n" diff --git a/VcfBamScanner.py b/VcfBamScanner.py index 85bff65..8dc1b55 100644 --- a/VcfBamScanner.py +++ b/VcfBamScanner.py @@ -188,7 +188,8 @@ def get_bam_sample_name(self, bamfileloc): The sample name is extracted by first checking whether the BAM file contains a sample name via the VcfBamScanner method bam_has_sample_name(). If so, the sample name is extracted from the BAM file by returning - the value of the 'SM' field from the header line starting with '@RG'.""" + the value of the 'SM' field from the header line starting with '@RG'. None is returned if the BAM file has no + sample name.""" try: bamfile = pysam.AlignmentFile(bamfileloc, "rb") if self.bam_has_sample_name(bamfile): diff --git a/vase.py b/vase.py index 753aa6a..99789f7 100644 --- a/vase.py +++ b/vase.py @@ -18,6 +18,11 @@ class VaSe: # Performs the check that VaSe is run with Python 3.x def __init__(self): + """Checks both the python and pysam version. + + If the python version is not at least 3.6 or higher the program will not run as f-strings, that are used in this + program were not available in older versions. The program also requires pysam 0.15 or higher as some pysma + functions used in VaseBuilder are only available since pysam 0.15.""" assert (sys.version_info[0] >= 3 and sys.version_info[1] >= 6), "Please run this program in Python 3.6 or " \ "higher" assert (int(pysam.version.__version__.split(".")[0]) >= 0 and int(pysam.version.__version__.split(".")[1]) >= @@ -74,6 +79,7 @@ def main(self): # Method that creates the logger that will write the log to stdout and a log file. def start_logger(self, paramcheck, logloc, debug_mode=False): + """Starts and returns the logger VaSe_Logger""" vaselogger = logging.getLogger("VaSe_Logger") if debug_mode: vaselogger.setLevel(logging.DEBUG) @@ -106,6 +112,7 @@ def start_logger(self, paramcheck, logloc, debug_mode=False): # Returns the vase Parameters. def get_vase_parameters(self): + """Creates a command line argument parser and returns the parameter values.""" # Set the VaSe parameters for the program. vase_argpars = argparse.ArgumentParser() vase_argpars.add_argument("-m", "--runmode", dest="runmode", default="F", choices=self.valid_runmodes, @@ -144,6 +151,21 @@ def get_vase_parameters(self): # Reads the variant list. Assumes that sampleId, chromosome and startpos are columns 1,2 and 3 def read_variant_list(self, variantlistloc): + """Reads a file containing genomic variants and returns them in a dictionary. + + The file containing the variant is expected to have at least three columns separated by tabs. These should be, + in order: sample name, chromosome name, chromosomal position. + + Parameters + ---------- + variantlistloc : str + The location of the file containing variants + + Returns + ------- + dict + Read variants per sample name + """ variant_filter_list = {} try: with open(variantlistloc) as variantlistfile: @@ -160,6 +182,11 @@ def read_variant_list(self, variantlistloc): # Reads the config file with the settings def read_config_file(self, configfileloc): + """Reads a VaSeBuilder configuration file and returns the parameter values. + + :param configfileloc: + :return: + """ debug_param_vals = ["True", "1", "T"] configdata = {} try: @@ -198,6 +225,9 @@ def read_donor_fastq_list_file(self, donorfq_listfileloc): return donor_fastqs def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): + """Selects and runs the selected runmode. + + """ vbscan = VcfBamScanner() varconfile = None # Declare the varconfile variable so we can use it in C and no-C. From 577a7382cc9d40324470788d749b671535c50bed Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 6 Sep 2019 13:48:27 +0200 Subject: [PATCH 18/90] More docstrings added --- DonorBamRead.py | 86 ++++++++++--- OverlapContext.py | 280 +++++++++++++++++++++++++++++++++++++----- VaSeBuilder.py | 11 ++ VariantContext.py | 42 +++++++ VariantContextFile.py | 258 +++++++++++++++++++++++++++++++++++++- VcfBamScanner.py | 179 ++++++++++++++++++++++++--- 6 files changed, 792 insertions(+), 64 deletions(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index c481957..3f75eb2 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -45,7 +45,7 @@ def get_bam_read_id(self): Returns ------- - str + bam_read_id : str The read identifier """ return self.bam_read_id @@ -57,7 +57,7 @@ def get_bam_read_pair_number(self): Returns ------- - str + bam_read_pairnum : str The read pair number """ return self.bam_read_pairnum @@ -68,7 +68,7 @@ def get_bam_read_chrom(self): Returns ------- - str + bam_read_chrom : str The chromosome name where the read is mapped """ return self.bam_read_chrom @@ -79,7 +79,7 @@ def get_bam_read_ref_pos(self): Returns ------- - int + bam_read_ref_pos : int The read leftmost genomic position """ return self.bam_read_ref_pos @@ -91,14 +91,20 @@ def get_bam_read_length(self): Returns ------- - int + bam_read_length : int The length of the read """ return self.bam_read_length # Returns the BAM read ending position on the reference (calculated as starting position + the length of the read). def get_bam_read_ref_end(self): - """Returns the """ + """Calculates and returns the rightmost genomic position of the read via leftmost position + read length. + + Returns + ------- + bam_read_length : int + The rightmost position of the read + """ if self.bam_read_length is not None: return self.bam_read_ref_pos + self.bam_read_length return -1 @@ -109,20 +115,32 @@ def get_bam_read_sequence(self): Returns ------- - str + bam_read_seq : str The sequence of the read """ return self.bam_read_seq # Returns the BAM read quality scores. def get_bam_read_qual(self): - """Returns the """ + """Returns the read quality. + + Returns + ------- + bam_read_qual : str + Read quality + """ return self.bam_read_qual # Returns the BAM read quality as an array of Q-Scores. def get_bam_read_q_scores(self): """Converts the String of ASCII quality scores to a list Q-Scores. The Q-Score for each ASCII quality character - are calculated by obtaining the unicode code point and subtracting 33.""" + are calculated by obtaining the unicode code point and subtracting 33. + + Returns + ------- + qscores : list + Q-Score of the read + """ qscores = [] for qualSymbol in self.bam_read_qual: qscores.append(ord(qualSymbol)-33) @@ -130,37 +148,73 @@ def get_bam_read_q_scores(self): # Returns the maping quality of the BAM read. def get_mapping_qual(self): - """Returns the mapping quality that was assigned to the read.""" + """Returns the mapping quality that was assigned to the read. + + Returns + ------- + bam_read_map_qual : int + The mapping quality (MAPQ) + """ return self.bam_read_map_qual # ===METHOD TO GET STATISTICS DATA FROM THE DONORBAMREAD=================== # Returns the average Q-Score. def get_average_qscore(self): - """Calculates and returns the mean Q-Score of the read.""" + """Calculates and returns the mean Q-Score of the read. + + Returns + ------- + float + The mean Q-Score + """ qscores = self.get_bam_read_q_scores() return statistics.mean(qscores) # Returns the median Q-Score. def get_median_qscore(self): - """Calculates and returns the median Q-Score of the read.""" + """Calculates and returns the median Q-Score of the read. + + Returns + ------- + int + The median Q-Score of the read + """ qscores = self.get_bam_read_q_scores() return statistics.median(qscores) # ===METHODS TO CHECK WHETHER THE BAM READ IS R1 OR R2===================== # Returns if the BAM read is the first (forward) read. def is_read1(self): - """Returns whether the read is the forward/R1 read by checking if the pair number is set to '1'.""" + """Returns whether the read is the forward/R1 read by checking if the pair number is set to '1'. + + Returns + ------- + bool + True if the read has pair number 1, otherwise False + """ return self.bam_read_pairnum == "1" # Returns if the BAM read is the second (reverse) read. def is_read2(self): - """Returns whether the read is the reverse/R2 read by checking if the pair number is set to '2'.""" + """Returns whether the read is the reverse/R2 read by checking if the pair number is set to '2'. + + Returns + ------- + bool + True if the read has pair number 2, otherwise False + """ return self.bam_read_pairnum == "2" # ===METHODS TO RETURN A STRING REPRESENTATION OF THE DONORBAMREAD OBJECT== # Returns a String representation. def to_string(self): - """Returns a String with all the saved data, separated by tabs, of the read.""" + """Returns a String with all the saved data, separated by tabs, of the read. + + Returns + ------- + str + String representation of the read. + """ return (str(self.bam_read_id) + "\t" + str(self.bam_read_pairnum) + "\t" + str(self.bam_read_chrom) + "\t" @@ -180,7 +234,7 @@ def get_as_fastq_seq(self, addpairnum=False): Returns ------- str - The + The read as FastQ entry """ if addpairnum: return ("@" + str(self.bam_read_id) + "/" + str(self.bam_read_pairnum) + "\n" diff --git a/OverlapContext.py b/OverlapContext.py index 18583b0..eca06ae 100644 --- a/OverlapContext.py +++ b/OverlapContext.py @@ -5,7 +5,27 @@ class OverlapContext: """The OverlapContext saves an acceptor or donor context. The OverlapContext can be used to save an acceptor or donor context with reads that overlap with the variant - directly.""" + directly. + + Attributes + ---------- + context_id : str + Identifier of the context + sample_id : str + Sample name/identifier the context was derived from + context_chrom : str + The chromosome name that the context is located on + context_origin : int + The variant genomic position that constructed the context + context_start : int + The leftmost genomic position of the context + context_end : int + The rightmost genomic position of the context + context_bam_reads : list of str + List of aligned reads + unmapped_read_mate_ids : list of str + List of read identifiers that have unmapped mates + """ # Saves the data associated with the context overlapping with the variant. def __init__(self, variantid, sampleid, ovconchrom, ovconorigin, ovconstart, ovconend, bamreads): @@ -24,37 +44,79 @@ def get_context(self): """Returns the essential data of the context. This data consists of the chromosome name the context is located on, the variant position the context is based - on and the leftmost (start) and rightmost (end) genomic position of the context.""" + on and the leftmost (start) and rightmost (end) genomic position of the context. + + Returns + ------- + list + List of essential context data + """ return [self.context_chrom, self.context_origin, self.context_start, self.context_end] # Returns the context identifier. def get_context_id(self): - """Returns the context identifier.""" + """Returns the context identifier. + + Returns + ------- + self.context_id : str + The context identifier + """ return self.context_id # Returns the sample identifier the context was based on. def get_sample_id(self): - """Returns the identifier of the sample the context is """ + """Returns the identifier of the sample the context is + + Returns + ------- + self.sample_id : str + The sample name/identifier + """ return self.sample_id # Returns the context chromosome name the context is located on. def get_context_chrom(self): - """Returns the chromosome name of the context.""" + """Returns the chromosome name that the context is located on. + + Returns + ------- + self.context_chrom : str + The context chromosome name + """ return self.context_chrom # Returns the origin position of the context. def get_context_origin(self): - """Returns the the position of the variant that was used to construct the context.""" + """Returns the the position of the variant that was used to construct the context. + + Returns + ------- + self.context_origin : int + Variant position the context is based on + """ return self.context_origin # Returns the context start position. def get_context_start(self): - """Returns the left most genomic position of the context.""" + """Returns the left most genomic position of the context. + + Returns + ------- + self.context_start : int + Context starting position + """ return self.context_start # Returns the end position of the context. def get_context_end(self): - """Returns the right most genomic position of the context.""" + """Returns the right most genomic position of the context. + + Returns + ------- + self.context_end : int + Context ending position + """ return self.context_end # Returns the bam reads associated with the context as a list of BamRead objects. @@ -67,50 +129,98 @@ def get_context_bam_reads(self): # Returns the list of BAM read ids that have an unmapped mate. def get_unmapped_read_mate_ids(self): - """Returns a list with identifiers of reads that have an unmapped mate.""" + """Returns a list with identifiers of reads that have an unmapped mate. + + Returns + ------- + self.unmapped_read_mate_ids : list of str + List with read identifiers that have unmapped mates + """ return self.unmapped_read_mate_ids # ===METHODS TO GET DATA (REQUIRING SOME CALCULATION) FROM THE OVERLAP CONTEXT============= # Returns the length of the context. def get_context_length(self): - """Subtracts the left most genomic position of the context from the right most and returns the difference.""" + """Subtracts the left most genomic position of the context from the right most and returns the difference. + + Returns + ------- + int + Context length + """ return abs(self.context_end - self.context_start) # Returns the distance of the context start from the context origin. def get_start_distance_from_origin(self): - """Subtracts the left most position of the context from the variant position and returns the difference.""" + """Subtracts the left most position of the context from the variant position and returns the difference. + + Returns + ------- + int + Context start distance from variant position + """ return abs(self.context_origin - self.context_start) # Returns the distance of the context end from the context origin. def get_end_distance_from_origin(self): - """Subtracts the variant position from the rightmost position of the context and returns the difference.""" + """Subtracts the variant position from the rightmost position of the context and returns the difference. + + Returns + ------- + int + Context end distance form variant position + """ return abs(self.context_end - self.context_origin) # ===METHODS TO OBTAIN CONTEXT READ INFORMATION============================ # Returns the number of saved context reads. def get_number_of_context_reads(self): - """Returns the number of reads associated with the context.""" + """Returns the number of reads associated with the context. + + Returns + ------- + int + Number of context reads + """ if self.context_bam_reads is None: return 0 return len(self.context_bam_reads) # Returns a list of BAM read identifiers in the current context. def get_context_bam_read_ids(self): - """Returns a list of the identifiers of reads associated with the context.""" + """Returns a list of the identifiers of reads associated with the context. + + Returns + ------- + list of str + Identifiers of all reads + """ if self.context_bam_reads is None: return [None] return list(set([x.get_bam_read_id() for x in self.context_bam_reads])) # Returns a list of all left positions for all BAM reads. def get_context_bam_read_starts(self): - """Returns a list of all let most position of all reads associated with the context.""" + """Returns a list of all let most position of all reads associated with the context. + + Returns + ------- + list of int or None + Leftmost positions of all reads + """ if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_pos() for x in self.context_bam_reads] # Returns a list of all left positions for all R1 BAM reads. def get_context_bam_read_left_positions(self): - """Returns a list with the left most positions for all forward/R1 reads associated with the context.""" + """Returns a list with the left most positions for all forward/R1 reads associated with the context. + + Returns + ------- + list of int or None + Leftmost genomic positions of all R1 reads + """ if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_pos() @@ -118,14 +228,26 @@ def get_context_bam_read_left_positions(self): # Returns a list of BAM read ending positions for all BAM reads. def get_context_bam_read_ends(self): - """Returns a list with the rightmost genomic positions of all reads associated with the context.""" + """Returns a list with the rightmost genomic positions of all reads associated with the context. + + Returns + ------- + list of int or None + Rightmost genomic positions of all reads + """ if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_end() for x in self.context_bam_reads] # Returns a list of all right positions for all R2 BAM reads. def get_context_bam_read_right_positions(self): - """Returns a list with the rightmost genomic positions for all reverse/R2 reads associated with the context.""" + """Returns a list with the rightmost genomic positions for all reverse/R2 reads associated with the context. + + Returns + ------- + list of int + Rightmost genomic positions of all reverse/R2 reads + """ if self.context_bam_reads is None: return [None] return [x.get_bam_read_ref_end() @@ -133,32 +255,71 @@ def get_context_bam_read_right_positions(self): # Returns a list of all lengths for all BAM reads. def get_context_bam_read_lengths(self): - """Returns a list with the lengths of all reads associated with the context.""" + """Returns a list with the lengths of all reads associated with the context. + + Returns + ------- + list of int + Lengths of all reads + """ return [x.get_bam_read_length() for x in self.context_bam_reads] # Returns a list of BAM read sequences in the current context. def get_context_bam_read_seqs(self): - """Returns a list with the sequences of all reads associated with the context.""" + """Returns a list with the sequences of all reads associated with the context. + + Returns + ------- + list of str + Sequences of all reads + """ return [x.get_bam_read_sequence() for x in self.context_bam_reads] # Returns a list of qualities of all BAM reads. def get_context_bam_read_qualities(self): + """Returns a list with the qualities lines of all reads associated with the context. + + Return + ------ + list of str + Quality line for all reads + """ return [x.get_bam_read_qual() for x in self.context_bam_reads] # Returns a list of Q-scores of all BAM reads. def get_context_bam_read_q_scores(self): """Returns a list with the Q-Scores of all reads associated with the context. - The Q-Scores for each read is an array of integers. The returned list is therefore a list with lists.""" + The Q-Scores for each read is an array of integers. The returned list is therefore a list with lists. + + + Returns + ------- + list + Lists of Q-Scores for all reads + """ return [x.get_bam_read_q_scores() for x in self.context_bam_reads] # Returns a list of all BAM read MapQ values. def get_context_bam_read_map_qs(self): + """Collects and returns the list of MAPQ values of all reads. + + Returns + ------- + list of int + List with MAPQ values + """ return [x.get_mapping_qual() for x in self.context_bam_reads] # Returns whether a BAM read is in the context based on the provided read identifier. def read_is_in_context(self, readid): - """Checks whether a read specified by an id is associated with the context.""" + """Checks whether a read specified by an id is associated with the context. + + Returns + ------- + bool + True if the read in the context, False if not + """ if self.context_bam_reads is None: return False return readid in self.get_context_bam_read_ids() @@ -166,23 +327,47 @@ def read_is_in_context(self, readid): # ===METHODS TO ADD/SET CONTEXT DATA======================================= # Adds the read id of a BAM read with an unmapped mate. def add_unmapped_mate_id(self, ureadid): - """Adds the identifier of a read with an unmapped mate to the context unmapped mates list.""" + """Adds the identifier of a read with an unmapped mate to the context unmapped mates list. + + Parameters + ------- + ureadid : str + Identifier of the read with an unmapped mate + """ self.unmapped_read_mate_ids.append(ureadid) # Sets the list of unmapped read mate ids. def set_unmapped_mate_ids(self, mateids): - """Sets the provided read id list as the unmapped mate id list associated with the context.""" + """Sets the provided read id list as the unmapped mate id list associated with the context. + + Parameters + ---------- + mateids : list of str + Read identifiers with unmapped mates + """ self.unmapped_read_mate_ids = mateids # Returns whether a BAM read in the context has an unmapped mate. def read_has_unmapped_mate(self, readid): - """Checks if a read specified by a read identifier has an unmapped mate and returns True or False.""" + """Checks if a read specified by a read identifier has an unmapped mate and returns True or False. + + Returns + ------- + bool + True if read has unmapped mate, False if not + """ return readid in self.unmapped_read_mate_ids # ===STATISTICS METHODS FOR A VARIANT CONTEXT============================== # Returns the average and median read length. def get_average_and_median_read_length(self): - """Calculates and return the mean and median red length of all reads associated with the context.""" + """Calculates and return the mean and median red length of all reads associated with the context. + + Returns + ------- + list of int + Mean and median read length + """ if self.context_bam_reads is None: return [None, None] avgmedlen = [] @@ -193,7 +378,13 @@ def get_average_and_median_read_length(self): # Returns the average and median read quality. def get_average_and_median_read_qual(self): - """Calculates and returns the mean and median Q-Score of all reads associated with the context.""" + """Calculates and returns the mean and median Q-Score of all reads associated with the context. + + Returns + ------- + list of int + Mean and median read Q-Score + """ if self.context_bam_reads is None: return [None, None] avgmedqual = [] @@ -203,7 +394,13 @@ def get_average_and_median_read_qual(self): # Returns the average and median read MapQ of this variant context. def get_average_and_median_read_map_q(self): - """Calculates the mean and median MAPQ value of all reads associated witht the context.""" + """Calculates the mean and median MAPQ value of all reads associated witht the context. + + Returns + ------- + list of int + Mean and median read MAPQ + """ if self.context_bam_reads is None: return [None, None] avgmedmapq = [] @@ -214,7 +411,13 @@ def get_average_and_median_read_map_q(self): # ===SOME OTHER METHODS==================================================== # Returns a string representation of the overlap context. def to_string(self): - """Returns a String representation of the context.""" + """Assembles and returns a String representation of the context. + + Returns + ------- + str + String representation of the context + """ if self.context_bam_reads is None: bamids = None countbamreads = 0 @@ -232,7 +435,13 @@ def to_string(self): # Returns a statistics string representation of the overlap context. def to_statistics_string(self): - """Calculates some basic statistics of the context in a tab separated String.""" + """Calculates some basic statistics of the context in a tab separated String. + + Returns + ------- + str + Line with statistics to write to file + """ avgmedlens = self.get_average_and_median_read_length() avgmedquals = self.get_average_and_median_read_qual() avgmedmapq = self.get_average_and_median_read_map_q() @@ -246,7 +455,16 @@ def compare(self, other_overlap_context): The context is compared to another context on each aspect. If they differ, the difference is saved in a dictionary. The key is a numeric value, the difference an array. The first entry is the value of the current - context, the second entry is the value of the other context.""" + context, the second entry is the value of the other context. + + Parameters + ---------- + other_overlap_context + + Returns + ------- + differences : dict + """ differences = {} if self.context_id != other_overlap_context.get_context_id(): differences[1] = [self.context_id, diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 8d2bed8..e0f4399 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -17,6 +17,17 @@ class VaSeBuilder: + """Creates the variant contexts, builds validation sets and writes the output files. + + Attributes + ---------- + vb_scanner + creation_id : str + Unique (uuid) identifier to identify VaSeBuilder runs + creation_time : str + The date and time of creation to identify VaSeBuilder runs + vaselogger + """ # Constructor that saves the identifier, date, and time of the current run. def __init__(self, vaseid): self.vaselogger = logging.getLogger("VaSe_Logger") diff --git a/VariantContext.py b/VariantContext.py index 1c1a3e0..11c5f94 100644 --- a/VariantContext.py +++ b/VariantContext.py @@ -3,6 +3,12 @@ class VariantContext: + """Saves a variant context and allows data to be obtained and calculated. + + Attributes + ---------- + """ + # Sets the variant context data. def __init__(self, varconid, sampleid, varconchrom, varconorigin, @@ -29,22 +35,51 @@ def __init__(self, varconid, sampleid, # ===METHODS TO OBTAIN DATA FROM THE VARIANT CONTEXT DATA================== # Returns the variant context identifier. def get_variant_context_id(self): + """Returns the variant context identifier. + + Returns + ------- + self.context_id : str + Variant context identifier + """ return self.context_id # Returns the variant context sample. def get_variant_context_sample(self): + """Returns the identifier of the sample + + Returns + ------- + self.sample_id : str + Sample identifier + """ return self.sample_id # Returns the variant context chromosome. def get_variant_context_chrom(self): + """""" return self.variant_context_chrom # Returns the variant context origin. def get_variant_context_origin(self): + """Returns the variant position that + + Returns + ------- + self.variant_context_origin : int + Variant genomic position + """ return self.variant_context_origin # Returns the variant context start position. def get_variant_context_start(self): + """Returns the leftmost genomic position of the variant context. + + Returns + ------- + self.variant_context_start + Variant context starting position + """ return self.variant_context_start # Returns the variant context end position. @@ -475,6 +510,13 @@ def to_string(self): # Returns a varconstats.txt string representation of the variant context. def to_statistics_string(self): + """Returns a String with basic variant context statistics. + + Returns + ------- + str + Basic variant context statistics + """ areadlen = self.get_average_and_median_acceptor_read_length() dreadlen = self.get_average_and_median_donor_read_length() areadqual = self.get_average_and_median_acceptor_read_qual() diff --git a/VariantContextFile.py b/VariantContextFile.py index b95e848..915f5de 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -5,6 +5,21 @@ class VariantContextFile: + """The VariantContextFile saves variant contexts + + Attributes + ------- + vaselogger + The logger displaying information of activities + variant_context_file_location : str + THe location of a read variant context file + variant_contexts : dict + Saves variant contexts by context identifier + variant_context_statistics + varcon_fields : dict + Fields + """ + def __init__(self, fileloc=None, samplefilter=None, varconfilter=None, chromfilter=None): self.vaselogger = logging.getLogger("VaSe_Logger") @@ -32,48 +47,117 @@ def __init__(self, fileloc=None, samplefilter=None, # ===METHODS TO GET DATA FROM THE VARIANT CONTEXT FILE===================== def get_variant_contexts(self, asdict=False): + """Returns the variant contexts. + + By default all variant contexts are gathered in a list and returned. If the asdict parameter is set t True, the + variant contexts are returned as a dictionary. + + Returns + ------- + list or dict + Variant context as list by default, as dict when parameter is set to True + """ if asdict: return self.variant_contexts return [varcon for varcon in self.variant_contexts.values()] # Returns the variant contexts by sample identifier def get_variant_contexts_by_sampleid(self): + """Gathers variant contexts, sorts them by sample identifiers and returns them. + + Returns + ------- + dict + + """ varcons = self.get_variant_contexts() return {x.get_variant_context_sample(): [y for y in varcons if y.get_variant_context_sample() == x.get_variant_context_sample()] for x in varcons} # Returns the number of contexts saved def get_number_of_contexts(self): + """Determines and returns the number of variant contexts. + + Returns + ------- + int + Number of variant contexts + """ return len(self.variant_contexts) # Returns a list of context ids def get_variant_context_ids(self): + """Returns the variant context identifiers. + + Returns + ------- + list of str + Context identifiers of all variant contexts + """ return [x for x in self.variant_contexts.keys()] # Returns a specified variant context. def get_variant_context(self, contextid): + """Checks whether a variant context exists and returns it if so.""" if contextid in self.variant_contexts: return self.variant_contexts[contextid] return None # Returns whether a variant context is present def has_variant_context(self, contextid): + """Checks and returns whether a specified variant context has a variant context. + + Parameters + ---------- + contextid : str + Context identifier + """ return contextid in self.variant_contexts # Returns the acceptor context of the specified variant context. def get_acceptor_context(self, contextid): + """Fetches and returns a specified acceptor context. + + Parameters + ---------- + contextid : str + Context identifier of acceptor context to return + + Returns + ------- + OverlapContext or None + The acceptor context is exists, None if not + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_acceptor_context() return None # Returns the donor context of the specified variant context. def get_donor_context(self, contextid): + """Fetches and returns a specified donor context. + + Parameters + ---------- + contextid : str + Context identifier of donor context to return + Returns + ------- + OverlapContext or None + The donor context if exists, None if not + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_donor_context() return None # Returns all variant context acceptor reads. def get_all_variant_context_acceptor_reads(self): + """Gathers and returns the acceptor reads of all variant contexts + + Returns + ------- + list of DonorBamRead + Acceptor reads of all variant contexts + """ acceptorreads = [] for varcon in self.variant_contexts.values(): acceptorreads.extend(varcon.get_acceptor_reads()) @@ -228,6 +312,22 @@ def read_donor_context_file(self, donconfileloc, samplefilter=None, contextfilte # Returns whether something is in the filter or not. def passes_filter(self, valtocheck, filterlist): + """Tests and returns whether a provided value is in a provided filter. + + Filters are expected to be inclusion filters, denoting a list of values that should be used/included. + + Parameters + ---------- + valtocheck : str + Value to check against a filter list + filterlist : list of str + Values to filter with + + Returns + ------- + bool + True if value is in filter, False otherwise + """ if filterlist is not None: return valtocheck in filterlist return True @@ -235,24 +335,74 @@ def passes_filter(self, valtocheck, filterlist): # ===VARIANT CONTEXT SET OPERATIONS (UNION, INTERSECT, DIFFERENCE)========= # Returns the list of variant context identifiers from both variant context files. def get_variant_contexts_union(self, other_varcon_file): + """Returns all variant context identifiers in both VariantContextFiles. + + Parameters + ---------- + other_varcon_file : VariantContextFile + A VariantContextFile object with VariantContexts + + Returns + ------- + list of str + All variant context identifiers of both VariantContextFile objects + """ own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) | set(other_varcon_ids)) # Returns the list of variant context identifiers present in both variant context files. def get_variant_contexts_intersect(self, other_varcon_file): + """ + + Parameters + ---------- + other_varcon_file : VariantContextFile + VariantContextFile with VariantContext objects + + Returns + ------- + list of str + Shared variant context identifiers + """ own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) & set(other_varcon_ids)) # Returns the list of variant context identifiers in this file but not present in the other variant context file. def get_variant_contexts_difference(self, other_varcon_file): + """Determines and return the variant context identifiers not present in the other variant context file. + + Parameters + ---------- + other_varcon_file : VariantContextFile + VariantContextFile with variant contexts + + Returns + ------- + list of str + Variant context identifiers not present in the other variant context file + """ own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) - set(other_varcon_ids)) # Returns the list of variant context identifiers only present in one of the variant context file but not the other. def get_variant_contexts_symmetric_difference(self, other_varcon_file): + """Determines the variant context identifiers present in either one or the other. + + THe normal difference only returns the context identifiers in A but not in B. The symmetric difference returns + the context identifiers in A but not in B as well as context identifiers in B but not in A. + + Parameters + ---------- + other_varcon_file : VariantContextFile + + Returns + ------- + list of str + Variant context read identifiers + """ own_varcon_ids = self.get_variant_context_ids() other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) ^ set(other_varcon_ids)) @@ -261,6 +411,23 @@ def get_variant_contexts_symmetric_difference(self, other_varcon_file): # Returns a list/hashmap of VariantContextObjects. def get_variant_contexts2(self, aslist=False, varconfilter=None, samplefilter=None, chromfilter=None): + """ + + Parameters + ---------- + aslist : bool + Return variant context in a list + varconfilter : list of str + Contexts to include + samplefilter : list of str + Samples to include + chromfilter : list of str + + Returns + ------- + list or dict + Returns variant contexts as list if aslist set to True, dict otherwise + """ if aslist: return [ x for x in self.variant_contexts.values() @@ -284,6 +451,24 @@ def get_variant_contexts2(self, aslist=False, varconfilter=None, # Main method that returns whether a variant (SNP or indel). def variant_is_in_context(self, varianttype, searchchrom, searchstart, searchstop): + """ + + Parameters + ---------- + varianttype : str + Type of variant (snp/indel) + searchchrom : str + Chromosome name the variant is located on + searchstart : int + Leftmost genomic search window position + searchstop : int + Rightmost genomic search window position + + Returns + ------- + bool or None + + """ if varianttype == "snp": return self.snp_variant_is_in_context(searchchrom, searchstart) if varianttype == "indel": @@ -348,11 +533,32 @@ def add_variant_context(self, varconid, sampleid, # Adds an existing variant context to the variant context file def add_existing_variant_context(self, varconid, varconobj): + """Add an already created variant context to the variant context file. + + The variant context to add should be a valid VariantContext object. It will be added to the variant context + file under the context identifier. + + Parameters + ---------- + varconid : str + Identifier of the context + varconobj : VariantContext + The created variant context + """ if varconobj is not None: self.variant_contexts[varconid] = varconobj # Adds an acceptor context object to a variant context. def set_acceptor_context(self, varconid, acceptor_context): + """Adds an existing acceptor context to a variant context. + + Parameters + ---------- + varconid : str + Context identifier + acceptor_context : OverlapContext + The acceptor context to add + """ if varconid in self.variant_contexts: self.variant_contexts[varconid].set_acceptor_context(acceptor_context) @@ -361,6 +567,25 @@ def add_acceptor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, acceptorreads): + """Creates and adds an acceptor context to a variant context. + + Parameters + ---------- + contextid : str + Context identifier + sampleid: str + Sample name/identifier + contextchrom: str + Chromosome name the context is located on + contextorigin: int + Variant genomic position that created the context + contextstart: int + Leftmost genomic position of the context + contextend: int + Rightmost genomic position of the context + acceptorreads: list of DonorBamRead + List of reads associated with the context + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].add_acceptor_context( contextid, sampleid, @@ -370,6 +595,15 @@ def add_acceptor_context(self, contextid, sampleid, # Sets the donor context object of a variant context def set_donor_context(self, varconid, donor_context): + """Adds an already existing context + + Parameters + ---------- + varconid : str + Context identifier + donor_context : OverlapContext + Donor context to add + """ if varconid in self.variant_contexts: self.variant_contexts[varconid].set_donor_context(donor_context) @@ -668,6 +902,15 @@ def write_donor_left_right_positions(self, outfileloc): # Writes the identifiers of reads that have unmapped mates per # sample to a file. Samples are all donors and the ?template?. def write_reads_with_unmapped_mate(self, typetowrite, umfileloc): + """Writes variant context read identifiers with unmapped mates to a specified output file. + + Parameters + ---------- + typetowrite : str + Write variant context acceptor or donor read identifiers + umfileloc : str + Path to write the output file to + """ try: with open(umfileloc, "w") as umFile: umFile.write("#ContextId\tSampleId\tReadIds\n") @@ -691,8 +934,14 @@ def write_reads_with_unmapped_mate(self, typetowrite, umfileloc): # Writes the unmapped mate id of the acceptor context. def write_acceptor_unmapped_mates(self, umfileloc): - try: + """Writes the acceptor context read identifiers that have unmapped mates to an output file. + Parameters + ---------- + umfileloc : str + Path to write the output file to + """ + try: with open(umfileloc, "w") as umfile: umfile.write("#ContextId\tSampleId\tReadIds\n") for varcon in self.variant_contexts.values(): @@ -707,6 +956,13 @@ def write_acceptor_unmapped_mates(self, umfileloc): # Writes the unmapped mate id of the donor context. def write_donor_unmapped_mates(self, umfileloc): + """Writes the donor context read identifiers that have unmapped mates to an output file. + + Parameters + ---------- + umfileloc : str + Path to write the output file to + """ try: with open(umfileloc, "w") as umFile: umFile.write("#ContextId\tSampleId\tReadIds\n") diff --git a/VcfBamScanner.py b/VcfBamScanner.py index 8dc1b55..ac1e627 100644 --- a/VcfBamScanner.py +++ b/VcfBamScanner.py @@ -22,12 +22,24 @@ def __init__(self): # ===METHODS TO GET SAVED DATA FROM THE VCFBAMSCANNER====================== # Returns the map that links VCF files and samples. def get_vcf_sample_map(self): - """Returns a dictionary with VCF/BCF files saved per sample.""" + """Returns a dictionary with VCF/BCF files saved per sample. + + Returns + ------- + vcf_sample_map : dict + Paths to variant files per sample name + """ return self.vcf_sample_map # Returns the map that links BAM files and samples. def get_bam_sample_map(self): - """Returns a dictionary with BAM/CRAM files saved per sample.""" + """Returns a dictionary with BAM/CRAM files saved per sample. + + Returns + ------- + bam_sample_map : dict + Paths to alignment files per sample name + """ return self.bam_sample_map # Returns a map that links VCF files to BAM files. @@ -35,7 +47,13 @@ def get_vcf_to_bam_map(self): """Returns a dictionary with a variant file linked to an alignment file. For each sample, the associated variant file is linked to the associated.alignment file are linked via a - dictionary. The variant and alignment file are saved as key and value respectively.""" + dictionary. The variant and alignment file are saved as key and value respectively. + + Returns + ------- + vcf_to_bam_map : dict + Variant files linked to alignment files + """ vcf_to_bam_map = {} for sampleid in self.vcf_sample_map: if sampleid in self.bam_sample_map: @@ -50,7 +68,19 @@ def scan_vcf_folders(self, vcffolders): Iterates over a list of paths to folders containing VCF/BCF files. A non-existing folder is skipped. For each variant file in a valid folder, the sample name is extracted.and the file is saved under its sample name in a - dictionary.""" + dictionary. + + Parameters + ---------- + vcffolders : list + List of paths to folders contain alignment files + + Returns + ------- + vcf_sample_map : dict + String paths to variant files per sample name + + """ self.vaselogger.info("Start scannig VCF files") for vcffolder in vcffolders: @@ -91,7 +121,18 @@ def scan_bam_folders(self, bamfolders): Iterates over a list of paths to folders containing BAM/CRAM files. Non-existing folders are skipped. For each alignment file in a valid folder, the sample name is extracted.and the file is saved under its sample name in a - dictionary.""" + dictionary. + + Parameters + ---------- + bamfolders : list + List of String paths to folders containing alignment files + + Returns + ------- + bam_sample_map : dict + String paths to alignment files per sample name + """ self.vaselogger.info("Start scanning BAM files") # Scan BAM files in all provided folders. @@ -128,13 +169,35 @@ def scan_bam_folders(self, bamfolders): # Scans the provided BAM/CRAM files from a provided donor list file def scan_bamcram_files(self, bamcramlistfile): - """Reads a list file containing the locations of BAM/CRAM files and saves them per sample.""" + """Reads a list file containing the locations of BAM/CRAM files and saves them per sample. + + Parameters + ---------- + bamcramlistfile : str + String path to list file with paths to alignment files + + Returns + ------- + bam_sample_map : dict + Paths to alignment files per sample name + """ self.bam_sample_map = self.read_donorlistfile(bamcramlistfile) return self.bam_sample_map # Scan the provided VCF files from a provided donor list file def scan_vcf_files(self, vcflistfile): - """Reads a list file containing the locations of VCF/BCF files and saves them per sample name.""" + """Reads a list file containing the locations of VCF/BCF files and saves them per sample name. + + Parameters + ---------- + vcflistfile : str + String path to list file with paths to variant files + + Returns + ------- + vcf_sample_map : dict + Variant files per sample name + """ self.vcf_sample_map = self.read_donorlistfile(vcflistfile, "v") return self.vcf_sample_map @@ -160,7 +223,18 @@ def read_donorlistfile(self, listfileloc, listtype="a"): # Checks whether the BAM file contains a sample name. def bam_has_sample_name(self, bamfile): - """Checks and returns whether a BAM/CRAM file has a sample name/identifier.""" + """Checks and returns whether a BAM/CRAM file has a sample name/identifier. + + Parameters + ---------- + bamfile : pysam AlignmentFile + Already opened pysam AlignmentFile + + Returns + ------- + bool + True, if file has a sample name, otherwise False + """ if "RG" in bamfile.header: if len(bamfile.header["RG"]) > 0: if "SM" in bamfile.header["RG"][0]: @@ -169,14 +243,30 @@ def bam_has_sample_name(self, bamfile): # Checks whether the VCF file has a sample name def vcf_has_sample_name(self, vcffile): - """Checks and returns whether the VCF/BCF file has a sample name/identifier.""" + """Checks and returns whether the VCF/BCF file has a sample name/identifier. + + Returns + ------- + bool + True if the variant file has sample names, otherwise False + """ if vcffile.header.samples[0] != "": return True return False # Returns the first sample identifier of the VCF file (as for now we only work with one sample per file def get_vcf_sample_name(self, vcffileloc): - """Extracts and returns the list of sample names from a specified VCF file.""" + """Extracts and returns the list of sample names from a specified VCF file. + + Parameters + vcffileloc : str + Path to a VCF file + + Returns + ------- + list or None + VCF Sample name if present, otherwise None + """ vcffile = pysam.VariantFile(vcffileloc, "r") if self.vcf_has_sample_name(vcffile): return list(vcffile.header.samples) @@ -189,7 +279,18 @@ def get_bam_sample_name(self, bamfileloc): The sample name is extracted by first checking whether the BAM file contains a sample name via the VcfBamScanner method bam_has_sample_name(). If so, the sample name is extracted from the BAM file by returning the value of the 'SM' field from the header line starting with '@RG'. None is returned if the BAM file has no - sample name.""" + sample name. + + Parameters + ---------- + bamfileloc : str + Path to a BAM file + + Returns + ------- + str or None + Sample name if present, otherwise None + """ try: bamfile = pysam.AlignmentFile(bamfileloc, "rb") if self.bam_has_sample_name(bamfile): @@ -207,7 +308,19 @@ def get_cram_sample_name(self, cramfileloc): The sample name is extracted by first checking whether the CRAM file contains a sample name via the VcfBamScanner method cram_has_sample_name(). If so, the sample name is extracted from the CRAM file by returning the value of the 'SM' field from the header line starting with '@RG'. None is returned if the CRAM - file has no sample name.""" + file has no sample name. + + Parameters + ---------- + cramfileloc : str + Path to a CRAM file + + + Returns + ------- + str or None + Sample name if present, otherwise None + """ try: cramfile = pysam.AlignmentFile(cramfileloc, "rc") if self.bam_has_sample_name(cramfile): @@ -235,14 +348,26 @@ def get_donor_file_type(self, donorfileloc): """Determines and returns the file type of a specified donor file. To determine the file type, the linux command 'file' is used which returns a String containing the file type - info. The received String is split on spaces and the resulting list is returned.""" + info. The received String is split on spaces and the resulting list is returned. + + Returns + ------- + list of str + List with file type data + """ filetype_proc = subprocess.Popen(["file", "-b", "-z", donorfileloc], stdout=subprocess.PIPE) filetype_data, filetype_err = filetype_proc.communicate() return filetype_data.decode().split(" ") # Returns a list of sample identifiers that have both a VCF and a BAM/CRAM def get_complete_sample_ids(self): - """Determines which samples have a variant and alignment file and the list of identifiers.""" + """Determines which samples have a variant and alignment file and returns the list of identifiers. + + Returns + ------- + set + Set of sample names with a variant and alignment file + """ return set(self.vcf_sample_map.keys()) & set(self.bam_sample_map.keys()) # Extracts the sequence names from an already opened BAM/CRAM file @@ -250,7 +375,18 @@ def get_alignment_sequence_names(self, alignment_file): """Extracts and returns the chromosome names from an alignment file. Chromosome names are extracted from a specified BAM or CRAM file by checking the headers for the 'SQ' tag. If - present the SQ entry is checked for the 'SN' field which contains the specific chromosome names.""" + present the SQ entry is checked for the 'SN' field which contains the specific chromosome names. + + Parameters + ---------- + alignment_file : pysam.AlignmentFile + Already opened pysam AlignmentFile + + Returns + ------- + sequence_names : list + List with extracted chromosome names + """ sequence_names = set() if "SQ" in alignment_file.header: for sn_entry in alignment_file.header["SQ"]: @@ -262,7 +398,18 @@ def get_variant_sequence_names(self, variant_file): """Extracts and returns the chromosome names from a variant file. Chromosome names are extracted from a specified VCF or BCF file by obtaining the header contig names via the - pysam method .header.contigs().""" + pysam method .header.contigs(). + + Parameters + ---------- + variant_file : pysam.VariantFile + Already opened pysam VariantFile + + Returns + ------- + sequence_names : list of str + List with extracted chromosome names + """ sequence_names = set() if len(variant_file.header.contigs) > 0: for seqname in list(variant_file.header.contigs): From 30f2f640cc1c206940e343a586bb5da52c2fe18c Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 10 Sep 2019 14:41:01 +0200 Subject: [PATCH 19/90] Add much more numpy style docstrings --- DonorBamRead.py | 2 - OverlapContext.py | 35 +- ParamChecker.py | 356 ++++++++++++++++++-- ReadIdObject.py | 62 +++- VariantContext.py | 760 +++++++++++++++++++++++++++++++++++++----- VariantContextFile.py | 93 +++++- VcfBamScanner.py | 66 ++-- VcfVariant.py | 133 +++++++- 8 files changed, 1303 insertions(+), 204 deletions(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index 3f75eb2..9065644 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -109,7 +109,6 @@ def get_bam_read_ref_end(self): return self.bam_read_ref_pos + self.bam_read_length return -1 - # Returns the BAM read sequence. def get_bam_read_sequence(self): """Returns the read sequence as a String. @@ -120,7 +119,6 @@ def get_bam_read_sequence(self): """ return self.bam_read_seq - # Returns the BAM read quality scores. def get_bam_read_qual(self): """Returns the read quality. diff --git a/OverlapContext.py b/OverlapContext.py index eca06ae..284ce37 100644 --- a/OverlapContext.py +++ b/OverlapContext.py @@ -39,7 +39,6 @@ def __init__(self, variantid, sampleid, ovconchrom, ovconorigin, self.unmapped_read_mate_ids = [] # ===METHODS TO GET DATA OF THE OVERLAP CONTEXT============================ - # Returns the context data (chrom, pos, start, end) in an array def get_context(self): """Returns the essential data of the context. @@ -53,7 +52,6 @@ def get_context(self): """ return [self.context_chrom, self.context_origin, self.context_start, self.context_end] - # Returns the context identifier. def get_context_id(self): """Returns the context identifier. @@ -64,7 +62,6 @@ def get_context_id(self): """ return self.context_id - # Returns the sample identifier the context was based on. def get_sample_id(self): """Returns the identifier of the sample the context is @@ -75,7 +72,6 @@ def get_sample_id(self): """ return self.sample_id - # Returns the context chromosome name the context is located on. def get_context_chrom(self): """Returns the chromosome name that the context is located on. @@ -86,7 +82,6 @@ def get_context_chrom(self): """ return self.context_chrom - # Returns the origin position of the context. def get_context_origin(self): """Returns the the position of the variant that was used to construct the context. @@ -139,7 +134,6 @@ def get_unmapped_read_mate_ids(self): return self.unmapped_read_mate_ids # ===METHODS TO GET DATA (REQUIRING SOME CALCULATION) FROM THE OVERLAP CONTEXT============= - # Returns the length of the context. def get_context_length(self): """Subtracts the left most genomic position of the context from the right most and returns the difference. @@ -150,7 +144,6 @@ def get_context_length(self): """ return abs(self.context_end - self.context_start) - # Returns the distance of the context start from the context origin. def get_start_distance_from_origin(self): """Subtracts the left most position of the context from the variant position and returns the difference. @@ -161,7 +154,6 @@ def get_start_distance_from_origin(self): """ return abs(self.context_origin - self.context_start) - # Returns the distance of the context end from the context origin. def get_end_distance_from_origin(self): """Subtracts the variant position from the rightmost position of the context and returns the difference. @@ -173,7 +165,6 @@ def get_end_distance_from_origin(self): return abs(self.context_end - self.context_origin) # ===METHODS TO OBTAIN CONTEXT READ INFORMATION============================ - # Returns the number of saved context reads. def get_number_of_context_reads(self): """Returns the number of reads associated with the context. @@ -186,7 +177,6 @@ def get_number_of_context_reads(self): return 0 return len(self.context_bam_reads) - # Returns a list of BAM read identifiers in the current context. def get_context_bam_read_ids(self): """Returns a list of the identifiers of reads associated with the context. @@ -199,7 +189,6 @@ def get_context_bam_read_ids(self): return [None] return list(set([x.get_bam_read_id() for x in self.context_bam_reads])) - # Returns a list of all left positions for all BAM reads. def get_context_bam_read_starts(self): """Returns a list of all let most position of all reads associated with the context. @@ -212,7 +201,6 @@ def get_context_bam_read_starts(self): return [None] return [x.get_bam_read_ref_pos() for x in self.context_bam_reads] - # Returns a list of all left positions for all R1 BAM reads. def get_context_bam_read_left_positions(self): """Returns a list with the left most positions for all forward/R1 reads associated with the context. @@ -226,7 +214,6 @@ def get_context_bam_read_left_positions(self): return [x.get_bam_read_ref_pos() for x in self.context_bam_reads if (x.is_read1())] - # Returns a list of BAM read ending positions for all BAM reads. def get_context_bam_read_ends(self): """Returns a list with the rightmost genomic positions of all reads associated with the context. @@ -239,7 +226,6 @@ def get_context_bam_read_ends(self): return [None] return [x.get_bam_read_ref_end() for x in self.context_bam_reads] - # Returns a list of all right positions for all R2 BAM reads. def get_context_bam_read_right_positions(self): """Returns a list with the rightmost genomic positions for all reverse/R2 reads associated with the context. @@ -253,7 +239,6 @@ def get_context_bam_read_right_positions(self): return [x.get_bam_read_ref_end() for x in self.context_bam_reads if (x.is_read2())] - # Returns a list of all lengths for all BAM reads. def get_context_bam_read_lengths(self): """Returns a list with the lengths of all reads associated with the context. @@ -264,7 +249,6 @@ def get_context_bam_read_lengths(self): """ return [x.get_bam_read_length() for x in self.context_bam_reads] - # Returns a list of BAM read sequences in the current context. def get_context_bam_read_seqs(self): """Returns a list with the sequences of all reads associated with the context. @@ -275,7 +259,6 @@ def get_context_bam_read_seqs(self): """ return [x.get_bam_read_sequence() for x in self.context_bam_reads] - # Returns a list of qualities of all BAM reads. def get_context_bam_read_qualities(self): """Returns a list with the qualities lines of all reads associated with the context. @@ -286,7 +269,6 @@ def get_context_bam_read_qualities(self): """ return [x.get_bam_read_qual() for x in self.context_bam_reads] - # Returns a list of Q-scores of all BAM reads. def get_context_bam_read_q_scores(self): """Returns a list with the Q-Scores of all reads associated with the context. @@ -300,7 +282,6 @@ def get_context_bam_read_q_scores(self): """ return [x.get_bam_read_q_scores() for x in self.context_bam_reads] - # Returns a list of all BAM read MapQ values. def get_context_bam_read_map_qs(self): """Collects and returns the list of MAPQ values of all reads. @@ -311,7 +292,6 @@ def get_context_bam_read_map_qs(self): """ return [x.get_mapping_qual() for x in self.context_bam_reads] - # Returns whether a BAM read is in the context based on the provided read identifier. def read_is_in_context(self, readid): """Checks whether a read specified by an id is associated with the context. @@ -325,7 +305,6 @@ def read_is_in_context(self, readid): return readid in self.get_context_bam_read_ids() # ===METHODS TO ADD/SET CONTEXT DATA======================================= - # Adds the read id of a BAM read with an unmapped mate. def add_unmapped_mate_id(self, ureadid): """Adds the identifier of a read with an unmapped mate to the context unmapped mates list. @@ -336,7 +315,6 @@ def add_unmapped_mate_id(self, ureadid): """ self.unmapped_read_mate_ids.append(ureadid) - # Sets the list of unmapped read mate ids. def set_unmapped_mate_ids(self, mateids): """Sets the provided read id list as the unmapped mate id list associated with the context. @@ -347,7 +325,6 @@ def set_unmapped_mate_ids(self, mateids): """ self.unmapped_read_mate_ids = mateids - # Returns whether a BAM read in the context has an unmapped mate. def read_has_unmapped_mate(self, readid): """Checks if a read specified by a read identifier has an unmapped mate and returns True or False. @@ -359,7 +336,6 @@ def read_has_unmapped_mate(self, readid): return readid in self.unmapped_read_mate_ids # ===STATISTICS METHODS FOR A VARIANT CONTEXT============================== - # Returns the average and median read length. def get_average_and_median_read_length(self): """Calculates and return the mean and median red length of all reads associated with the context. @@ -376,7 +352,6 @@ def get_average_and_median_read_length(self): avgmedlen.append(contextread.get_bam_read_length()) return [statistics.mean(avgmedlen), statistics.median(avgmedlen)] - # Returns the average and median read quality. def get_average_and_median_read_qual(self): """Calculates and returns the mean and median Q-Score of all reads associated with the context. @@ -392,9 +367,8 @@ def get_average_and_median_read_qual(self): avgmedqual.append(contextread.get_average_qscore()) return [statistics.mean(avgmedqual), statistics.median(avgmedqual)] - # Returns the average and median read MapQ of this variant context. def get_average_and_median_read_map_q(self): - """Calculates the mean and median MAPQ value of all reads associated witht the context. + """Calculates the mean and median MAPQ value of all reads associated with the context. Returns ------- @@ -409,7 +383,6 @@ def get_average_and_median_read_map_q(self): return [statistics.mean(avgmedmapq), statistics.median(avgmedmapq)] # ===SOME OTHER METHODS==================================================== - # Returns a string representation of the overlap context. def to_string(self): """Assembles and returns a String representation of the context. @@ -433,7 +406,6 @@ def to_string(self): + str(countbamreads) + "\t" + str(bamids)) - # Returns a statistics string representation of the overlap context. def to_statistics_string(self): """Calculates some basic statistics of the context in a tab separated String. @@ -449,7 +421,6 @@ def to_statistics_string(self): f"{avgmedquals[0]}\t{avgmedquals[1]}\t{avgmedmapq[0]}\t" f"{avgmedmapq[1]}") - # Compares the current OverlapContext to another OverlapContext and returns the differences. def compare(self, other_overlap_context): """Compares the current context to a provided context and returns the differences. @@ -459,11 +430,13 @@ def compare(self, other_overlap_context): Parameters ---------- - other_overlap_context + other_overlap_context : OverlapContext + Acceptor/Donor context to compare this acceptor/donor context with Returns ------- differences : dict + Set of differences between the compared acceptor/donor contexts. """ differences = {} if self.context_id != other_overlap_context.get_context_id(): diff --git a/ParamChecker.py b/ParamChecker.py index d59ca71..f8b86a7 100644 --- a/ParamChecker.py +++ b/ParamChecker.py @@ -4,11 +4,45 @@ class ParamChecker: - # Constructor that creates two empty arrays that + """Checks and saves the values of provided parameters by the user. + + Attributes + ---------- + vaselogger + vcf_filelist : str + Path to the variant list file + bam_filelist : str + Path to the alignment list file + acceptorbam : str + Path to the alignment file to use as acceptor + fastq_in1 : list of str + Paths to R1 fastq files to us as template + fastq_in2 : list of str + Paths to R2 fastq files to use as template + outdir : str + Path to directory to write output files to + reference_file : str + Path to the genome reference in fasta format + fastq_out_location : str + Path to write the validation fastq files to + varcon_out_location : str + Output name for the variant context file + log_location : str + Output path to write the log file to + variantlist_location : str + runmode : str + The mode to run VaSeBuilder in + varconin : str + Path to a variant context input file + donorfqlist : str + required_mode_parameters : dict + Contains the required parameters for each runmode + """ + def __init__(self): self.vaselogger = logging.getLogger("VaSe_Logger") - self.vcf_filelist = [] - self.bam_filelist = [] + self.vcf_filelist = "" + self.bam_filelist = "" self.acceptorbam = "" self.fastq_in1 = "" self.fastq_in2 = "" @@ -36,8 +70,21 @@ def __init__(self): "X": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"]} self.optional_parameters = ["fastqout", "varcon", "variantlist"] - # Checks whether the required parameters for a run mode have been set. def required_parameters_set(self, runmode, vase_arg_vals): + """Checks and returns whether all required run mode parameters have been set. + + Parameters + ---------- + runmode : str + Runmode to check parameters for + vase_arg_vals : dict + Dictionary with parameters values + + Returns + ------- + bool + True if all parameters are correct, False if not + """ if runmode in self.required_mode_parameters: for reqparam in self.required_mode_parameters[runmode]: if vase_arg_vals[reqparam] is None: @@ -49,15 +96,37 @@ def required_parameters_set(self, runmode, vase_arg_vals): self.vaselogger.debug("Invalid run mode selected.") return False - # Checks whether the optional parameters have been set or not. If not, they will be set to None def optional_parameters_set(self, vase_arg_vals): + """Checks and sets optional parameters if they have not been set. + + Parameters + ---------- + vase_arg_vals: + Dictionary with parameter values + + Returns + ------- + vase_arg_vals : dict + Updated dictionary with parameter values + """ for optparam in self.optional_parameters: if optparam not in vase_arg_vals: vase_arg_vals[optparam] = None return vase_arg_vals - # Check the logging parameter to determine where to write the logfile to. def check_log(self, logparam): + """Checks the log parameter value and returns a proper output location to write the log file to. + + Parameters + ---------- + logparam : str + The parameter value for the log file + + Returns + ------- + self.log_location : str + Output path to write the log file to + """ logloc = "VaSeBuilder.log" if logparam is not None: @@ -73,8 +142,24 @@ def check_log(self, logparam): self.log_location = logloc return self.log_location - # Checks whether provided folders exist. def check_folders_exist(self, paramvals, file_exts): + """Checks and returns whether folder containing specified file types exist. + + For each provided folder, it checks whether the folder contains at least one file with the specified file type. + If so, the folder is added to a list. + + Parameters + ---------- + paramvals : list of str + Folders to check for containing specified file type + file_exts : str + Extension of file types to check for + + Returns + ------- + existing_folders : list of str + Folders containing files with the specified extension + """ existing_folders = [] # Check whether the provided folders exist. @@ -111,6 +196,18 @@ def check_folder_contents(self, folder_to_check, file_exts): # Checks whether a provided file exists. def check_file_exists(self, fileloc): + """Checks and returns whether one or more files exist. + + Parameters + ---------- + fileloc : str or list of str + Path to file or list of paths to files + + Returns + ------- + bool + True if file(s) exists. + """ if type(fileloc) == str: fileloc = [fileloc] for this_file in fileloc: @@ -120,14 +217,36 @@ def check_file_exists(self, fileloc): self.vaselogger.debug("Files all exist.") return True - # Return the directory name of an output location. def is_valid_output_location(self, outfilename): + """Checks and returns whether the output location is valid. + + Parameters + ---------- + outfilename : str + Output file path to check + + Returns + ------- + """ return os.path.isdir(os.path.dirname(outfilename)) - # Checks whether the values of the parameters are correct (do files/folders exist for example). - # [Function should perhaps be split into smaller functions] def check_parameters(self, vase_arg_vals): - + """Checks and returns whether the set parameters are ok. + + For parameters with paths to input files, it is checked whether the file exists. For the output directory it is + checked whether the folder exists. Optional parameters, if in the parameter value set, are set to a default + value if none is given. + + Parameters + ---------- + vase_arg_vals : dict + Parameter values + + Returns + ------- + bool + True if set parameters are correct, False if not + """ # Loop over the provided parameters. for param in vase_arg_vals: self.vaselogger.debug(f"Checking parameter {param}") @@ -225,6 +344,24 @@ def check_parameters(self, vase_arg_vals): # Only checks whether the required runmode parameters are ok using 'check_parameters()' def check_required_runmode_parameters(self, runmode, vaseargvals): + """Checks and returns whether all required run modes parameters are set and correct. + + The parameter value set is first subsetted to only include all required and optional parameters. For run mode + 'X', the 'templatefq1' parameter is not required and would therefore be filtered out by this method even if it + has been set. + + Parameters + ---------- + runmode : str + Selected run mode + vaseargvals : dict + Parameter values + + Returns + ------- + bool + True if all required parameters are ok, False if not + """ if runmode in self.required_mode_parameters: required_params = self.required_mode_parameters[runmode] @@ -237,12 +374,45 @@ def check_required_runmode_parameters(self, runmode, vaseargvals): # Returns the name of the folder name of a parameter value (if the parameter value is ). def get_folder_name(self, foldername): + """Returns the name of the folder for a given path. + + If the provided path is a folder the the provided value is returned. If the provided path is a file, the path to + the parent folder is returned. + + Parameters + ---------- + foldername : str + Path to a valid folder + + Returns + ------- + str + Foldername parameter if foldername is a folder, the path to the parent folder if foldername is a file + """ if os.path.isfile(foldername) or (not os.path.isdir(foldername)): return os.path.dirname(foldername) return foldername # Returns the name of an output file (is used for parameters fastqout, varcon, donorbread and acceptorbread). def get_output_name(self, outfilename, defaultoutname): + """Checks and returns the name of an output file. + + The output name is first checked whether it is a path. If so, only the last part (the filename) is returned. If + no filename or path has been provided, the provided default output name is returned. Only the name, and not + path, is returned as the output folder to write to is determined by the 'out' parameter. + + Parameters + ---------- + outfilename : str + The output file name + defaultoutname : str + The default output name to provide if non has been set/given + + Returns + ------- + str + Returns the output name + """ if outfilename is not None: if "/" in outfilename: return outfilename.split("/")[-1] @@ -251,91 +421,209 @@ def get_output_name(self, outfilename, defaultoutname): return outfilename return defaultoutname - # Returns the list of valid VCF folders. def get_valid_vcf_filelist(self): + """Returns the location of the text file containing a list of donor variant files to use. + + Returns + ------- + self.vcf_filelist : str + Path to the variant list file + """ return self.vcf_filelist - # Returns the list of valid BAM folders. def get_valid_bam_filelist(self): - """Returns the location of the text file containing a list of donor alignment files to use.""" + """Returns the location of the text file containing a list of donor alignment files to use. + + Returns + ------- + self.bamfile_list : str + Path to the alignment list file + """ return self.bam_filelist - # Returns the location of the acceptor BAM file location. def get_acceptor_bam(self): - """Returns the location of the BAM/CRAM file that will be used as the acceptor.""" + """Returns the location of the BAM/CRAM file that will be used as the acceptor. + + Returns + ------- + self.acceptorbam : str + Path to the alignment file to use as acceptor + """ return self.acceptorbam - # Returns the location and name of the first (R1) fastq input file location. def get_first_fastq_in_location(self): + """Returns the paths to R1 fastq files that serve as the template for the validation set. + + Returns + ------- + self.fastq_in1 : list of str + Paths to R1 template fastq files + """ return self.fastq_in1 - # Returns the location and name of the second (R2) fastq input file location. def get_second_fastq_in_location(self): + """Returns the paths to R2 fastq files that serve as the template for the validation set. + + Returns + ------- + self.fastq_in2 : list of str + Paths to R2 template fastq files + """ return self.fastq_in2 - # Returns the location(s) and names of the two (R1 and R2) fastq input files. def get_fastq_in_locations(self): + """ + + Returns + ------- + list + """ return [self.fastq_in1, self.fastq_in2] - # Returns the location to write the output to. def get_out_dir_location(self): - """Returns the directory to write the output files. Adds '/' if not present in the ouput directory location.""" + """Returns the directory to write the output files. Adds '/' if not present in the ouput directory location. + + Returns + ------- + str + Path to the output directory + """ if self.outdir.endswith("/"): return self.outdir return self.outdir + "/" - # Returns the location of the reference fasta file def get_reference_file_location(self): - """Returns the location of the genomic reference file.""" + """Returns the location of the genomic fasta reference file. + + Returns + ------- + self.reference_file : str + Path to the genomic reference fasta file + """ return self.reference_file # Returns the location of the FastQ file that will be produced by VaSeBuilder. def get_fastq_out_location(self): + """Returns the fastq out location and suffix. + + Before returning the path and output suffix, it is checked whether the path to the outdir location ends with a + '/'. If not this is added. The path and suffix is not a full output path. The full output path is constructed + when writing the new validation fastq files. + + Returns + ------- + str + Path and suffix for the fastq out files + """ if self.outdir.endswith("/"): return self.outdir + self.fastq_out_location return self.outdir + "/" + self.fastq_out_location # Returns the location of file that will contain the variants and their context start and stops. def get_variant_context_out_location(self): + """Constructs and returns the output path for the variant context file. + + Returns + ------- + str + Path to the variant context output location + """ if self.outdir.endswith("/"): return self.outdir + self.varcon_out_location return self.outdir + "/" + self.varcon_out_location - # Retuns the location to write the log file(s) to. def get_log_file_location(self): - """Returns the location the log file will be written to.""" + """Returns the location the log file will be written to. + + Returns + ------- + self.log_location : str + Path to write the log file to + """ return self.log_location - # Returns the variant list location def get_variant_list_location(self): + """Returns the path to the file containing variants to use. + + Returns + ------- + self.variantlist_location : str + Path to the variant list file + """ return self.variantlist_location - # Returns the location of the list file containing donor fastq files def get_donorfqlist(self): + """Returns the location of the list file containing donor fastq files. + + Returns + ------- + self.donorfqlist : str + Path to the donor fastq list file + """ return self.donorfqlist - # Returns the specified runmmode def get_runmode(self): - """Returns the value of the runmode parameter.""" + """Returns the value of the run mode parameter. + + Returns + ------- + self.runmode : str + The set runmode + """ return self.runmode - # Returns the location of the variant context in file def get_variantcontext_infile(self): - """Returns the saved value of the varconin CLI parameter.""" + """Returns path to the variant context input file. + + Returns + ------- + self.varconin : str + Path to the variant context input file + """ return self.varconin - # Returns the list of required parameters for a specified runmode def get_required_runmode_parameters(self, runmode): + """Returns the required parameters for a specified run mode. + + Parameters + ---------- + runmode : str + The specified run mode + + Returns + ------- + list of str + List with the names of the required parameters + """ if runmode.upper() in self.required_mode_parameters: return self.required_mode_parameters[runmode.upper()] return [] - # Returns the optional parameters def get_optional_parameters(self): + """Returns the optional parameters + + Returns + ------- + optional_parameters : list of str + The list of optional program parameters + """ return self.optional_parameters - # Returns the def filter_provided_parameters(self, runmode, vase_parameters): + """Filters and returns a parameter set with only the required and optional parameters + + Parameters + ---------- + runmode : str + The specified run mode + vase_parameters : str + Dictionary with parameter values (not used) + + Returns + ------- + filtered_parameter_set : dict + Parameter setf + """ filtered_parameter_set = {} runmode_reqparams = self.get_optional_parameters() if runmode in self.required_mode_parameters: diff --git a/ReadIdObject.py b/ReadIdObject.py index b935070..58f2742 100644 --- a/ReadIdObject.py +++ b/ReadIdObject.py @@ -1,36 +1,76 @@ class ReadIdObject: """The ReadIdObject is used to save the read identifiers. - This class is used when a VariantContextFile is read. DonorBamRead can't be used as most data is missing.""" + This class is used when a VariantContextFile is read. DonorBamRead can't be used as most data is missing. + + Attributes + ---------- + read_id : str + Identifier of the read + pair_number : str + Pair number of the read + """ def __init__(self, readid): """Constructor saves the provided read identifier and sets the pairnumber to a default empty value.""" self.read_id = readid self.pair_number = "" - # Returns the name/identifier of the read id object def get_bam_read_id(self): - """Returns the identifier of the read.""" + """Returns the identifier of the read. + + Returns + ------- + self.read_id : str + Identifier of the read + """ return self.read_id - # Sets the name/identifier of the read id object def set_bam_read_id(self, readid): - """Sets the identifier of a read.""" + """Sets the identifier of the read. Overwrites any already existing value. + + Parameters + ---------- + readid : str + The read identifier to set + """ self.read_id = readid - # Return the pair number of the read id object def get_bam_read_pair_number(self): - """Returns the read pair number of the read.""" + """Returns the read pair number of the read. + + Returns + ------- + self.pair_number : str + The read pair number + """ return self.pair_number - # Sets the pair number of the read id object def set_bam_read_pair_number(self, pairnum): - """Sets the read pair number of the read.""" + """Sets the read pair number of the read. Overwrites any already existing value. + + Parameters + ---------- + pairnum : str + The pair number to set + """ self.pair_number = pairnum def is_read1(self): - """Returns whether the read is the forward/R1 read.""" + """Returns whether the read is the forward/R1 read. + + Returns + ------- + bool + True if read pair number is 1., False if not + """ return self.pair_number == "1" def is_read2(self): - """Returns whether the read is the reverse/R2 read.""" + """Returns whether the read is the reverse/R2 read. + + Returns + ------- + bool + True if read pair number is 2, Fale if not + """ return self.pair_number == "2" diff --git a/VariantContext.py b/VariantContext.py index 11c5f94..29a87c3 100644 --- a/VariantContext.py +++ b/VariantContext.py @@ -7,6 +7,28 @@ class VariantContext: Attributes ---------- + context_id : str + Identifier of the context + sample_id : str + The name/identifier of the donor sample used to construct the variant context + variant_context_chrom : str + Chromosome name the context is lcoated on + variant_context_origin : int + Variant position the context is constructed from + variant_context_start : int + Leftmost genomic position of the variant context + variant_context_end : int + Rightmost genomic position of the variant context + variant_context_areads : list of DonorBamRead + Acceptor reads associated with the variant context + variant_context_dreads : list of DonorBamRead + Donor reads associated with the variant context + variant_acceptor_context : OverlapContext + Acceptor context used to construct the variant context + variant_donor_context : OverlapContext + Donor context used to construct the variant context + unmapped_donor_mate_ids : list of str + Identifiers of reads with an unmapped mate """ # Sets the variant context data. @@ -15,6 +37,33 @@ def __init__(self, varconid, sampleid, varconstart, varconend, acceptorreads, donorreads, acceptor_context=None, donor_context=None): + """Saves the provided variant context data. + + Acceptor and donor contexts are optional when creating the variant context. + + Parameters + ---------- + varconid : str + Variant context identifier + sampleid : str + Sample name/identifier + varconchrom : str + Chromosome name the variant context is located on + varconorigin: + Variant position the context is constructed from + varconstart : int + Leftmost genomic position of the variant context + varconend : int + Rightmost genomic position of the variant context + acceptorreads : list of DonorBamRead + Variant context acceptor reads + donorreads : list of DonorBamRead + Variant context donor reads + acceptor_context : OverlapContext + Acceptor context associated with the variant context + donor_context : OverlapContext + Donor context associated with the variant context + """ self.context_id = varconid self.sample_id = sampleid self.variant_context_chrom = varconchrom @@ -33,7 +82,6 @@ def __init__(self, varconid, sampleid, self.unmapped_donor_mate_ids = [] # ===METHODS TO OBTAIN DATA FROM THE VARIANT CONTEXT DATA================== - # Returns the variant context identifier. def get_variant_context_id(self): """Returns the variant context identifier. @@ -44,25 +92,28 @@ def get_variant_context_id(self): """ return self.context_id - # Returns the variant context sample. def get_variant_context_sample(self): - """Returns the identifier of the sample + """Returns the name/identifier of the sample Returns ------- self.sample_id : str - Sample identifier + Sample name/identifier """ return self.sample_id - # Returns the variant context chromosome. def get_variant_context_chrom(self): - """""" + """Returns the chromosome name the variant context is located on. + + Returns + ------- + self.variant_context_chrom : str + Chromosome name of the variant context + """ return self.variant_context_chrom - # Returns the variant context origin. def get_variant_context_origin(self): - """Returns the variant position that + """Returns the genomic position of the variant used to construct the variant context. Returns ------- @@ -71,7 +122,6 @@ def get_variant_context_origin(self): """ return self.variant_context_origin - # Returns the variant context start position. def get_variant_context_start(self): """Returns the leftmost genomic position of the variant context. @@ -82,16 +132,35 @@ def get_variant_context_start(self): """ return self.variant_context_start - # Returns the variant context end position. def get_variant_context_end(self): + """Returns the variant context end position. + + Returns + ------- + self.variant_context_end : int + Variant context rightmost genomic position + """ return self.variant_context_end - # Returns the variant context acceptor reads. def get_acceptor_reads(self): + """Returns the variant context acceptor reads. + + Returns + ------- + self.variant_context_areads : list of DonorBamRead + Variant context acceptor reads + """ return self.variant_context_areads # Returns the variant context donor reads. def get_donor_reads(self): + """Returns the variant context donor reads. + + Returns + ------- + self.variant_context_dreads : list of DonorBamRead + Variant context acceptor reads + """ return self.variant_context_dreads # TODO: New thing for -P mode. @@ -104,12 +173,25 @@ def get_donor_read_strings(self): dbr.get_bam_read_qual())) return list(set(donorreads)) - # Returns the acceptor context (the context from acceptor reads overlapping the variant itself). def get_acceptor_context(self): + """Returns the acceptor context used to construct the variant context. + + Returns + ------- + self.variant_acceptor_context : OverlapContext + Acceptor context associated with the variant context + """ return self.variant_acceptor_context # Returns the donor context (the context from donor reads overlapping the variant itself). def get_donor_context(self): + """Returns the donor context used to construct the variant context. + + Returns + ------- + self.variant_donor_context : OverlapContext + Donor context associated with the variant context + """ return self.variant_donor_context # Returns a list of acceptor reads overlapping with the variant context. @@ -121,98 +203,222 @@ def get_unmapped_donor_mate_ids(self): return self.unmapped_donor_mate_ids # ===METHODS TO GET DATA (REQUIRING SOME CALCULATING) OF THE VARIANT CONTEXT=============== - # Returns the variant context length. def get_variant_context_length(self): + """Returns the length of the variant context. + + The length if determined by subtracting the leftmost genomic (start) position from the rightmost genomic (end) + position. + + Returns + ------- + int + Variant context length + """ return abs(self.variant_context_end - self.variant_context_start) - # Returns the distance of the variant context start position from the variant context origin. def get_start_distance_from_origin(self): + """Calculates and returns the variant context start from the variant position that constructed it. + + Returns + ------- + int + Distance between variant context leftmost genomic and variant position. + """ return abs(self.variant_context_origin - self.variant_context_start) - # Returns the distance of the variant context end position from the variant context origin. def get_end_distance_from_origin(self): + """Calculates and returns the variant context end from the variant position tha constructed it. + + Returns + ------- + int + Distance between variant context rightmost genomic and variant position + """ return abs(self.variant_context_end - self.variant_context_origin) # ===METHODS TO OBTAIN VARIANT CONTEXT ACCEPTOR READ DATA================== - # Returns the number of variant context acceptor reads. def get_number_of_acceptor_reads(self): + """Returns the number of variant context acceptor reads. + + Returns + ------- + int + Number of variant context acceptor reads + """ if self.variant_context_areads is None: return 0 return len(self.variant_context_areads) - # Returns the identifiers of acceptor reads overlapping with the variant context. def get_acceptor_read_ids(self): + """Returns the variant context acceptor read identifiers + + Returns + ------- + list of str or None + Variant context acceptor read identifiers, None if there are no acceptor reads + """ if self.variant_context_areads is None: return [None] return list(set([x.get_bam_read_id() for x in self.variant_context_areads])) - # Returns the list of left most acceptor read positions, def get_acceptor_read_starts(self): + """Returns the variant context acceptor read starting positions + + Returns + ------- + list of int or None + Variant context acceptor read leftmost genomic positions, None if there are no acceptor reads + """ if self.variant_context_areads is None: return [None] return [x.get_bam_read_ref_pos() for x in self.variant_context_areads] - # Returns a list of the left most positions of the R1 variant context accecptor BAM reads. def get_acceptor_read_left_positions(self): + """Returns the leftmost genomic positions of all variant context R1 acceptor reads. + + Returns + ------- + list of int or None + Variant context R1 acceptor read leftmost genomic positions, None of there are no acceptor reads + """ if self.variant_context_areads is None: return [None] return [x.get_bam_read_ref_pos() for x in self.variant_context_areads if x.is_read1()] - # Returns the list of all end positions for all variant context acceptor reads. def get_acceptor_read_ends(self): + """Returns the variant context acceptor read rightmost positions. + + Returns + ------- + list of int or None + Variant context R2 acceptor read rightmost genomic positions, None if there are no acceptor reads + """ if self.variant_context_areads is None: return [None] return [x.get_bam_read_ref_end() for x in self.variant_context_areads] - # Returns a list of the right most positions ofr the R2 variant context acceptor BAM read. def get_acceptor_read_right_positions(self): + """Returns the rightmost genomic positions for all variant context R2 acceptor reads + + Returns + ------- + list of int or None + Variant context + """ if self.variant_context_areads is None: return [None] return [x.get_bam_read_ref_end() for x in self.variant_context_areads if x.is_read2()] # ===METHODS TO OBTAIN VARIANT CONTEXT DONOR READ DATA===================== - # Returns the number of variant context donor reads. def get_number_of_donor_reads(self): + """Returns the number of variant context donor reads. + + Returns + ------- + int + Number of variant context donor reads + """ return len(self.variant_context_dreads) - # Returns the identifiers of donor reads overlapping with the variant context. def get_donor_read_ids(self): + """Returns the identifiers of donor reads overlapping with the variant context. + + Returns + ------- + list of str + Variant context donor read identifiers + """ return list(set([x.get_bam_read_id() for x in self.variant_context_dreads])) - # Returns the list of variant context donor read starting positions. def get_donor_read_starts(self): + """Returns the list of variant context donor read starting positions. + + Returns + ------- + list of int + Variant context donor read leftmost genomic positions + """ return [x.get_bam_read_ref_pos() for x in self.variant_context_dreads] - # Returns the list of variant context donor read pairs left most positions (start pos of read 1). def get_donor_read_left_positions(self): + """Returns the list of variant context R1 donor read leftmost positions + + Returns + ------- + list of int + Variant context R1 donor read leftmost genomic positions + """ return [x.get_bam_read_ref_pos() for x in self.variant_context_dreads if (x.is_read1())] # Returns a list of all donor read ending positions def get_donor_read_ends(self): + """Returns variant context donor read rightmost positions. + + Returns + ------- + list of int + Variant context donor read rightmost genomic positions + """ return [x.get_bam_read_ref_end() for x in self.variant_context_dreads] - # Returns a list of all variant context donor reads right most positions (end pos of read 2). def get_donor_read_right_positions(self): + """Returns the list of variant context R2 donor reads + + Returns + ------- + list of int + Variant context R2 donor reads rightmost genomic positions + """ return [x.get_bam_read_ref_end() for x in self.variant_context_dreads if (x.is_read2())] # ===METHODS TO ADD DATA TO THE VARIANT CONTEXT============================ - # Sets the acceptor context of the variant context. def set_acceptor_context(self, acceptor_context): + """Sets the acceptor context of the variant context with the one provided. + + Parameters + ---------- + acceptor_context : OverlapContext + Acceptor context to add to the variant context + """ self.variant_acceptor_context = acceptor_context - # Sets the donor context of the variant context. def set_donor_context(self, donor_context): + """Sets the donor context of the variant context with the one provided. + + Parameters + ---------- + donor_context : OverlapContext + Donor context to add to the variant context + """ self.variant_donor_context = donor_context - # Adds an acceptor context to the variant context by creating it from data values. def add_acceptor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, acceptorreads): + """Sets the acceptor context of the variant context by constructing one from the provided parameters. + + Parameters + ---------- + contextid : str + Acceptor context identifier + sampleid : str + Sample name/identifier + contextchrom : str + Chromosome name the context is located on + contextorigin : int + Variant position the context is constructed from + contextstart : int + Leftmost genomic position of the acceptor context + contextend : int + Rightmost genomic position of the acceptor context + acceptorreads : list of DonorBamRead + Acceptor context reads + """ self.variant_acceptor_context = OverlapContext( contextid, sampleid, contextchrom, contextorigin, @@ -220,11 +426,29 @@ def add_acceptor_context(self, contextid, sampleid, acceptorreads ) - # Adds a donor context to the variant context by creating it from data values. def add_donor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, donorreads): + """Sets the donor context of the variant context by constructing one from the provided parameters. + + Parameters + ---------- + contextid : str + Donor context identifier + sampleid : str + Sample name/identifier + contextchrom : str + Chromosome name the context is located on + contextorigin : int + Variant genomic position the context is constructed from + contextstart : int + Leftmost genomic position of the donor context + contextend : int + Rightmost genomic position of the donor context + donorreads : list of DonorBamRead + Donor context reads + """ self.variant_donor_context = OverlapContext( contextid, sampleid, contextchrom, contextorigin, @@ -233,74 +457,189 @@ def add_donor_context(self, contextid, sampleid, ) # ===METHODS TO OBTAIN VARIANT CONTEXT UNMAPPED MATE READ DATA============= - # Returns the variant context acceptor read ids that have an unmapped mate. def get_unmapped_acceptor_read_ids(self): + """Returns the variant context acceptor read identifiers that have an unmapped mate. + + Returns + ------- + self.unmapped_acceptor_mate_ids : list of str + Variant context acceptor read identifiers with an unmapped mate + """ return self.unmapped_acceptor_mate_ids - # Returns the variant context donor read ids that have an unmapped mate. def get_unmapped_donor_read_ids(self): + """Returns the variant context donor read identifiers that have an unmapped mate. + + Returns + ------- + self.unmapped_donor_mate_ids : list of str + Variant context donor read identifiers with an unmapped mate + :return: + """ return self.unmapped_donor_mate_ids - # Adds a variant context appector mate identifier. def add_unmapped_acceptor_mate_id(self, mateid): + """Adds a variant context appector mate identifier. + + Parameters + ---------- + mateid : str + Variant context acceptor read identifier with an unmapped mate + """ self.unmapped_acceptor_mate_ids.append(mateid) - # Adds a variant context donor mate identifier. def add_unmapped_donor_mate_id(self, mateid): + """Adds a variant context donor mate identifier. + + Parameters + ---------- + mateid : str + Variant context donor read identifier with an unmapped mate + """ self.unmapped_donor_mate_ids.append(mateid) - # Sets the variant context unmapped acceptor mate ids. def set_unmapped_acceptor_mate_ids(self, mateids): + """Sets the variant context unmapped acceptor mate ids. + + Parameters + ---------- + mateids : list of str + Variant context acceptor read identifiers with unmapped mate + """ self.unmapped_acceptor_mate_ids = mateids - # Sets the variant context unmapped donor mate ids. def set_unmapped_donor_mate_ids(self, mateids): + """Sets the variant context unmapped donor mate ids. + + Parameters + ---------- + mateids : list of str + Variant context donor read identifiers with unmapped mates + """ self.unmapped_donor_mate_ids = mateids - # Returns whether a specified variant context acceptor read has an unmapped mate. def acceptor_read_has_unmapped_mate(self, readid): + """Returns whether a specified variant context acceptor read has an unmapped mate. + + Parameters + ---------- + readid : str + Acceptor read identifier + + Returns + ------- + bool + True if acceptor read has unmapped mate, False if not + """ return readid in self.unmapped_acceptor_mate_ids - # Returns whether a specified variant context donor read has an unmapped mate. def donor_read_has_unmapped_mate(self, readid): + """Checks and returns whether a specified variant context donor read has an unmapped mate. + + Parameters + ---------- + readid : str + Donor read identifier + + Returns + ------- + bool + True if donor read has unmapped mate, False if not + """ return readid in self.unmapped_donor_mate_ids - # Returns the number of variant context acceptor reads with an unmapped mate. def get_number_of_unmapped_acceptor_mates(self): + """Returns the number of variant context acceptor reads with an unmapped mate. + + Returns + ------- + int + Number of variant context acceptor reads with an unmapped mate. + """ return len(self.unmapped_acceptor_mate_ids) - # Returns the number of variant context donor reads with an unmapped mate. def get_number_of_unmapped_donor_mates(self): + """Returns the number of variant context donor reads with an unmapped mate. + + Returns + ------- + int + Number of variant context donor reads with an unmapped mate. + """ return len(self.unmapped_donor_mate_ids) # ===METHODS TO ADD UNMAPPED MATES TO THE ACCEPTOR AND DONOR CONTEXT======= - # Sets the unmapped mate ids for the acceptor context. def set_acceptor_context_unmapped_mates(self, mateids): + """Sets the unmapped mate ids for the acceptor context. + + Returns + ------- + mateids : list of str + Acceptor context read identifiers with an unmapped mate + """ self.variant_acceptor_context.set_unmapped_mate_ids(mateids) - # Adds an unmapped read id to the acceptor context. def add_acceptor_context_unmapped_mate(self, ureadid): + """Adds an unmapped read id to the acceptor context. + + Parameters + ---------- + ureadid : str + Acceptor context read identifier with an unmapped mate + """ self.variant_acceptor_context.add_unmapped_mate_id(ureadid) - # Sets the unmapped mate ids for the donor context. def set_donor_context_unmapped_mates(self, mateids): + """Sets the unmapped mate ids for the donor context. + + Parameters + ---------- + mateids : list of str + Donor context read identifiers with an unmapped mate + """ self.variant_donor_context.set_unmapped_mate_ids(mateids) - # Adds an unmapped read id to the donor context. - def add_donor_context_unmapped_mate_id(self, ureadid): - self.variant_donor_context.setUnmappedMateId(ureadid) + def add_donor_context_unmapped_mate(self, ureadid): + """Adds an unmapped read id to the donor context. + + Parameters + ---------- + ureadid : str + Donor context read identifier with an unmapped mate + """ + self.variant_donor_context.add_unmapped_mate_id(ureadid) # ===METHODS TO OBTAIN STATISTICS OF THE VARIANT CONTEXT=================== - # Returns the average and median quality of the acceptor reads associated with def get_average_and_median_acceptor_read_qual(self): + """Returns the average and median quality of the acceptor reads associated with. + + Returns + ------- + + """ return self.get_average_and_median_read_qual(self.variant_context_areads) - # Returns the average and median quality of the donor reads associated with this variant context. def get_average_and_median_donor_read_qual(self): + """Calculates and returns the mean and median variant context donor read Q-Score. + + Returns + ------- + list of int + Mean and median Q-Score + """ return self.get_average_and_median_read_qual(self.variant_context_dreads) - # Returns the average and median read quality. def get_average_and_median_read_qual(self, contextreads): + """Calculates and returns the mean and median read Q-Score. + + Parameters + ---------- + contextreads : list or reads + Reads to calculate mean and median Q-Score of + + Returns + ------- + """ if contextreads is not None: avgmedqual = [] for contextread in contextreads: @@ -309,16 +648,39 @@ def get_average_and_median_read_qual(self, contextreads): statistics.median(avgmedqual)]) return [None, None] - # Returns the average and median mapq values of the acceptor reads associated with this variant context. def get_average_and_median_acceptor_read_mapq(self): + """Calculates and returns the mean and median variant context acceptor read MAPQ value. + + Returns + ------- + list of int + Mean and median MAPQ, None if there are no acceptor reads + """ return self.get_average_and_median_read_mapq(self.variant_context_areads) - # Returns the average and median mapq values of the donor reads associated with this variant context. def get_average_and_median_donor_read_mapq(self): + """Calculates and returns the mean and median variant context donor read MAPQ value. + + Returns + ------- + list of int + Mean and median variant context donor read MAPQ, None if there are no donor reads + """ return self.get_average_and_median_read_mapq(self.variant_context_dreads) - # Returns the average and median read MapQ of this variant context. def get_average_and_median_read_mapq(self, contextreads): + """Calculates and returns the mean and median MAPQ value of provided reads. + + Parameters + ---------- + contextreads : list of DonorBamRead + Reads to calculate mean and median MAPQ of + + Returns + ------- + list of int + Mean and median read MAPQ, None if no reads provided + """ if contextreads is not None: avgmedmapq = [] for contextread in contextreads: @@ -327,16 +689,39 @@ def get_average_and_median_read_mapq(self, contextreads): statistics.median(avgmedmapq)]) return [None, None] - # Returns the average and median length of the acceptor reads associated with this variant context. def get_average_and_median_acceptor_read_length(self): + """Calculates and returns the mean and median variant context acceptor read length. + + Returns + ------- + list of int + Mean and median variant context acceptor read length, None if there are no acceptor reads + """ return self.get_average_and_median_read_length(self.variant_context_areads) - # Returns the average and median length of the donor reads associated with this variant context. def get_average_and_median_donor_read_length(self): + """Calculates and returns the mean and median variant context donor read length. + + Returns + ------- + list of int + Mean and median variant context donor read length, None if there are no donor reads + """ return self.get_average_and_median_read_length(self.variant_context_dreads) - # Returns the average and median read length. def get_average_and_median_read_length(self, contextreads): + """Calculates and returns the mean and median read length of a specified list of reads. + + Parameters + ---------- + contextreads : list of DonorBamReds + Reads to to calculate mean and median length of + + Returns + ------- + list of int + Mean and median read length, None if no reads are supplied + """ if contextreads is not None: avgmedlen = [] for contextread in contextreads: @@ -346,138 +731,335 @@ def get_average_and_median_read_length(self, contextreads): return [None, None] # ===METHODS TO OBTAIN ACCEPTOR CONTEXT DATA=============================== - # Returns whether the variant context has an acceptor context def has_acceptor_context(self): + """Returns whether the variant context has an acceptor context + + Returns + ------- + bool + True if variant context has an acceptor context, False if not + """ return self.variant_acceptor_context is not None - # Returns the acceptor context identifier (should be the same as the variant context id). def get_acceptor_context_id(self): + """Returns the acceptor context identifier. + + Returns + ------- + str + Acceptor context identifier + """ return self.variant_acceptor_context.get_context_id() - # Returns the acceptor context sample id (should be the same as the variant context sample id). def get_acceptor_context_sample_id(self): + """Returns the acceptor context sample id. + + Returns + ------- + str + Sample name/identifier of the acceptor context + """ return self.variant_acceptor_context.get_sample_id() - # Returns the chromosome of the acceptor context. def get_acceptor_context_chrom(self): + """Returns the chromosome name of the acceptor context. + + Returns + ------- + str + Chromosome name the acceptor context is located on + """ return self.variant_acceptor_context.get_context_chrom() - # Returns the origin position of the acceptor context. def get_acceptor_context_origin(self): + """Returns the origin position of the acceptor context. + + Returns + ------- + int + Variant genomic position the context is based on + """ return self.variant_acceptor_context.get_context_origin() - # Returns the starting position of the acceptor context. def get_acceptor_context_start(self): + """Returns the starting position of the acceptor context. + + Returns + ------- + int + Acceptor context leftmost genomic position + """ return self.variant_acceptor_context.get_context_start() - # Returns the ending position of the acceptor context. def get_acceptor_context_end(self): + """Returns the ending position of the acceptor context. + + Returns + ------- + int + Acceptor context rightmost genomic position + """ return self.variant_acceptor_context.get_context_end() - # Returns the length of the acceptor context. def get_acceptor_context_length(self): + """Returns the length of the acceptor context. + + Returns + ------- + int + Acceptor context length + """ return self.variant_acceptor_context.get_context_length() - # Returns the acceptor context reads. def get_acceptor_context_reads(self): + """Returns the acceptor context reads. + + Returns + ------- + list of DonorBamRead + Acceptor context reads + """ return self.variant_acceptor_context.get_context_bam_reads() - # Returns the lost of read ids in the acceptor context. def get_acceptor_context_read_ids(self): + """Returns the acceptor context read identifiers. + + Returns + ------- + list of str + Acceptor context read identifiers + """ return self.variant_acceptor_context.get_context_bam_read_ids() - # Returns a list of all acceptor context BAM read start positions. def get_acceptor_context_read_starts(self): + """Returns the leftmost genomic positions of the acceptor context reads + + Returns + ------- + """ return self.variant_acceptor_context.get_context_bam_read_starts() - # Returns a list of all acceptor context R1 BAM read left positons. def get_acceptor_context_read_left_positions(self): + """Returns the leftmost genomic positions of all R1 acceptor context reads + + Returns + ------- + list of int + Acceptor context leftmost genomic R1 read positions + """ return self.variant_acceptor_context.get_context_bam_read_left_positions() - # Returns a list of all acceptor context BAM read end positions. def get_acceptor_context_read_ends(self): + """Returns the rightmost genomic positions of all acceptor context reads. + + Returns + ------- + list of int + Acceptor context rightmost genomic read positions. + """ return self.variant_acceptor_context.get_context_bam_read_ends() - # Returns a list of all acceptor context R2 BAM read end positions. def get_acceptor_context_read_right_positions(self): + """Returns a list of all acceptor context R2 BAM read end positions. + + Returns + ------- + list of int + Acceptor context rightmost genomic R2 read positions + """ return self.variant_acceptor_context.get_context_bam_read_right_positions() - # Returns a list of all acceptor context BAM read lengths. def get_acceptor_context_read_lengths(self): + """Returns the lengths of the acceptor context reads. + + Returns + ------- + list of int + Acceptor context read lengths + """ return self.variant_acceptor_context.get_context_bam_read_lengths() - # Returns the list of acceptor context unmapped mate read ids def get_acceptor_context_unmapped_mate_ids(self): + """Returns the acceptor context read identifiers with unmapped mates. + + Returns + ------- + list of str + Acceptor context read identifiers with unmapped mates. + """ return self.variant_acceptor_context.get_unmapped_read_mate_ids() # ===METHODS TO OBTAIN DONOR CONTEXT DATA================================== - # Returns whether the variant context has a donor context def has_donor_context(self): + """Checks and returns whether the variant context has a donor context saved. + + Returns + ------- + bool + True if the variant context has a donor context, False if not + """ return self.variant_donor_context is not None - # Returns the donor context identifier (should be the same as the variant context id). def get_donor_context_id(self): + """Returns the donor context identifier. + + Returns + ------- + str + Donor context identifier + """ return self.variant_donor_context.get_context_id() - # Returns the acceptor context sample id (should be the same as the variant context sample id). def get_donor_context_sample_id(self): + """Returns the sample name/identifier of the donor context + + Returns + ------- + str + Donor context sample name/identifier + """ return self.variant_donor_context.get_sample_id() - # Returns the chromosome of the donor context. def get_donor_context_chrom(self): + """Returns the chromosome of the donor context. + + Returns + ------- + """ return self.variant_donor_context.get_context_chrom() - # Returns the origin position of the donor context. def get_donor_context_origin(self): + """Returns the origin position of the donor context. + + Returns + ------- + int + Variant genomic position that the context is constructed from. + """ return self.variant_donor_context.get_context_origin() - # Returns the starting position of the donor context. def get_donor_context_start(self): + """Returns the starting position of the donor context. + + Returns + ------- + int + Donor context leftmost genomic position + """ return self.variant_donor_context.get_context_start() - # Returns the ending position of the donor context. def get_donor_context_end(self): + """Returns the ending position of the donor context. + + Returns + ------- + int + Donor context rightmost genomic position + """ return self.variant_donor_context.get_context_end() - # Returns the length of the acceptor context. def get_donor_context_length(self): + """Returns the length of the donor context. + + Returns + ------- + int + Donor context length + """ return self.variant_donor_context.get_context_length() - # Returns the donor context reads. def get_donor_context_reads(self): + """Returns all donor context reads. + + Returns + ------- + list of DonorBamRead + Donor context reads + """ return self.variant_donor_context.get_context_bam_reads() - # Returns the donor context read identifers. def get_donor_context_read_ids(self): + """Returns the identifiers of all donor context reads. + + Returns + ------- + list of str + Donor context read identifiers + """ return self.variant_donor_context.get_context_bam_read_ids() - # Returns a list of donor context read start positions. def get_donor_context_read_starts(self): + """Returns the leftmost genomic read positions of all donor context reads. + + Returns + ------- + list of int + Donor context leftmost genomic read positions + """ return self.variant_donor_context.get_context_bam_read_starts() - # Returns a list of all acceptor context R1 BAM read left positons. def get_donor_context_read_left_positions(self): + """Returns the leftmost genomic positions of all R1 donor context reads. + + Returns + ------- + list of int + Donor context leftmost genomic R1 read positions + """ return self.variant_donor_context.get_context_bam_read_left_positions() - # Returns a list of donor context read end positions. def get_donor_context_read_ends(self): + """Returns the rightmost genomic positions of all donor context reads. + + Returns + ------- + list of int + Donor context rightmost genomic read positions + """ return self.variant_donor_context.get_context_bam_read_ends() - # Returns a list of all acceptor context R2 BAM read right positons. def get_donor_context_read_right_positions(self): + """Returns the rightmost genomic positions of all R2 donor context reads + + Returns + ------- + list of int + Donor context rightmost genomic R2 read positions + """ return self.variant_donor_context.get_context_bam_read_right_positions() - # Returns a list of donor context BAM read lengths. def get_donor_context_read_lengths(self): + """Returns the lengths of the donor context reads. + + Returns + ------- + list of int + Donor context read lengths + """ return self.variant_donor_context.get_context_bam_read_lengths() - # Returns the list of donor context unmapped mate read ids. def get_donor_context_unmapped_mate_ids(self): + """Returns the donor context read identifiers that have unmapped mates. + + Returns + ------- + list of str + Donor context read identifiers + """ return self.variant_donor_context.get_unmapped_read_mate_ids() # ===METHODS TO PRODUCE SOME OUTPUT ABOUT THE VARIANT CONTEXT============== - # Returns a varcon.txt string representation of the variant context. def to_string(self): + """Creates and returns the variant context as a String representation. + + The created String representation of the variant context is equal to the entry of a variant context file. Each + included data attribute is separated by a tab. + + Returns + ------- + str + Variant context as variant context file entry + """ if self.variant_context_areads is None: ad_ratio = "N/A" list_areads = None @@ -515,7 +1097,7 @@ def to_statistics_string(self): Returns ------- str - Basic variant context statistics + Basic tab separated variant context statistics """ areadlen = self.get_average_and_median_acceptor_read_length() dreadlen = self.get_average_and_median_donor_read_length() diff --git a/VariantContextFile.py b/VariantContextFile.py index 915f5de..ec3ab18 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -451,7 +451,11 @@ def get_variant_contexts2(self, aslist=False, varconfilter=None, # Main method that returns whether a variant (SNP or indel). def variant_is_in_context(self, varianttype, searchchrom, searchstart, searchstop): - """ + """Checks whether a variant is located in an already existing context. + + If the variant type is set to 'snp' the method will check and return whether the SNP overlaps with a variant + context. If the variant is set to 'indel' the method will check and return wheterh the area from start to end + of the indel overlaps with a variant context. Parameters ---------- @@ -467,7 +471,7 @@ def variant_is_in_context(self, varianttype, searchchrom, Returns ------- bool or None - + True if the variant overlaps with a context, False if not """ if varianttype == "snp": return self.snp_variant_is_in_context(searchchrom, searchstart) @@ -478,6 +482,23 @@ def variant_is_in_context(self, varianttype, searchchrom, # Determines whether an SNP variant is located in an already existing variant context. def snp_variant_is_in_context(self, varchrom, vcfvarpos): + """Checks and returns whether a SNP is located in an already existing variant context. + + For each variant context it is first checked whether the chromosome name of the context and SNP are the same. If + so, it is then checked whether the SNP genomic position is equal or larger than the context start and smaller or + equal than the context end. + + Parameters + ---------- + varchrom : str + Chromosome name the SNP is located on + vcfvarpos : int + Genomic position of the SNP + Returns + ------- + bool + True if the SNP is in a variant context, False if not + """ for varcon in self.variant_contexts.values(): if varchrom == varcon.get_variant_context_chrom(): if (vcfvarpos >= varcon.get_variant_context_start() @@ -488,6 +509,25 @@ def snp_variant_is_in_context(self, varchrom, vcfvarpos): # Determines whether an indel variant is located within an existing variant context (indelLeftPos and indelRightPos # can be the used search window). def indel_variant_is_in_context(self, indelchrom, indelleftpos, indelrightpos): + """Checks and returns whether an indel is located in an already existing variant context. + + For each variant context it is first checked whether the chromosome name of the context and indel are the same. + If so, it is than checked whether the indel range from indel start to end overlaps with the context. + + :Parameters + ----------- + indelchrom : str + The chromosome name the indel is located on + indelleftpos : int + The leftmost genomic position of the indel + indelrightpos : int + The rightmost genomic position of the indel + + Returns + ------- + bool + True if the indel overlaps with a variant context, False if not + """ for varcon in self.variant_contexts.values(): if (indelchrom == varcon.get_variant_context_chrom()): if (indelleftpos <= varcon.get_variant_context_start() @@ -503,6 +543,18 @@ def indel_variant_is_in_context(self, indelchrom, indelleftpos, indelrightpos): # Checks whether a context is located in an already existing context def context_collision(self, context_arr): + """Checks and returns whether a potential context overlaps with an already existing variant context. + + Parameters + ---------- + context_arr: list + Essential context data (chrom, variant pos, start, end) + + Returns + ------- + bool + True if provided context overlaps with an already existing context, False if not + """ if f"{context_arr[0]}_{context_arr[1]}" in self.variant_contexts: return True else: @@ -516,6 +568,16 @@ def context_collision(self, context_arr): # ===METHODS TO ADD DATA/VARIANT CONTEXTS TO THE VARIANT CONTEXT FILE====== # Sets a variant context object to a specified context id. def set_variant_context(self, varconid, varcontext): + """Sets a provided variant context to the provided context identifier. Will overwrite the previous value for + the provided context identifier. + + Parameters + ---------- + varconid : str + Identifier of the variant context to add + varcontext: VariantContext + The VariantContext to set + """ self.variant_contexts[varconid] = varcontext # Adds a variant context object @@ -524,6 +586,29 @@ def add_variant_context(self, varconid, sampleid, varconstart, varconend, varcon_areads, varcon_dreads, acceptor_context=None, donor_context=None): + """ + + Parameters + ---------- + varconid : str + Identifier of the variant context + sampleid : str + Identifier of the sample + varconchrom : str + Chromosome name of the variant context + varconorigin : int + The variant position the context is based on + varconstart : int + + varconend : int + varcon_areads : list of DonorBamReads + varcon_dreads : list of DonorBamReads + + acceptor_context : OverlapContext + Acceptor context belonging to the variant context + donor_context : OverlapContext + Donor context belonging to the variant context + """ varcon_obj = VariantContext(varconid, sampleid, varconchrom, varconorigin, varconstart, varconend, @@ -578,7 +663,7 @@ def add_acceptor_context(self, contextid, sampleid, contextchrom: str Chromosome name the context is located on contextorigin: int - Variant genomic position that created the context + Variant genomic position the context is based on contextstart: int Leftmost genomic position of the context contextend: int @@ -595,7 +680,7 @@ def add_acceptor_context(self, contextid, sampleid, # Sets the donor context object of a variant context def set_donor_context(self, varconid, donor_context): - """Adds an already existing context + """Adds an already existing donor context to a specified variant context. Parameters ---------- diff --git a/VcfBamScanner.py b/VcfBamScanner.py index ac1e627..e9e589e 100644 --- a/VcfBamScanner.py +++ b/VcfBamScanner.py @@ -10,8 +10,18 @@ class VcfBamScanner: The VcfBamScanner can be used to scan folders for variant and alignments files, check whether variant and alignment files have sample names and extract them. A list file containing paths to either variant or alignment files can - also be.scanned. Files within the list files will be a saved per sample name in an identifier.""" - # Constructor that creates two empty hashmaps (dicts). + also be.scanned. Files within the list files will be a saved per sample name in an identifier. + + Attributes + ---------- + vcf_sample_map : dict + Dictionary with variant files per sample + bam_sample_map : dict + Dictionary with alignment files per sample + valid_file_types : list of str + Valid variant and alignment file types + """ + def __init__(self): """Obtains the vase logger and creates two empty dictionaries to save the variant and alignment files.""" self.vaselogger = logging.getLogger("VaSe_Logger") @@ -20,7 +30,6 @@ def __init__(self): self.valid_file_types = ["VCF", "BCF", "BAM", "CRAM"] # ===METHODS TO GET SAVED DATA FROM THE VCFBAMSCANNER====================== - # Returns the map that links VCF files and samples. def get_vcf_sample_map(self): """Returns a dictionary with VCF/BCF files saved per sample. @@ -31,7 +40,6 @@ def get_vcf_sample_map(self): """ return self.vcf_sample_map - # Returns the map that links BAM files and samples. def get_bam_sample_map(self): """Returns a dictionary with BAM/CRAM files saved per sample. @@ -42,9 +50,8 @@ def get_bam_sample_map(self): """ return self.bam_sample_map - # Returns a map that links VCF files to BAM files. def get_vcf_to_bam_map(self): - """Returns a dictionary with a variant file linked to an alignment file. + """Returns a dictionary with a variant file linked to an alignment file of the same sample. For each sample, the associated variant file is linked to the associated.alignment file are linked via a dictionary. The variant and alignment file are saved as key and value respectively. @@ -115,7 +122,6 @@ def scan_vcf_folders(self, vcffolders): self.vaselogger.info("Finished scanning VCF files") return self.vcf_sample_map - # Scans the folders containing BAM files and returns a map that links sample ids with bam files. def scan_bam_folders(self, bamfolders): """Scans a list of folders containing alignment files and returns a dictionary with valid files per sample name. @@ -201,8 +207,21 @@ def scan_vcf_files(self, vcflistfile): self.vcf_sample_map = self.read_donorlistfile(vcflistfile, "v") return self.vcf_sample_map - # Read a specific donor list file def read_donorlistfile(self, listfileloc, listtype="a"): + """Reads a list file containing paths to donor files and returns the donor files per sample name. + + Parameters + ---------- + listfileloc : str + Path to the list file containing paths to donor files + listtype : str + Type of variant files in the list file (a=alignment ; v=variant) + + Returns + ------- + donor_sample_files : dict + Paths to donor variant files per sample name + """ donor_sample_files = {} try: with open(listfileloc, "r") as listfile: @@ -221,7 +240,6 @@ def read_donorlistfile(self, listfileloc, listtype="a"): exit() return donor_sample_files - # Checks whether the BAM file contains a sample name. def bam_has_sample_name(self, bamfile): """Checks and returns whether a BAM/CRAM file has a sample name/identifier. @@ -233,7 +251,7 @@ def bam_has_sample_name(self, bamfile): Returns ------- bool - True, if file has a sample name, otherwise False + True if file has a sample name, otherwise False """ if "RG" in bamfile.header: if len(bamfile.header["RG"]) > 0: @@ -241,7 +259,6 @@ def bam_has_sample_name(self, bamfile): return True return False - # Checks whether the VCF file has a sample name def vcf_has_sample_name(self, vcffile): """Checks and returns whether the VCF/BCF file has a sample name/identifier. @@ -259,6 +276,7 @@ def get_vcf_sample_name(self, vcffileloc): """Extracts and returns the list of sample names from a specified VCF file. Parameters + ---------- vcffileloc : str Path to a VCF file @@ -272,9 +290,8 @@ def get_vcf_sample_name(self, vcffileloc): return list(vcffile.header.samples) return None - # Returns the BAM sample def get_bam_sample_name(self, bamfileloc): - """Extracts the sample name/identifier from a BAM file. + """Extracts and returns the sample name/identifier from a BAM file. The sample name is extracted by first checking whether the BAM file contains a sample name via the VcfBamScanner method bam_has_sample_name(). If so, the sample name is extracted from the BAM file by returning @@ -301,9 +318,8 @@ def get_bam_sample_name(self, bamfileloc): except IOError: self.vaselogger.warning(f"Could not open {bamfileloc} to extract sample identifier") - # Returns the CRAM sample name def get_cram_sample_name(self, cramfileloc): - """Extracts the sample name/identifier from a specified CRAM file. + """Extracts and returns the sample name/identifier from a specified CRAM file. The sample name is extracted by first checking whether the CRAM file contains a sample name via the VcfBamScanner method cram_has_sample_name(). If so, the sample name is extracted from the CRAM file by @@ -315,7 +331,6 @@ def get_cram_sample_name(self, cramfileloc): cramfileloc : str Path to a CRAM file - Returns ------- str or None @@ -332,7 +347,20 @@ def get_cram_sample_name(self, cramfileloc): # General methods that returns one or more sample ids (in case of a VCF) def get_sample_id(self, donorfileloc, donorlisttype): - """Returns the sample name/identifier for a specified alignemt or variant file.""" + """Returns the sample name/identifier for a specified alignment or variant file. + + Parameters + ---------- + donorfileloc : str + Path to a donor variant/alignment file + donorlisttype : str + Type of donor file (a=alignment ; v=variant) + + Returns + ------- + str + File type of provided donor file + """ donor_file_type = self.get_donor_file_type(donorfileloc) if donorlisttype == "a": if donor_file_type[1] == "BAM": @@ -393,12 +421,12 @@ def get_alignment_sequence_names(self, alignment_file): sequence_names.add(sn_entry["SN"]) return sequence_names - # Extracts the sequence names from an already opened VCF/BCF file def get_variant_sequence_names(self, variant_file): """Extracts and returns the chromosome names from a variant file. Chromosome names are extracted from a specified VCF or BCF file by obtaining the header contig names via the - pysam method .header.contigs(). + pysam method .header.contigs(). The provided variant file is expected to be an already opened pysam + VariantFile. Parameters ---------- diff --git a/VcfVariant.py b/VcfVariant.py index f705a37..5a0c213 100644 --- a/VcfVariant.py +++ b/VcfVariant.py @@ -2,6 +2,23 @@ class VcfVariant: + """Saves necessary data of a genomic variant from a VCF/BCF file. + + Attributes + ---------- + vcf_variant_chrom : str + Chromosome name the variant is located on + vcf_variant_start : int + The leftmost genomic position of the variant + vcf_variant_ref : str + The reference allele of the variant + vcf_variant_alts : tuple + The alternative allele(s) of the variant + vcf_variant_filter : str + Filter applied to the variant (i.e. PASS) + vcf_variant_type : str + Type of the variant (SNP/Indel/etc) + """ def __init__(self, varchrom, varstart, varref, varalts, varfilter, vartype): self.vcf_variant_chrom = varchrom self.vcf_variant_start = varstart @@ -10,59 +27,147 @@ def __init__(self, varchrom, varstart, varref, varalts, varfilter, vartype): self.vcf_variant_filter = varfilter self.vcf_variant_type = vartype - # Returns the chromosome name of the VCF variant def get_variant_chrom(self): + """Returns the chromosome name the variant is located on + + Returns + ------- + self.vcf_variant_chrom : str + Chromosome name the variant is located on + """ return self.vcf_variant_chrom - # Sets the VCF variant chromosome def set_variant_chrom(self, varchrom): + """Sets the chromosome name the variant is located on. Overwrites an already set chromosome name. + + Parameters + ---------- + varchrom : str + Chromosome name the variant is located on + :return: + """ self.vcf_variant_chrom = varchrom - # Returns the VCF variant start position def get_variant_pos(self): + """Returns the genomic position of the variant. + + Returns + ------- + self.vcf_variant_pos : nt + Genomic position of the variant + """ return self.vcf_variant_start - # Sets the VCF variant starting position def set_variant_pos(self, varpos): + """Sets the variant genomic position. Overwrites an already set genomic position. + + Parameters + ---------- + varpos : int + Genomic position of the variant. + """ self.vcf_variant_start = varpos - # Returns the VCF variant reference allele def get_variant_ref_allele(self): + """Returns the reference allele of the variant. + + Returns + ------- + self.vcf_variant_ref : str + Reference allele of the variant + :return: + """ return self.vcf_variant_ref - # Sets the VCF variant reference allele def set_variant_ref_allele(self, varref): + """Sets the reference allele of the variant. Overwrites an already set reference allele. + + Parameters + ---------- + varref : str + Reference allele of the variant + """ self.vcf_variant_ref = varref - # Returns the VCF variant alternative alleles def get_variant_alt_alleles(self): + """Returns the alternative allele(s) of the variant. + + Returns + ------- + self.vcf_variant_alts : tuple + Alternative allele(s) of the variant + """ return self.vcf_variant_alts - # Sets the VCF variant alternative alleles def set_variant_alt_alleles(self, varalts): + """Sets the alternative allele(s) of the variants. Overwrites an already set alternative allele(s). + + Parameters + ---------- + varalts : tuple + Alternative allele(s) of the variant + """ self.vcf_variant_alts = varalts - # Returns the VCF variant filter def get_variant_filter(self): + """Returns the filter (i.e. PASS) that was applied to the variant + + Returns + ------- + self.vcf_variant_filter : str + + """ return self.vcf_variant_filter - # Sets the VCF variant filter def set_variant_filter(self, varfilter): + """Sets the filter applied to the variant. Overwrites any already set variant filter. + + Parameters + ---------- + varfilter : str + + """ self.vcf_variant_filter = varfilter - # Returns the VCF variant type (SNP/Indel, etc) def get_variant_type(self): + """Returns the variant type. + + Returns + ------- + self.vcf_variant_type : str + Variant type + """ return self.vcf_variant_type - # Sets the VCF variant type def set_variant_type(self, vartype): + """Sets the type of the variant. Overwrites any already set variant type. + + Parameters + ---------- + vartype : str + """ self.vcf_variant_type = vartype - # Returns a variant identifier in the form CHROM_POS def get_variant_id(self): + """Constructs and returns a variant identifier. + + The identifier is formed by combining the chromosome name and variant position, separated by an '_' + (i.e. CHROM_POS). This will be used as the identifier for contexts. + + Returns + ------- + str + Constructed variant identifier + """ return f"{self.vcf_variant_chrom}_{self.vcf_variant_start}" - # ToString method def to_string(self): + """Returns a String representation of the variant. + + Returns + ------- + str + String representation of the variant + """ return f"{self.vcf_variant_chrom}\t{self.vcf_variant_start}\t{self.vcf_variant_type}\t{self.vcf_variant_ref}" \ f"\t{self.vcf_variant_alts}\t{self.vcf_variant_filter}" From 931998122572f73b15d213734c234a09551a3ef6 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 11 Sep 2019 15:25:00 +0200 Subject: [PATCH 20/90] Add more docstrings and method to write p-mode link file --- DonorBamRead.py | 16 +-- VaSeBuilder.py | 293 ++++++++++++++++++++++++++++++++++++++++++---- VariantContext.py | 1 - vase.py | 37 ++++-- 4 files changed, 304 insertions(+), 43 deletions(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index 9065644..1b2dd5b 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -5,7 +5,7 @@ class DonorBamRead: """The DonorBamRead is used to save data from an aligned read extracted from a BAM or CRAM file. DonorBamRead can be used to save the data from aligned reads that were extracted from a BAM or CRAM file.""" - # Saves the required BAM read data. + def __init__(self, readid, readpn, readchrom, readstart, readlen, readseq, readquals, mapqual): """Saves all required data of an aligned read from a BAM or CRAM file. @@ -39,7 +39,6 @@ def __init__(self, readid, readpn, readchrom, readstart, self.bam_read_map_qual = mapqual # ===METHODS TO GET SAVED DATA FROM THE DONORBAMREAD======================= - # Returns the BAM read identifier. def get_bam_read_id(self): """Returns the identifier of the read. @@ -50,7 +49,6 @@ def get_bam_read_id(self): """ return self.bam_read_id - # Returns the BAM read pair number (1 or 2). def get_bam_read_pair_number(self): """Returns the pair number of the read. This is 1 for forward, or R1, reads and 2 for reverse, or R2, reads. @@ -62,7 +60,6 @@ def get_bam_read_pair_number(self): """ return self.bam_read_pairnum - # Returns the BAM read chromosome. def get_bam_read_chrom(self): """Returns the name of the chromosome where the read has been mapped. @@ -73,7 +70,6 @@ def get_bam_read_chrom(self): """ return self.bam_read_chrom - # Returns the BAM read starting position on the reference sequence. def get_bam_read_ref_pos(self): """Returns the genomic mapping position (the left most position) of the read. @@ -84,7 +80,6 @@ def get_bam_read_ref_pos(self): """ return self.bam_read_ref_pos - # Returns the BAM read length. def get_bam_read_length(self): """Returns the length of the read. In VaSeBuilder this length is calculated by means of the CIGAR string by using the pysam method infer_read_length() @@ -96,7 +91,6 @@ def get_bam_read_length(self): """ return self.bam_read_length - # Returns the BAM read ending position on the reference (calculated as starting position + the length of the read). def get_bam_read_ref_end(self): """Calculates and returns the rightmost genomic position of the read via leftmost position + read length. @@ -129,7 +123,6 @@ def get_bam_read_qual(self): """ return self.bam_read_qual - # Returns the BAM read quality as an array of Q-Scores. def get_bam_read_q_scores(self): """Converts the String of ASCII quality scores to a list Q-Scores. The Q-Score for each ASCII quality character are calculated by obtaining the unicode code point and subtracting 33. @@ -144,7 +137,6 @@ def get_bam_read_q_scores(self): qscores.append(ord(qualSymbol)-33) return qscores - # Returns the maping quality of the BAM read. def get_mapping_qual(self): """Returns the mapping quality that was assigned to the read. @@ -156,7 +148,6 @@ def get_mapping_qual(self): return self.bam_read_map_qual # ===METHOD TO GET STATISTICS DATA FROM THE DONORBAMREAD=================== - # Returns the average Q-Score. def get_average_qscore(self): """Calculates and returns the mean Q-Score of the read. @@ -168,7 +159,6 @@ def get_average_qscore(self): qscores = self.get_bam_read_q_scores() return statistics.mean(qscores) - # Returns the median Q-Score. def get_median_qscore(self): """Calculates and returns the median Q-Score of the read. @@ -181,7 +171,6 @@ def get_median_qscore(self): return statistics.median(qscores) # ===METHODS TO CHECK WHETHER THE BAM READ IS R1 OR R2===================== - # Returns if the BAM read is the first (forward) read. def is_read1(self): """Returns whether the read is the forward/R1 read by checking if the pair number is set to '1'. @@ -192,7 +181,6 @@ def is_read1(self): """ return self.bam_read_pairnum == "1" - # Returns if the BAM read is the second (reverse) read. def is_read2(self): """Returns whether the read is the reverse/R2 read by checking if the pair number is set to '2'. @@ -204,7 +192,6 @@ def is_read2(self): return self.bam_read_pairnum == "2" # ===METHODS TO RETURN A STRING REPRESENTATION OF THE DONORBAMREAD OBJECT== - # Returns a String representation. def to_string(self): """Returns a String with all the saved data, separated by tabs, of the read. @@ -222,7 +209,6 @@ def to_string(self): + str(self.bam_read_qual) + "\t" + str(self.bam_read_map_qual)) - # Returns the BAM read as a fastq sequence. def get_as_fastq_seq(self, addpairnum=False): """Returns the read as a FastQ entry. diff --git a/VaSeBuilder.py b/VaSeBuilder.py index e0f4399..eaabb0d 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -80,6 +80,30 @@ def build_varcon_set(self, sampleidlist, variant locus, and read IDs per variant in an external file, along with other relevant information. Optionally outputs additional statistics files. + + Parameters + ---------- + sampleidlist : list of str + Sample names/identifiers to process + vcfsamplemap : dict + Variant files per sample + bamsamplemap : dict + Alignment files per sample + acceptorbamloc : str + Path to the alignment file to use as acceptor + outpath : str + Path to folderto write outut files to + reference_loc : str + Path to genome reference fasta file + varcon_outpath : str + Path and suffix to write variant context output file to + variant_list : str + Path to file containing variants to process + + Returns + ------- + self.contexts : dict + Variant contexts per variant context identifier """ self.vaselogger.info("Begin building the variant context file.") donor_vcfs_used = [] @@ -401,6 +425,12 @@ def build_donor_from_varcon(self, varc_file, bamsamplemap, reference_loc): set from a pre-built variant context file. Does not output a new variant context file, and does not store any read information for the acceptor except for read IDs per variant. + + Parameters + ---------- + varc_file + bamsamplemap + reference_loc """ # Reads in an existing variant context file. self.contexts = VariantContextFile(varc_file) @@ -491,8 +521,26 @@ def build_validation_set(self, run_mode, acceptor_bam, self.vaselogger.info("Finished writing FastQ files.") return - # Checks whether a value is in a filter list (array or set) def passes_filter(self, val_to_check, filter_to_use, is_exclude_filter=False): + """Checks whether a value is in a filter list. + + A value can be checked against either an inclusion or exclusion filter. An inclusion filter should be the values + that should be included and used, an exclusion filter for values to be excluded and not used. + + Parameters + ---------- + val_to_check : str + Value to check against filter + filter_to_use : list of str + Values to use as filter + is_exclude_filter : bool + Is the filter an exclusion filter (false for inclusion filter) + + Returns + ------- + bool + True if value in an inclusive filter or not in an exclusive filter, False otherwise + """ if filter_to_use is not None: if is_exclude_filter: if val_to_check in filter_to_use: @@ -674,10 +722,24 @@ def filter_outliers(self, pos_list, k=3): if ((q1 - (k * iq)) <= x <= (q3 + (k * iq)))] return filtered - # Determines the size of the variant context based on both the - # acceptor and donor reads. def determine_largest_context(self, contextorigin, acceptor_context, donor_context): + """Determines the size of the variant context based on both the acceptor and donor reads. + + Parameters + ---------- + contextorigin : int + Variant position that the context is based on + acceptor_context : list of str and int + Essential acceptor context data + donor_context : list of str and int + Essential donor context data + + Returns + ------- + list of str and int + Context (chromosome, variant pos, start, end) + """ largest_context = [donor_context[0]] largest_context.append(contextorigin) # Determine and save the smallest context start. @@ -756,8 +818,8 @@ def write_vase_fastq(self, acceptor_infq, fastq_outpath, def build_donor_fq(self, donorbamreaddata, fr, fastq_outpath): # Write the new VaSe FastQ file. - vasefq_outname = self.set_fastq_out_path(fastq_outpath, fr, 1) - fqgz_outfile = io.BufferedWriter(open(vasefq_outname, "wb")) + # vasefq_outname = self.set_fastq_out_path(fastq_outpath, fr, 1) + fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb")) donorbamreaddata.sort(key=lambda dbr: dbr[0], reverse=False) for bamread in donorbamreaddata: @@ -784,16 +846,28 @@ def set_fastq_out_path(self, outpath, fr, lnum): return f"{outpath}_{datetime.now().date()}_L{lnum}_R2.fastq" # ===METHODS TO OBTAIN SOME DATA OF THE VASEBUILDER OBJECT================= - # Returns the identifier of the current VaSeBuilder object. def get_creation_id(self): + """Returns the identifier of the current VaSeBuilder object. + + Returns + ------- + self.creation_id : str + VaSeBuilder creation identifier + """ return self.creation_id # Returns the date the current VaSeBuilder object has been made. # def get_creation_date(self): # return self.creation_date - # Returns the time the current VaSeBuilder object has been made. def get_creation_time(self): + """Returns the date and time the current VaSeBuilder object has been made. + + Returns + ------- + str + VaSeBuilder creation date and time + """ return self.creation_time # Returns all variant contexts. @@ -930,8 +1004,16 @@ def write_optional_output_files(self, outpath, contextfile): f"{outpath}varcon_positions_donor.txt" ) - # Writes a BED file for the variant context data def write_bed_file(self, variantcontextdata, bedoutloc): + """Writes variant contexts as a BED file + + Parameters + ---------- + variantcontextdata : list of VariantContext + Variants contexts to write to BED file + bedoutloc : str + Path to write the BED file to + """ try: with open(bedoutloc, "w") as bedoutfile: for varcon in variantcontextdata: @@ -988,16 +1070,42 @@ def add_donor_fastq3(self, opened_outfile, donorfastqfile, donoridlist): finally: return donoridlist - # Builds a validation set using existing donor fastq files def build_validation_from_donor_fastqs(self, afq1_in, afq2_in, dfq1_in, dfq2_in, varconfileloc, outpath): + """Builds a validation set using existing donor fastq files. + + Parameters + ---------- + afq1_in : list of str + R1 acceptor fastq files to use as template + afq2_in : list of str + R2 acceptor fastq files to use as template + dfq1_in : list of str + R1 donor fsatq files to add + dfq2_in : list of str + R2 donor fastq files to add + varconfileloc : str + Path to existing variant context file to build validation set + outpath :str + """ varconfile = VariantContextFile(varconfileloc) skip_list = varconfile.get_all_variant_context_acceptor_read_ids() skip_list.sort() for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [dfq1_in, dfq2_in]): self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) - # Splits a list of donor fastq files over a number of acceptors def divide_donorfastqs_over_acceptors(self, list_of_donorfqs, num_of_acceptors): + """Divides a list of donor fastq files over a number of acceptor fastq files. + + Donor fastq files are divided equally into a specified number of groups. The number of groups is specified by + the number of acceptor fastq files to use as template. + + Parameters + ---------- + list_of_donorfqs: list of str + Paths to donor fastq files + num_of_acceptors: int + Number of template fastq files + """ cycler = 0 split_donors = [[] for x in range(num_of_acceptors)] copy_list_donorfqs = list(list_of_donorfqs) @@ -1057,8 +1165,26 @@ def run_a_mode(self, afq1_in, afq2_in, dfq1_in, dfq2_in, varconfileloc, outpath) for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [dfq1_in, dfq2_in]): self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) - # A-mode: Filters acceptors and adds already existing donor fastq files def run_ac_mode(self, afq1_in, afq2_in, dfqs, varconfile, outpath): + """Runs VaSeBuilder AC-mode. + + This run mode builds a set of validation fastq files by adding already existing fastq files to acceptor fastq + files used as template. The variant context file used to construct the variant contexts of the donor fastq files + is required to filter out the acceptor reads.to be replaced with the donor data. + + Parameters + ---------- + afq1_in : list of str + R1 acceptor fastq files to use as template + afq2_in : list of str + R2 acceptor fastq files to use as template + dfqs : list of list of str + R1 and R2 donor fastq files to add + varconfile: VariantContextFile + Variant context to use for filtering out acceptor reads + outpath: str + Path to folder to write the output to + """ self.vaselogger.info("Running VaSeBuilder A-mode") # Split the donor fastqs into an R1 and R2 group r1_dfqs = [dfq[0] for dfq in dfqs] @@ -1070,8 +1196,19 @@ def run_ac_mode(self, afq1_in, afq2_in, dfqs, varconfile, outpath): for i, afq_i, dfq_i in zip(["1", "2"], [afq1_in, afq2_in], [r1_dfqs, r2_dfqs]): self.build_fastqs_from_donors(afq_i, dfq_i, skip_list, i, outpath) - # D-mode: Create/Read variant contexts and output only donor fastq files def run_d_mode(self, variantcontextfile, fq_out): + """Runs VaSeBuilder D-mode. + + This run mode produces one R1 and R2 file with donor reads of all variant contexts. This mode does not produce + validation fastq files. + + Parameters + ---------- + variantcontextfile : VariantContextFile + Variants context to use + fq_out : str + Path and suffix to write donor fastq files to + """ self.vaselogger.info("Running VaSeBuilder D-mode") # Combine all donor reads from all variant contexts add_list = variantcontextfile.get_all_variant_context_donor_reads() @@ -1080,13 +1217,29 @@ def run_d_mode(self, variantcontextfile, fq_out): for i in ["1", "2"]: self.vaselogger.info(f"Start writing the R{i} FastQ files.") fq_starttime = time.time() - self.build_donor_fq(add_list, i, fq_out) + fqoutpath = self.set_fastq_out_path(fq_out, i, 1) + self.build_donor_fq(add_list, i, fqoutpath) self.vaselogger.info(f"Wrote all R{i} FastQ files.") self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.") self.vaselogger.info("Finished writing donor FastQ files.") - # F-mode: Produce complete validation fastq files def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out): + """Runs VaSeBuilder F-mode. + + This run mode creates a full set of validation fastq files from established variant contexts and a set of + acceptor fastq files to use as template to create the validation fastq files. + + Parameters + ---------- + variantcontextfile : VariantContextFile + Variant contexts to use + fq1_in : list of str + R1 fastq files to use as template + fq2_in : list of str + R2 fastq files to use as template + fq_out : list of str + Path and suffix to write validation fastq files to + """ self.vaselogger.info("Running VaSeBuilder F-mode") # Combine all donor reads from all variant contexts @@ -1103,21 +1256,66 @@ def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out): self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.") self.vaselogger.info("Finished writing FastQ files.") - # P-mode: Produce donor fastq files for each variant context - def run_p_mode(self, variantcontextfile, fq_out): + def run_p_mode(self, variantcontextfile, outpath, fq_out): + """Runs VaSeBuilder P-mode. + + This run mode produces an R1 and R2 fastq file with donor reads for each variant context. This mode does not + create or write validation fastq files. + + Parameters + ---------- + variantcontextfile : VariantContextFile + Established variant contexts + outpath : str + Path to folder to write output files to + fq_out : str + Path and suffix to write fastq out files to + """ + context_fq_links = {} + self.vaselogger.info("Running VaSeBuilder P-mode") self.vaselogger.info("Begin writing variant FastQ files.") variantcontexts = variantcontextfile.get_variant_contexts() for context in variantcontexts: add_list = context.get_donor_read_strings() self.vaselogger.debug(f"Writing variant FastQs for variant {context.context_id}.") - self.build_donor_fq(add_list, "1", fq_out + context.context_id) - self.build_donor_fq(add_list, "2", fq_out + context.context_id) + + r1_donorfq = self.set_fastq_out_path(fq_out + context.context_id, "1", 1) + r2_donorfq = self.set_fastq_out_path(fq_out + context.context_id, "1", 2) + self.build_donor_fq(add_list, "1", r1_donorfq) + self.build_donor_fq(add_list, "2", r2_donorfq) + context_fq_links[context.context_id] = [r1_donorfq, r2_donorfq] self.vaselogger.info("Finished writing variant FastQ files.") - # X-mode: Only create variant contexts. + # Write the P-mode link file (links context to fastq files for the current run) + self.write_pmode_linkfile(outpath, context_fq_links) + def run_x_mode(self, sampleidlist, donorvcfs, donorbams, acceptorbam, genomereference, outdir, varconout, variantlist): + """Runs VaSeBuilder X-mode. + + This run mode only produces the variant contexts and writes the associated output files. This mode does not + create or write validation fastq files. + + Parameters + ---------- + sampleidlist : list of str + Names/identifiers of samples to process + donorvcfs : dict + Donor variant files per sample + donorbams : dict + Donor alignment files per sample + acceptorbam : str + Path to alignment file to use as acceptor + genomereference : str + Path to genome reference fasta file + outdir : str + Path to write output files to + varconout : str + Path to folder to write the variant context file to + variantlist : dict + Variants to process per sample + """ self.vaselogger.info("Running VaSeBuilder X-mode") self.build_varcon_set(sampleidlist, donorvcfs, donorbams, acceptorbam, outdir, genomereference, varconout, variantlist) @@ -1245,15 +1443,51 @@ def bvcs_establish_context(self, sampleid, variantid, variantchrom, variantpos, # Sets the variant list for a sample if applicable, otherwise return None def bvcs_set_variant_filter(self, sampleid, variant_list): + """Sets the variant list for a sample if applicable, otherwise return None + + Parameters + ---------- + sampleid : str + Sample name/identifier + variant_list : dict + Variant tuples with chromosome name and genomic position per sample + + Returns + ------- + list of tuple + Variants tuples with chromosome name and position + """ sample_variant_filter = None if variant_list is not None: if sampleid in variant_list: sample_variant_filter = variant_list[sampleid] return sample_variant_filter - # Writes the output files def bvcs_write_output_files(self, outpath, varcon_outpath, variantcontextfile, vcfsamplemap, bamsamplemap, used_donor_vcfs, used_donor_bams): + """Writes VaSeBuilder output files. + + These output files are the standard output files when building a variant context file. Output files include + the variant context file, variant context statistics file, variant context BED file, used donor variant files + and used donor alignment files. + + Parameters + ---------- + outpath: str + Path to folder to write the output files to + varcon_outpath : str + Path to write variant context file to + variantcontextfile : VariantContextFile + Variant contexts to write to file + vcfsamplemap : dict + Variant files per sample + bamsamplemap : dict + Alignment files per sample + used_donor_vcfs : list of str + Donor variant files used to create the variant contexts + used_donor_bams : list of str + Donor alignment files used to create the variant contexts + """ # Write the variant context file itself self.vaselogger.info(f"Writing variant contexts to {varcon_outpath}") variantcontextfile.write_variant_context_file(varcon_outpath) @@ -1273,3 +1507,22 @@ def bvcs_write_output_files(self, outpath, varcon_outpath, variantcontextfile, v # Write a listfile of the used alignment (BAM/CRAM) files self.vaselogger.info(f"Writing the used donor alignment files per sample to {outpath}donor_alignment_files.txt") self.write_used_donor_files(f"{outpath}donor_alignment_files.txt", bamsamplemap, used_donor_bams) + + def write_pmode_linkfile(self, outpath, context_fqs_links): + """Writes the link from variant context identifier to fastq output files for P-mode. + + Parameters + ---------- + outpath : str + Path to output folder to write the link file to. + context_fqs_links : dict of str: list + Fastq files per variant context identifier + """ + plinkloc = f"{outpath}plink_{self.creation_id}.txt" + try: + with open(plinkloc, "w") as plinkfile: + plinkfile.write(f"# VBUUID: {self.creation_id}") + for contextid, fqfiles in context_fqs_links.items(): + plinkfile.write(f"{contextid}\t{fqfiles[0]}\t{fqfiles[1]}\n") + except IOError: + self.vaselogger.warning(f"Could not write P-mode link file {plinkloc} for run {self.creation_id}") diff --git a/VariantContext.py b/VariantContext.py index 29a87c3..0083918 100644 --- a/VariantContext.py +++ b/VariantContext.py @@ -31,7 +31,6 @@ class VariantContext: Identifiers of reads with an unmapped mate """ - # Sets the variant context data. def __init__(self, varconid, sampleid, varconchrom, varconorigin, varconstart, varconend, diff --git a/vase.py b/vase.py index 99789f7..7509bbf 100644 --- a/vase.py +++ b/vase.py @@ -110,9 +110,14 @@ def start_logger(self, paramcheck, logloc, debug_mode=False): vaselogger.addHandler(vase_file_handler) return vaselogger - # Returns the vase Parameters. def get_vase_parameters(self): - """Creates a command line argument parser and returns the parameter values.""" + """Creates a command line argument parser and returns the parameter values. + + Returns + ------- + vase_args : dict + Command line parameters and set values + """ # Set the VaSe parameters for the program. vase_argpars = argparse.ArgumentParser() vase_argpars.add_argument("-m", "--runmode", dest="runmode", default="F", choices=self.valid_runmodes, @@ -149,7 +154,6 @@ def get_vase_parameters(self): vase_args = vars(vase_argpars.parse_args()) return vase_args - # Reads the variant list. Assumes that sampleId, chromosome and startpos are columns 1,2 and 3 def read_variant_list(self, variantlistloc): """Reads a file containing genomic variants and returns them in a dictionary. @@ -180,12 +184,18 @@ def read_variant_list(self, variantlistloc): finally: return variant_filter_list - # Reads the config file with the settings def read_config_file(self, configfileloc): """Reads a VaSeBuilder configuration file and returns the parameter values. - :param configfileloc: - :return: + Parameters + ---------- + configfileloc : str + Path to the VaSeBuilder configuration file + + Returns + ------- + configdata : dict + Read parameters and values """ debug_param_vals = ["True", "1", "T"] configdata = {} @@ -214,6 +224,19 @@ def read_config_file(self, configfileloc): # Reads a list of donor fastq files in the format (R1.fq\tR2.fq) def read_donor_fastq_list_file(self, donorfq_listfileloc): + """Reads a file with a list of donor fastq files + + The donor fastq list file is expected to have two columns. The first column should contain the paths to R1 files + and the second column paths to R2 files. For each sample two fastq files are expected, + + Parameters + ---------- + donorfq_listfileloc : str + Path to the donor fastq list file + + Returns + ------- + """ donor_fastqs = [] try: with open(donorfq_listfileloc) as donorfqlistfile: @@ -267,7 +290,7 @@ def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): vaseb.run_f_mode(varconfile, paramcheck.get_first_fastq_in_location(), paramcheck.get_second_fastq_in_location(), paramcheck.get_fastq_out_location()) if "P" in runmode: - vaseb.run_p_mode(varconfile, paramcheck.get_fastq_out_location()) + vaseb.run_p_mode(varconfile, paramcheck.get_out_dir_location(), paramcheck.get_fastq_out_location()) # Run the program. From 251e247b0cc89ec48b8c6884246eb0c63ef4d657 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 11 Sep 2019 16:59:45 +0200 Subject: [PATCH 21/90] Add VaSeBuilder uuid to varcon.txt header --- VaSeBuilder.py | 2 +- VariantContextFile.py | 33 ++++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index eaabb0d..458df55 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -384,7 +384,7 @@ def build_varcon_set(self, sampleidlist, # Write the variant contexts to file. self.vaselogger.info("Writing variant contexts to " f"{varcon_outpath}") - self.contexts.write_variant_context_file(varcon_outpath) + self.contexts.write_variant_context_file(varcon_outpath, self.creation_id) # Write the variant context statistics file. self.vaselogger.info("Writing variant context statistics to " diff --git a/VariantContextFile.py b/VariantContextFile.py index ec3ab18..2f0a6bf 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -778,11 +778,23 @@ def get_median_variant_context_donor_reads(self): return statistics.median([varcon.get_number_of_donor_reads() for varcon in self.variant_contexts.values()]) # ===METHODS TO WRITE VARIANT CONTEXT DATA TO A FILE======================= - # Writes the variant context data to an output file. - def write_variant_context_file(self, outfileloc, samplefilter=None, + def write_variant_context_file(self, outfileloc, vbuuid, samplefilter=None, varconfilter=None, chromfilter=None): + """Writes the data saved in the variant contexts to an output file. + + outfileloc: + Path to write the variant context file to + vbuuid: str + VaSeBuilder unique identifier that build the VariantContextFile + samplefilter : list of str + Sample names/identifiers to include + varconfilter : list of str + Variant contexts to include + chromfilter : list of str + """ try: with open(outfileloc, "w") as varcon_outfile: + varcon_outfile.write(f"#VBUUID: {vbuuid}\n") varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" "Start\tEnd\tAcceptorContextLength\t" "DonorContextLength\tAcceptorReads\t" @@ -1017,7 +1029,6 @@ def write_reads_with_unmapped_mate(self, typetowrite, umfileloc): "reads with unmapped mates to " f"{umfileloc}") - # Writes the unmapped mate id of the acceptor context. def write_acceptor_unmapped_mates(self, umfileloc): """Writes the acceptor context read identifiers that have unmapped mates to an output file. @@ -1039,7 +1050,6 @@ def write_acceptor_unmapped_mates(self, umfileloc): "reads with unmapped mates to " f"{umfileloc}") - # Writes the unmapped mate id of the donor context. def write_donor_unmapped_mates(self, umfileloc): """Writes the donor context read identifiers that have unmapped mates to an output file. @@ -1061,8 +1071,21 @@ def write_donor_unmapped_mates(self, umfileloc): "reads with unmapped mates to " f"{umfileloc}") - # Compares the current VariantContextFile to another variant context file def compare(self, othervarconfile, contextfilter=None): + """Compares the current variant context file to another provided variant context file. + + Parameters + ---------- + othervarconfile : VariantContextFile + VariantContextFile to compare against + contextfilter : list of str or None + Contexts to compare (inclusive filter) + + Returns + ------- + varcondiffs : dict + Differences between the current and other VariantContextFile + """ varcondiffs = {} for contextid in self.variant_contexts: if self.passes_filter(contextid, contextfilter): From b42101ccff2e582b5de37bf1e675a647bbb7b461 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 13 Sep 2019 12:54:34 +0200 Subject: [PATCH 22/90] Add even more docstrings --- DonorBamRead.py | 22 +++- VaSeBuilder.py | 272 +++++++++++++++++++++++++++++++++++++++++++--- VariantContext.py | 18 ++- vase.py | 22 +++- 4 files changed, 307 insertions(+), 27 deletions(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index 1b2dd5b..bb6008b 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -4,7 +4,27 @@ class DonorBamRead: """The DonorBamRead is used to save data from an aligned read extracted from a BAM or CRAM file. - DonorBamRead can be used to save the data from aligned reads that were extracted from a BAM or CRAM file.""" + DonorBamRead can be used to save the data from aligned reads that were extracted from a BAM or CRAM file. + + Attributes + ---------- + bam_read_id : str + Read identiifer + bam_read_pairnum : str + Read pair number (1 or 2) + bam_read_chrom : str + Chromosome name the read is located on + bam_read_ref_pos : int + Leftmost genomic position of the read + bam_read_length : int + Read length + bam_read_seq : str + Read sequence + bam_read_qual : str + Read ASCII quality + bam_read_map_qual : int + Read MAPQ value + """ def __init__(self, readid, readpn, readchrom, readstart, readlen, readseq, readquals, mapqual): diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 458df55..f7e255a 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -21,12 +21,14 @@ class VaSeBuilder: Attributes ---------- - vb_scanner + vb_scanner : VcfBamScanner + Scans and extracts info from variant and alignment files creation_id : str Unique (uuid) identifier to identify VaSeBuilder runs creation_time : str The date and time of creation to identify VaSeBuilder runs - vaselogger + vaselogger : Logger + VaSeBuilder logger to log VaSeBuilder activity """ # Constructor that saves the identifier, date, and time of the current run. def __init__(self, vaseid): @@ -483,7 +485,6 @@ def build_validation_set(self, run_mode, acceptor_bam, Depending on mode, will either output combined acceptor/donor FastQs, or donor-only FastQs. """ - # Per-variant Fq output functionality. if "P" in run_mode: self.vaselogger.info(f"Begin writing variant FastQ files.") @@ -554,6 +555,20 @@ def passes_filter(self, val_to_check, filter_to_use, is_exclude_filter=False): # Returns a list of VcfVariant objects from a VCF file. A filter can be used to select specific variants. def get_sample_vcf_variants(self, vcf_fileloc, filterlist=None): + """Reads and returns read variants from a variant file. + + Parameters + ---------- + vcf_fileloc : str + Path to variant file to read + filterlist : list of tuple + Variants to include + + Returns + ------- + sample_variant_list : list of VcfVariant + Read variants fro the variant file + """ sample_variant_list = [] try: vcf_file = pysam.VariantFile(vcf_fileloc, "r") @@ -568,8 +583,24 @@ def get_sample_vcf_variants(self, vcf_fileloc, filterlist=None): finally: return sample_variant_list - # Returns whether a variant is a SNP or indel. def determine_variant_type(self, vcfvariantref, vcfvariantalts): + """Determines and returns the variant type. + + The type of variant is determined based on the lengths of the reference and alternative allele(s). Currently + only SNP or indel is returned aas variant type. + + Parameters + ---------- + vcfvariantref : str + Variant reference allele(s) + vcfvariantalts : tuple of str + Variant alternative allele(s) + + Returns + ------- + str + Variant type + """ # Determine the maximum reference allele length. maxreflength = max([len(x) for x in vcfvariantref.split(",")]) # Determine the maximum alternative allele length. @@ -583,8 +614,25 @@ def determine_variant_type(self, vcfvariantref, vcfvariantalts): return "indel" return "?" - # Determines the start and end positions to use for searching reads overlapping with the variant. def determine_read_search_window(self, varianttype, vcfvariant): + """Determines and returns the search window for fetching reads. + + The determined search window depends on the provided variant type. SNPs return a search window of SNP position + -1 and +1. Indels return a search window based indel lefmost position and length. If the variant type is + neither SNP or indel, the search window returned will be [-1, -1]. + + Parameters + ---------- + varianttype : str + Variant type to determine search window for + vcfvariant : VcfVariant + Variant to determine search window with + + Returns + ------- + list of int + Search window start and stop, -1 and -1 if variant type is invalid + """ if varianttype == "snp": return [vcfvariant.get_variant_pos() - 1, vcfvariant.get_variant_pos() + 1] elif varianttype == "indel": @@ -595,6 +643,22 @@ def determine_read_search_window(self, varianttype, vcfvariant): # Returns the search start and stop to use for searching BAM reads overlapping with the range of the indel. def determine_indel_read_range(self, variantpos, variantref, variantalts): + """Determines and returns the search start and stop to use for an indel. + + Parameters + ---------- + variantpos : int + Leftmost genomic position of the variant + variantref : str + Variant reference allele(s) + variantalts : tuple + Variant alternative allele(s) + + Returns + ------- + list of int + Search window start and stop + """ searchstart = variantpos searchstop = variantpos + max( max([len(x) for x in variantref.split(",")]), @@ -607,7 +671,25 @@ def get_variant_reads(self, contextid, variantchrom, variantstart, variantend, bamfile, write_unm=False, umatelist=None): - # Fetch + """Fetches and returns reads + + Parameters + ---------- + contextid : str + Identifier of the context to fetch reads for + variantchrom : str + Chromosome name to fetch reads from + variantstart : int + Leftmost genomic position to use for fetching + variantend : int + bamfile: + write_unm : bool + umatelist : list of str + + Returns + ------- + variantreads : list of DonorBamRead + """ variantreads = [] rpnext = {} for vread in bamfile.fetch(variantchrom, variantstart, variantend): @@ -621,8 +703,29 @@ def get_variant_reads(self, contextid, variantchrom, variantreads = self.uniqify_variant_reads(variantreads) return variantreads - # Fetches the read mates for a set of reads from an BAM/CRAM alignment file. def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umatelist=None): + """Fetches and returns read mates for a set of reads. + + Read mates are fetched from an already opened pysam alignment file using the RNEXT and PNEXT values as well as + the read identifier. + + Parameters + ---------- + rpnextmap : dict + bamfile : pysam.AlignmentFile + Already opened pysam alingment file to fetch read mates from + variantreadlist : list of DonorBamRead + Reads to fetch read mates for + write_unm : bool + Write identifiers of reads with unmapped mates + umatelist : list of str + Identifiers with reads that have an unmapped mate + + Returns + ------- + variantreadlist : list of DonorBamRead + Updated list of reads and the added read mates + """ for read1 in rpnextmap.values(): materead = self.fetch_mate_read(*read1, bamfile) if materead is not None: @@ -644,8 +747,28 @@ def uniqify_variant_reads(self, variantreads): checklist.append(id_pair) return unique_variantreads - # Returns the mate read using the fetch() method def fetch_mate_read(self, rnext, pnext, readid, bamfile): + """Fetches and returns the mate read of a specified read. + + The mate read is fetched from an opened alignment file using the RNEXT and PNEXT value of the read. Of the + fetched reads, only the read with the same identifier is returned. + + Parameters + ---------- + rnext : str + Chromosome name the mate read is located on + pnext : int + Leftmost genomic position of the mate read + readid : str + Identifier of the read to search the mate for + bamfile : pysam.AlignmentFile + Already openend pysam alignment file + + Returns + ------- + DonorBamRead or None + The mate read if found, None if not + """ for bamread in bamfile.fetch(rnext, pnext, pnext + 1): if bamread.query_name == readid and bamread.reference_start == pnext: return DonorBamRead(bamread.query_name, self.get_read_pair_num(bamread), bamread.reference_name, @@ -665,9 +788,27 @@ def filter_variant_reads(self, bamreads): def read_occurence(self, readid, readlist): return sum([bamread.get_bam_read_id() == readid for bamread in readlist]) - # Determines the start and stops of the variant context (please see - # the documentation for more information). def determine_context(self, contextreads, contextorigin, contextchr): + """Determines and returns an acceptor/donor context. + + The acceptor/donor context is determined from a set of reads overlapping with the variant including their read + mates. The leftmost and rightmost genomic positions of all reads are collected and filtered for outliers. + The context start (leftmost genomic) position is then determined by taking minimum and maximum leftmost and + rightmost position respectively. + + Parameters + ---------- + contextreads: list of DonorBamRead + contextorigin : int + Variant genomic position the context will be based on + contextchr : str + Chromosomae name the context is located on + + Returns + ------- + list of str and int + Essential context data (chromosome, variant pos, start, end) + """ # Check whether there are reads to determine the context for. if not contextreads: return [] @@ -708,9 +849,21 @@ def determine_context(self, contextreads, contextorigin, contextchr): + str(contextend)) return [contextchr, contextorigin, contextstart, contextend] - # Filters outliers from a list of start/stop positions using - # Tukey's Fences method. def filter_outliers(self, pos_list, k=3): + """Filters outliers from a list of start or stop positions and returns the filtered list. + + Outlier start/stop positions are filtered from the list using Tukey's Fences method. For more info please see + https://en.wikipedia.org/wiki/Outlier#Tukey's_fences + + Parameters + ---------- + pos_list : list of int + Start/stop positions + k : int + Factor to determine outlier + Returns + ------- + """ # First and third quartile values of the positions. q1 = np.percentile(pos_list, 25) q3 = np.percentile(pos_list, 75) @@ -833,14 +986,45 @@ def build_donor_fq(self, donorbamreaddata, fr, fastq_outpath): fqgz_outfile.flush() fqgz_outfile.close() - # Checks if a read is read 1 (R1) or read 2 (R2). def is_required_read(self, bamread, fr): + """Checks and returns whether the current read is the one that is required. + + When writing the validation fastq files only R1, or forward (F), reads should go to the R1 validation fastq + file. Similarly, the R2 reads should only go into the R2 validation fastq file. + + Parameters + ---------- + bamread : DonorBamRead + Read to check + fr : str + Whether the read is required for R1 or R2 + + Returns + ------- + bool + True if the read is required, False if not + """ if fr == "F": return bamread.is_read1() return bamread.is_read2() - # Returns the name for the fastq out file. def set_fastq_out_path(self, outpath, fr, lnum): + """ + + Parameters + ---------- + outpath : str + Path and suffix to write fastq file to + fr: str + Will the fastq file be R1 ('1') or R2 ('2') + lnum: int + Lane number (i.e.: 1, 2, 3 or 4) + + Returns + ------- + str + Full path to write fastq file to + """ if fr == "1": return f"{outpath}_{datetime.now().date()}_L{lnum}_R1.fastq" return f"{outpath}_{datetime.now().date()}_L{lnum}_R2.fastq" @@ -907,9 +1091,19 @@ def get_searchchrom_prefix(self, bamfile): return "" # ===METHODS TO WRITE OUTPUT FILES========================================= - # Writes the used donor vcf files to a file def write_used_donor_files(self, outfileloc, filesamplemap, used_donor_files): + """Writes the donor alignment or variant files used in constructing variant contexts to an output file. + + Parameters + ---------- + outfileloc : str + Path to write the output file to + filesamplemap : dict + Donor files per sample + used_donor_files : list + Donor alignment or variant files used to construct varian contexts + """ try: with open(outfileloc, "w") as outfile: outfile.write("#SampleId\tDonorFile\n") @@ -1024,14 +1218,36 @@ def write_bed_file(self, variantcontextdata, bedoutloc): # Checks that thet sequence names are the same def check_sequence_names(self, referencefile, alignmentfile): + """Checks and returns whether the chromosome names in the genome reference and alignment file are the same. + + Parameters + ---------- + referencefile: + alignmentfile: + + Returns + ------- + + """ reference_seqnames = self.get_reference_sequence_names(referencefile) alignment_seqnames = self.vb_scanner.get_alignment_sequence_names(alignmentfile) shared_seqnames = reference_seqnames & alignment_seqnames if len(shared_seqnames) < len(reference_seqnames) or len(shared_seqnames) < len(alignment_seqnames): self.vaselogger.warning("Reference file and alignment file do not contain the same sequence names") - # Returns the sequence names from the reference genome fasta file def get_reference_sequence_names(self, reference_fileloc): + """Returns the sequence names from the reference genome fasta file + + Parameters + ---------- + reference_fileloc: str + Path to genomic reference fasta file + + Returns + ------- + reference_seqnames : list of str + Extracted chromosome names + """ reference_seqnames = set() try: with open(reference_fileloc, "r") as reference_file: @@ -1044,8 +1260,28 @@ def get_reference_sequence_names(self, reference_fileloc): exit() return reference_seqnames - # Adds the donor fastq file while making sure each read is only added once def add_donor_fastq3(self, opened_outfile, donorfastqfile, donoridlist): + """Adds a donor fastq file to an already opened validation fastq file. + + Reads from a specified donor fastq file are added to an opened validation fastq file if the reads had not been + added to a validation fastq file. If already added (the read identifier is in the set of donor read ids), the + donor read is skipped. After adding a donor read is written to the validation fastq file, the read identifier is + added to the donor id list, ensuring every read is only added once. + + Parameters + ---------- + opened_outfile : File + Already opened validation fastq file to write data to + donorfastqfile : str + Path to donor fastq file to add + donoridlist : set of str + Set of donor read identifiers already added to the validation fastq files + + Returns + ------- + donoridlist : set of str + Updated set of donor read identifiers already added to validation set + """ try: with io.BufferedReader(gzip.open(donorfastqfile, "rb")) as donorfastq: for fileline in donorfastq: @@ -1521,7 +1757,7 @@ def write_pmode_linkfile(self, outpath, context_fqs_links): plinkloc = f"{outpath}plink_{self.creation_id}.txt" try: with open(plinkloc, "w") as plinkfile: - plinkfile.write(f"# VBUUID: {self.creation_id}") + plinkfile.write(f"#VBUUID: {self.creation_id}") for contextid, fqfiles in context_fqs_links.items(): plinkfile.write(f"{contextid}\t{fqfiles[0]}\t{fqfiles[1]}\n") except IOError: diff --git a/VariantContext.py b/VariantContext.py index 0083918..443c9a1 100644 --- a/VariantContext.py +++ b/VariantContext.py @@ -151,7 +151,6 @@ def get_acceptor_reads(self): """ return self.variant_context_areads - # Returns the variant context donor reads. def get_donor_reads(self): """Returns the variant context donor reads. @@ -182,7 +181,6 @@ def get_acceptor_context(self): """ return self.variant_acceptor_context - # Returns the donor context (the context from donor reads overlapping the variant itself). def get_donor_context(self): """Returns the donor context used to construct the variant context. @@ -193,12 +191,25 @@ def get_donor_context(self): """ return self.variant_donor_context - # Returns a list of acceptor reads overlapping with the variant context. def get_unmapped_acceptor_mate_ids(self): + """Returns the identifiers of variant context acceptor reads that have an unmapped mate. + + Returns + ------- + self.unmapped_acceptor_mate_ids : list of str + Variant context acceptor read identifiers with unmapped mates + """ return self.unmapped_acceptor_mate_ids # Returns a list of donor reads overlapping with the variant context. def get_unmapped_donor_mate_ids(self): + """Returns the identifiers of variant context donor reads with an unmapped mate. + + Returns + ------- + self.unmapped_donor_mate_ids : list of str + Variant contexct donor read identifiers with an unmapped mate + """ return self.unmapped_donor_mate_ids # ===METHODS TO GET DATA (REQUIRING SOME CALCULATING) OF THE VARIANT CONTEXT=============== @@ -1089,7 +1100,6 @@ def to_string(self): + str(list_areads) + "\t" + str(list_dreads)) - # Returns a varconstats.txt string representation of the variant context. def to_statistics_string(self): """Returns a String with basic variant context statistics. diff --git a/vase.py b/vase.py index 7509bbf..049ecce 100644 --- a/vase.py +++ b/vase.py @@ -16,13 +16,18 @@ class VaSe: - # Performs the check that VaSe is run with Python 3.x def __init__(self): """Checks both the python and pysam version. If the python version is not at least 3.6 or higher the program will not run as f-strings, that are used in this program were not available in older versions. The program also requires pysam 0.15 or higher as some pysma - functions used in VaseBuilder are only available since pysam 0.15.""" + functions used in VaseBuilder are only available since pysam 0.15. + + Attributes + ---------- + valid_runmodes : list of str + Valid VaSeBuilder run modes + """ assert (sys.version_info[0] >= 3 and sys.version_info[1] >= 6), "Please run this program in Python 3.6 or " \ "higher" assert (int(pysam.version.__version__.split(".")[0]) >= 0 and int(pysam.version.__version__.split(".")[1]) >= @@ -31,6 +36,8 @@ def __init__(self): # Runs the program. def main(self): + """Runs VaSeBuilder and performs all the work. + """ # Parse the command line parameters and check their validity. vase_arg_list = self.get_vase_parameters() pmc = ParamChecker() @@ -77,9 +84,16 @@ def main(self): ) self.vaselogger.info(f"Elapsed time: {elapsed}.") - # Method that creates the logger that will write the log to stdout and a log file. def start_logger(self, paramcheck, logloc, debug_mode=False): - """Starts and returns the logger VaSe_Logger""" + """Starts and returns the logger VaSe_Logger. + + The logger writes both to stdout and the specified logfile. + + Returns + ------- + vaselogger : Logger + Logging utility to log VaSeBuilder activity + """ vaselogger = logging.getLogger("VaSe_Logger") if debug_mode: vaselogger.setLevel(logging.DEBUG) From 5eb697e4d70a2c455c410131d2e421d7fb96fa69 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 13 Sep 2019 14:15:11 +0200 Subject: [PATCH 23/90] Add docstrings, add donor read id shuffling method ot VaSeBuilder.py --- VaSeBuilder.py | 69 ++++++++++++++++++++++++++++++++++---- VariantContextFile.py | 78 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 125 insertions(+), 22 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index f7e255a..fbe01e5 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -6,6 +6,7 @@ import numpy as np import pysam import time +import random # Import VaSe specific classes. from VcfBamScanner import VcfBamScanner @@ -666,12 +667,14 @@ def determine_indel_read_range(self, variantpos, variantref, variantalts): ) return [searchstart, searchstop] - # Returns the BAM reads containing the specific vcf variant as well as their read mate. def get_variant_reads(self, contextid, variantchrom, variantstart, variantend, bamfile, write_unm=False, umatelist=None): - """Fetches and returns reads + """Fetches and returns reads overlapping with a specified variant. + + First reads overlapping directly with the variant position are fetched. Then the read mates are fetched using + the RNEXT and PNEXT values of each read. Lastly, it is ensured that each read only occurs once. Parameters ---------- @@ -682,13 +685,18 @@ def get_variant_reads(self, contextid, variantchrom, variantstart : int Leftmost genomic position to use for fetching variantend : int - bamfile: + Rightmost genomic position to use for fetching + bamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile write_unm : bool + Save read identifiers with an unmapped mate umatelist : list of str + Identifiers of reads with an unmapped mate Returns ------- variantreads : list of DonorBamRead + Fetched reads and their read mates """ variantreads = [] rpnext = {} @@ -736,8 +744,19 @@ def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umat umatelist.append(read1[2]) return variantreadlist - # Uniqifies a list of variant reads to ensure each read only occurs once def uniqify_variant_reads(self, variantreads): + """Ensures each read only occurs once and returns the updates set. + + Parameters + ---------- + variantreads : list of DonorBamRead + Sets of reads to process + + Returns + ------- + unique_variantreads : list of DonorBamRead + Set of reads with each read occuring only once + """ unique_variantreads = [] checklist = [] for fetched in variantreads: @@ -783,9 +802,21 @@ def filter_variant_reads(self, bamreads): return [bread for bread in bamreads if (self.read_occurence(bread.get_bam_read_id(), bamreads) == 2)] - # Returns the number of occurences of a certain read in the list of - # BAM reads (should be two ideally). def read_occurence(self, readid, readlist): + """Checks and returns the occurence of a specified read. + + Parameters + ---------- + readid : str + Identifier of the read to check + readlist : list of DonorBamRead + Set of reads to check occurence of read in + + Returns + ------- + int + Occurences of the specified read + """ return sum([bamread.get_bam_read_id() == readid for bamread in readlist]) def determine_context(self, contextreads, contextorigin, contextchr): @@ -1762,3 +1793,29 @@ def write_pmode_linkfile(self, outpath, context_fqs_links): plinkfile.write(f"{contextid}\t{fqfiles[0]}\t{fqfiles[1]}\n") except IOError: self.vaselogger.warning(f"Could not write P-mode link file {plinkloc} for run {self.creation_id}") + + def shuffle_donor_reads(self, donorreads, s=2): + """Shuffles and returns a list of donor read identifiers. + + Prior to shuffling the donor read identifiers, the provided donor read list is copied to preserve the original. + The new list is then sorted and a seed is set to ensure that the shuffling can be reproduced if hte same data + is used. + + Parameters + ---------- + donorreads : + Donor read identifiers to shuffle + s: int + Seed to set for shuffling + + Returns + ------- + shuffled_donor_reads : list of str + Shuffled donor read identifiers + """ + shuffled_donor_reads = donorreads.copy() + shuffled_donor_reads.sort() + + random.seed(s) + random.shuffle(shuffled_donor_reads) + return shuffled_donor_reads diff --git a/VariantContextFile.py b/VariantContextFile.py index 2f0a6bf..389958a 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -61,7 +61,6 @@ def get_variant_contexts(self, asdict=False): return self.variant_contexts return [varcon for varcon in self.variant_contexts.values()] - # Returns the variant contexts by sample identifier def get_variant_contexts_by_sampleid(self): """Gathers variant contexts, sorts them by sample identifiers and returns them. @@ -74,7 +73,6 @@ def get_variant_contexts_by_sampleid(self): return {x.get_variant_context_sample(): [y for y in varcons if y.get_variant_context_sample() == x.get_variant_context_sample()] for x in varcons} - # Returns the number of contexts saved def get_number_of_contexts(self): """Determines and returns the number of variant contexts. @@ -85,7 +83,6 @@ def get_number_of_contexts(self): """ return len(self.variant_contexts) - # Returns a list of context ids def get_variant_context_ids(self): """Returns the variant context identifiers. @@ -96,25 +93,38 @@ def get_variant_context_ids(self): """ return [x for x in self.variant_contexts.keys()] - # Returns a specified variant context. def get_variant_context(self, contextid): - """Checks whether a variant context exists and returns it if so.""" + """Checks whether a variant context exists and returns it if so. + + Parameters + ---------- + contextid : str + Identifier of context to obtain + + Returns + ------- + VariantContext or None + VariantContext if it exists, None if not + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid] return None - # Returns whether a variant context is present def has_variant_context(self, contextid): """Checks and returns whether a specified variant context has a variant context. Parameters ---------- contextid : str - Context identifier + Context identifier to check + + Returns + ------- + bool + True if the variant context is present, False if not """ return contextid in self.variant_contexts - # Returns the acceptor context of the specified variant context. def get_acceptor_context(self, contextid): """Fetches and returns a specified acceptor context. @@ -126,13 +136,12 @@ def get_acceptor_context(self, contextid): Returns ------- OverlapContext or None - The acceptor context is exists, None if not + The acceptor context if it exists, None if not """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_acceptor_context() return None - # Returns the donor context of the specified variant context. def get_donor_context(self, contextid): """Fetches and returns a specified donor context. @@ -140,16 +149,16 @@ def get_donor_context(self, contextid): ---------- contextid : str Context identifier of donor context to return + Returns ------- OverlapContext or None - The donor context if exists, None if not + The donor context if it exists, None if not """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_donor_context() return None - # Returns all variant context acceptor reads. def get_all_variant_context_acceptor_reads(self): """Gathers and returns the acceptor reads of all variant contexts @@ -235,9 +244,25 @@ def get_donor_context_reads(self, contextid): return [] # ===BASIC VARIANTCONTEXTFILE METHODS====================================== - # Reads a provided variant context file and saves data according to set filters. def read_variant_context_file(self, fileloc, samplefilter=None, - IDfilter=None, chromfilter=None): + idfilter=None, chromfilter=None): + """Reads a provided variant context file and saves the data. + + Filter can be set for reading the variant context file. The sample filter can nbe used to specify which samples + to save. Samples not in the samplefilter will be skipped. Similarly filter for variant contexts and chromosome + names can be set. + + Parameters + ---------- + fileloc : str + Path to variant context file to read + samplefilter : list of str + Sample names/identifiers to include + idfilter : list of str + Variant contexts to include + chromfilter : list of str + Chromosome names to include + """ try: with open(fileloc, "r") as vcfile: varcon_records = vcfile.readlines() @@ -251,7 +276,7 @@ def read_variant_context_file(self, fileloc, samplefilter=None, if record[0] in self.variant_contexts: continue - IDpass = self.passes_filter(record[0], IDfilter) + IDpass = self.passes_filter(record[0], idfilter) samplepass = self.passes_filter(record[1], samplefilter) chrompass = self.passes_filter(record[2], chromfilter) if not (IDpass and samplepass and chrompass): @@ -262,8 +287,20 @@ def read_variant_context_file(self, fileloc, samplefilter=None, new_varcon = VariantContext(*record[:6], acceptor_reads, donor_reads) self.variant_contexts[record[0]] = new_varcon - # Reads an acceptor context file def read_acceptor_context_file(self, accconfileloc, samplefilter=None, contextfilter=None, chromfilter=None): + """Reads a provided acceptor context file and adds the acceptor contexts to already existing variant contexts. + + Parameters + ---------- + accconfileloc : str + Path to the acceptor contexts file + samplefilter : list of str + Sample names/identifiers to include + contextfilter : list of str + Acceptor contexts to include + chromfilter : list of str + Chromosome names to include + """ try: with open(accconfileloc, "r") as accconfile: next(accconfile) # Skip the header line @@ -288,6 +325,15 @@ def read_acceptor_context_file(self, accconfileloc, samplefilter=None, contextfi # Reads a donor context file def read_donor_context_file(self, donconfileloc, samplefilter=None, contextfilter=None, chromfilter=None): + """ + + Parameters + ---------- + donconfileloc : str + samplefilter : list of str + contextfilter : list of str + chromfilter : list of str + """ try: with open(donconfileloc, "r") as donconfile: next(donconfile) # Skip the header line From 1376d65125b4d3a56c6ac4251a1d2f8d08c0a7d7 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 13 Sep 2019 14:44:44 +0200 Subject: [PATCH 24/90] Small fix for creating P-mode link file --- VaSeBuilder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index fbe01e5..c457e1d 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1548,7 +1548,7 @@ def run_p_mode(self, variantcontextfile, outpath, fq_out): self.vaselogger.debug(f"Writing variant FastQs for variant {context.context_id}.") r1_donorfq = self.set_fastq_out_path(fq_out + context.context_id, "1", 1) - r2_donorfq = self.set_fastq_out_path(fq_out + context.context_id, "1", 2) + r2_donorfq = self.set_fastq_out_path(fq_out + context.context_id, "2", 1) self.build_donor_fq(add_list, "1", r1_donorfq) self.build_donor_fq(add_list, "2", r2_donorfq) context_fq_links[context.context_id] = [r1_donorfq, r2_donorfq] @@ -1788,7 +1788,7 @@ def write_pmode_linkfile(self, outpath, context_fqs_links): plinkloc = f"{outpath}plink_{self.creation_id}.txt" try: with open(plinkloc, "w") as plinkfile: - plinkfile.write(f"#VBUUID: {self.creation_id}") + plinkfile.write(f"#VBUUID: {self.creation_id}\n") for contextid, fqfiles in context_fqs_links.items(): plinkfile.write(f"{contextid}\t{fqfiles[0]}\t{fqfiles[1]}\n") except IOError: From ff10672dab9fb0d3ac72ae7e981297152b4ce0a3 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 16 Sep 2019 16:51:35 +0200 Subject: [PATCH 25/90] Add more docstrings to methods --- DonorBamRead.py | 30 +++- OverlapContext.py | 25 ++- ParamChecker.py | 19 +- VaSeBuilder.py | 34 +++- VariantContextFile.py | 406 +++++++++++++++++++++++++++++++++++------- 5 files changed, 440 insertions(+), 74 deletions(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index bb6008b..07db92e 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -9,7 +9,9 @@ class DonorBamRead: Attributes ---------- bam_read_id : str - Read identiifer + Read identifier + bam_read_flag : int + The read bitwise flag value bam_read_pairnum : str Read pair number (1 or 2) bam_read_chrom : str @@ -18,6 +20,14 @@ class DonorBamRead: Leftmost genomic position of the read bam_read_length : int Read length + bam_read_cigar : str + Read CIGAR string + bam_read_rnext : str + Chromosome name of the read mate + bam_read_pnext : int + Leftmost genomic position of the read mate + bam_read_tlen : int + Read TLEN value bam_read_seq : str Read sequence bam_read_qual : str @@ -50,10 +60,15 @@ def __init__(self, readid, readpn, readchrom, readstart, The MAPQ mapping quality """ self.bam_read_id = readid + self.bam_read_flag = "" self.bam_read_pairnum = readpn self.bam_read_chrom = readchrom self.bam_read_ref_pos = readstart self.bam_read_length = readlen + self.bam_read_cigar = "" + self.bam_read_rnext = "" + self.bam_read_pnext = "" + self.bam_read_tlen = 0 self.bam_read_seq = readseq self.bam_read_qual = readquals self.bam_read_map_qual = mapqual @@ -250,3 +265,16 @@ def get_as_fastq_seq(self, addpairnum=False): + str(self.bam_read_seq) + "\n" + "+\n" + str(self.bam_read_qual) + "\n") + + def get_as_bam_read(self): + """Returns the read as a SAM/BAM/CRAM file entry. + + Returns + ------- + bamcram_entry : str + Read as SAM/BAM/CRAM entry + """ + bamcram_entry = f"{self.bam_read_id}\t{self.bam_read_flag}\t{self.bam_read_chrom}\t{self.bam_read_ref_pos}" \ + f"\t{self.bam_read_map_qual}t{self.bam_read_cigar}\t{self.bam_read_rnext}\t{self.bam_read_pnext}" \ + f"\t{self.bam_read_tlen}\t{self.bam_read_seq}\t{self.bam_read_qual}" + return bamcram_entry diff --git a/OverlapContext.py b/OverlapContext.py index 284ce37..fe3f102 100644 --- a/OverlapContext.py +++ b/OverlapContext.py @@ -26,9 +26,28 @@ class OverlapContext: unmapped_read_mate_ids : list of str List of read identifiers that have unmapped mates """ - # Saves the data associated with the context overlapping with the variant. + def __init__(self, variantid, sampleid, ovconchrom, ovconorigin, ovconstart, ovconend, bamreads): + """Saves the required data for an acceptor or donor context. + + Parameters + ---------- + variantid : str + Variant context identifier + sampleid : str + Sample name/identifier + ovconchrom : str + Chromosome name the context is located on + ovconorigin : int + Variant genomic position the context is constructed from + ovconstart : int + Leftmost genomic position of the context + ovconend : int + Rightmost genomic position of the context + bamreads : list of DonorBamRead + Reads and read mates overlapping with the context + """ self.context_id = variantid self.sample_id = sampleid self.context_chrom = ovconchrom @@ -92,7 +111,6 @@ def get_context_origin(self): """ return self.context_origin - # Returns the context start position. def get_context_start(self): """Returns the left most genomic position of the context. @@ -103,7 +121,6 @@ def get_context_start(self): """ return self.context_start - # Returns the end position of the context. def get_context_end(self): """Returns the right most genomic position of the context. @@ -114,7 +131,6 @@ def get_context_end(self): """ return self.context_end - # Returns the bam reads associated with the context as a list of BamRead objects. def get_context_bam_reads(self): """Returns the list of reads and their mates overlapping with the context. @@ -122,7 +138,6 @@ def get_context_bam_reads(self): as a DonorBamRead object.""" return self.context_bam_reads - # Returns the list of BAM read ids that have an unmapped mate. def get_unmapped_read_mate_ids(self): """Returns a list with identifiers of reads that have an unmapped mate. diff --git a/ParamChecker.py b/ParamChecker.py index f8b86a7..f26e2de 100644 --- a/ParamChecker.py +++ b/ParamChecker.py @@ -8,7 +8,8 @@ class ParamChecker: Attributes ---------- - vaselogger + vaselogger : Logger + Logs VaSeBuilder activity vcf_filelist : str Path to the variant list file bam_filelist : str @@ -184,8 +185,20 @@ def check_folders_exist(self, paramvals, file_exts): existing_folders.append(foldername) return existing_folders - # Checks whether at least one file with a provided extension (.vcf or .bam) is present. def check_folder_contents(self, folder_to_check, file_exts): + """Counts and returns the number of files with a specified extension in a specified folder. + + Parameters + ---------- + folder_to_check: + Folder to check for files with + file_exts: + File extension to check files on + Returns + ------- + vb_count : int + Number of specified files in folder + """ vb_count = 0 for vbfile in os.listdir(folder_to_check): if vbfile.endswith(file_exts): @@ -194,7 +207,6 @@ def check_folder_contents(self, folder_to_check, file_exts): f"{file_exts[0][1:4].upper()} files") return vb_count - # Checks whether a provided file exists. def check_file_exists(self, fileloc): """Checks and returns whether one or more files exist. @@ -372,7 +384,6 @@ def check_required_runmode_parameters(self, runmode, vaseargvals): return self.check_parameters(filtered_vaseargvals) return False - # Returns the name of the folder name of a parameter value (if the parameter value is ). def get_folder_name(self, foldername): """Returns the name of the folder for a given path. diff --git a/VaSeBuilder.py b/VaSeBuilder.py index c457e1d..5858b84 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1145,8 +1145,20 @@ def write_used_donor_files(self, outfileloc, filesamplemap, self.vaselogger.critical("Could not write used donor files to " f"{outfileloc}") - # Writes optional debug level output files. def write_optional_output_files(self, outpath, contextfile): + """Writes optional output files to a specified output location. + + The optional output files contain data about acceptor and donor contexts as well as output files containing the + reads with unmapped mates per variant context. The optional output files are only produced when VaSeBuilder is + run with with debug set to True. + + Parameters + ---------- + outpath: str + Path to write the optional output file to + contextfile: VariantContextFile + Variant context data + """ # Write the optional acceptor context files; acceptor contexts, # read ids with unmapped mate and left/right positions. self.vaselogger.debug("Writing acceptor contexts to " @@ -1384,6 +1396,24 @@ def divide_donorfastqs_over_acceptors(self, list_of_donorfqs, num_of_acceptors): # BUILDS A SET OF R1/R2 VALIDATION FASTQS WITH ALREADY EXISTING DONOR FASTQS def build_fastqs_from_donors(self, acceptor_fqin, donor_fqin, acceptor_reads_toexclude, forward_reverse, outpath): + """Builds a set of R1/R2 validatioin fastq files using already existing donor fastq files. + + The provided acceptor fastq files are filtered and used as template for the valdiation fastq files. Already + existing fastq files are added. + + Parameters + ---------- + acceptor_fqin: str + Path to acceptor fastq file to use as template + donor_fqin: list of str + Paths to donor fastq files to add + acceptor_reads_toexclude: liust of str + Acceptor reads to exclude from validation set + forward_reverse: str + Write R1 or R2 + outpath: str + Path to write the validation fastq file to + """ donor_sets = self.divide_donorfastqs_over_acceptors(donor_fqin, len(acceptor_fqin)) donorids = set() @@ -1778,6 +1808,8 @@ def bvcs_write_output_files(self, outpath, varcon_outpath, variantcontextfile, v def write_pmode_linkfile(self, outpath, context_fqs_links): """Writes the link from variant context identifier to fastq output files for P-mode. + This output file is meant to link a VaSeBuilder run with fastq output files. + Parameters ---------- outpath : str diff --git a/VariantContextFile.py b/VariantContextFile.py index 389958a..d1eda0e 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -17,7 +17,7 @@ class VariantContextFile: Saves variant contexts by context identifier variant_context_statistics varcon_fields : dict - Fields + Number representation of each variant context data field """ def __init__(self, fileloc=None, samplefilter=None, @@ -188,6 +188,13 @@ def get_all_variant_context_acceptor_reads(self): # return uniqdonorreads # ============================================================================= def get_all_variant_context_donor_reads(self): + """Collects and returns all variant context donor reads. + + Returns + ------- + list of DonorBamRead + Variant context donor reads + """ dbrs = [] donorreads = [] for varcon in self.variant_contexts.values(): @@ -199,45 +206,108 @@ def get_all_variant_context_donor_reads(self): dbr.get_bam_read_qual())) return list(set(donorreads)) - # Returns all variant context acceptor read ids. def get_all_variant_context_acceptor_read_ids(self): + """Returns all variant context acceptor read identifiers. + + Returns + ------- + acceptorreadids : list of str + Variant context acceptor read identifiers + """ acceptorreadids = [] for varcon in self.variant_contexts.values(): acceptorreadids.extend(varcon.get_acceptor_read_ids()) return acceptorreadids - # Returns the variant context donor read ids. def get_all_variant_context_donor_read_ids(self): + """Returns all variant context donor read identifiers. + + Returns + ------- + donorreadids : list of str + Variant context donor read identifiers + """ donorreadids = [] for varcon in self.variant_contexts.values(): donorreadids.extend(varcon.get_donor_read_ids()) return donorreadids - # Returns the variant context field data def get_variant_context_fields(self): + """Returns the number representations of the variant context data. + + Returns + ------- + self.varcon_fields : dict + Number representation of each variant context field + """ return self.varcon_fields # ===METHODS TO OBTAIN VARIANT CONTEXT DATA============================= def get_variant_context_areads(self, contextid): + """Returns the variant context acceptor reads for a specified variant context. + + Parameters + ---------- + contextid : str + Variant context identifier + + Returns + ------- + list of DonorBamRead + Variant context acceptor reads, empty list of context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_acceptor_reads() return [] - # Returns the donor reads of a specified variant context def get_variant_context_dreads(self, contextid): + """Returns the variant context donor reads for a specified variant context. + + Parameters + ---------- + contextid : str + Variant context identifier + + Returns + ------- + list of DonorBamRead + Variant context donor reads, empty list if context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_donor_reads() return [] - # Returns the acceptor context reads def get_acceptor_context_reads(self, contextid): + """Returns the reads of a specified acceptor context. + + Parameters + ---------- + contextid : str + Acceptor context identifier + + Returns + ------- + list of DonorBamRead + Acceptor context reads + """ if contextid in self.variant_contexts: if self.variant_contexts[contextid].has_acceptor_context(): return self.variant_contexts[contextid].get_acceptor_context_reads() return [] - # Returns the reads of the donor context def get_donor_context_reads(self, contextid): + """Returns the reads of a specified donor context. + + Parameters + ---------- + contextid : str + Donor context identifier + + Returns + ------- + list of DonorBamRead + Donor context reads, empty list if context does not exist + """ if contextid in self.variant_contexts: if self.variant_contexts[contextid].has_donor_context(): return self.variant_contexts[contextid].get_donor_context_reads() @@ -325,14 +395,18 @@ def read_acceptor_context_file(self, accconfileloc, samplefilter=None, contextfi # Reads a donor context file def read_donor_context_file(self, donconfileloc, samplefilter=None, contextfilter=None, chromfilter=None): - """ + """Reads a provided donor contexts file and adds the donor contexts to already existing variant contexts. Parameters ---------- donconfileloc : str + Path to the donr contexts file samplefilter : list of str + Sample names/identifiers to include contextfilter : list of str + Donor contexts to include chromfilter : list of str + Chromosome names to include """ try: with open(donconfileloc, "r") as donconfile: @@ -356,7 +430,6 @@ def read_donor_context_file(self, donconfileloc, samplefilter=None, contextfilte except IOError: self.vaselogger.warning(f"Could not read donor context file: {donconfileloc}") - # Returns whether something is in the filter or not. def passes_filter(self, valtocheck, filterlist): """Tests and returns whether a provided value is in a provided filter. @@ -379,9 +452,8 @@ def passes_filter(self, valtocheck, filterlist): return True # ===VARIANT CONTEXT SET OPERATIONS (UNION, INTERSECT, DIFFERENCE)========= - # Returns the list of variant context identifiers from both variant context files. def get_variant_contexts_union(self, other_varcon_file): - """Returns all variant context identifiers in both VariantContextFiles. + """Returns all variant context identifiers from both variant context files. Parameters ---------- @@ -397,9 +469,8 @@ def get_variant_contexts_union(self, other_varcon_file): other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) | set(other_varcon_ids)) - # Returns the list of variant context identifiers present in both variant context files. def get_variant_contexts_intersect(self, other_varcon_file): - """ + """Returns variant context identifiers present in both variant context files. Parameters ---------- @@ -415,9 +486,8 @@ def get_variant_contexts_intersect(self, other_varcon_file): other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) & set(other_varcon_ids)) - # Returns the list of variant context identifiers in this file but not present in the other variant context file. def get_variant_contexts_difference(self, other_varcon_file): - """Determines and return the variant context identifiers not present in the other variant context file. + """Returns the variant context identifiers not present in the other variant context file. Parameters ---------- @@ -433,9 +503,8 @@ def get_variant_contexts_difference(self, other_varcon_file): other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) - set(other_varcon_ids)) - # Returns the list of variant context identifiers only present in one of the variant context file but not the other. def get_variant_contexts_symmetric_difference(self, other_varcon_file): - """Determines the variant context identifiers present in either one or the other. + """Determines the variant context identifiers present in either one or the other variant context file. THe normal difference only returns the context identifiers in A but not in B. The symmetric difference returns the context identifiers in A but not in B as well as context identifiers in B but not in A. @@ -453,11 +522,13 @@ def get_variant_contexts_symmetric_difference(self, other_varcon_file): other_varcon_ids = other_varcon_file.get_variant_context_ids() return list(set(own_varcon_ids) ^ set(other_varcon_ids)) - # ===METHODS TO OBTAIN VARIANT CONTEXT DATA BASED ON FILTERS=============== - # Returns a list/hashmap of VariantContextObjects. + # ===METHODS TO OBTAIN VARIANT CONTEXT DATA BASED ON FILTERS===============. def get_variant_contexts2(self, aslist=False, varconfilter=None, samplefilter=None, chromfilter=None): - """ + """Returns the variant contexts in the VariantContextFile. + + Inclusion filters can be set to subset the returned variant contexts. FIlters include sample names/identifiers, + variant context identifiers and chromosome names. Variant contexts can be returned as list or dictionary. Parameters ---------- @@ -471,7 +542,7 @@ def get_variant_contexts2(self, aslist=False, varconfilter=None, Returns ------- - list or dict + list or dict of VariantContext Returns variant contexts as list if aslist set to True, dict otherwise """ if aslist: @@ -494,10 +565,9 @@ def get_variant_contexts2(self, aslist=False, varconfilter=None, } # ====METHODS TO ASSESS WHETHER A VARIANT IS IN AN EXISTING CONTEXT======== - # Main method that returns whether a variant (SNP or indel). def variant_is_in_context(self, varianttype, searchchrom, searchstart, searchstop): - """Checks whether a variant is located in an already existing context. + """Returns whether a variant is located in an already existing context. If the variant type is set to 'snp' the method will check and return whether the SNP overlaps with a variant context. If the variant is set to 'indel' the method will check and return wheterh the area from start to end @@ -552,8 +622,6 @@ def snp_variant_is_in_context(self, varchrom, vcfvarpos): return True return False - # Determines whether an indel variant is located within an existing variant context (indelLeftPos and indelRightPos - # can be the used search window). def indel_variant_is_in_context(self, indelchrom, indelleftpos, indelrightpos): """Checks and returns whether an indel is located in an already existing variant context. @@ -587,7 +655,6 @@ def indel_variant_is_in_context(self, indelchrom, indelleftpos, indelrightpos): return True return False - # Checks whether a context is located in an already existing context def context_collision(self, context_arr): """Checks and returns whether a potential context overlaps with an already existing variant context. @@ -612,7 +679,6 @@ def context_collision(self, context_arr): return False # ===METHODS TO ADD DATA/VARIANT CONTEXTS TO THE VARIANT CONTEXT FILE====== - # Sets a variant context object to a specified context id. def set_variant_context(self, varconid, varcontext): """Sets a provided variant context to the provided context identifier. Will overwrite the previous value for the provided context identifier. @@ -626,13 +692,12 @@ def set_variant_context(self, varconid, varcontext): """ self.variant_contexts[varconid] = varcontext - # Adds a variant context object def add_variant_context(self, varconid, sampleid, varconchrom, varconorigin, varconstart, varconend, varcon_areads, varcon_dreads, acceptor_context=None, donor_context=None): - """ + """Constructs a VariantContext from the provided parameters and adds it. Parameters ---------- @@ -645,11 +710,13 @@ def add_variant_context(self, varconid, sampleid, varconorigin : int The variant position the context is based on varconstart : int - + Leftmost genomic position of the variant context varconend : int - varcon_areads : list of DonorBamReads - varcon_dreads : list of DonorBamReads - + Rightmost genomic position of the variant context + varcon_areads : list of DonorBamRead + Variant context acceptor reads + varcon_dreads : list of DonorBamRead + Variant contect donor reads acceptor_context : OverlapContext Acceptor context belonging to the variant context donor_context : OverlapContext @@ -662,7 +729,6 @@ def add_variant_context(self, varconid, sampleid, acceptor_context, donor_context) self.variant_contexts[varconid] = varcon_obj - # Adds an existing variant context to the variant context file def add_existing_variant_context(self, varconid, varconobj): """Add an already created variant context to the variant context file. @@ -679,7 +745,6 @@ def add_existing_variant_context(self, varconid, varconobj): if varconobj is not None: self.variant_contexts[varconid] = varconobj - # Adds an acceptor context object to a variant context. def set_acceptor_context(self, varconid, acceptor_context): """Adds an existing acceptor context to a variant context. @@ -693,12 +758,11 @@ def set_acceptor_context(self, varconid, acceptor_context): if varconid in self.variant_contexts: self.variant_contexts[varconid].set_acceptor_context(acceptor_context) - # Add a newly created acceptor context to an existing variant context. def add_acceptor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, acceptorreads): - """Creates and adds an acceptor context to a variant context. + """Constructs and adds an acceptor context to a specified variant context. Parameters ---------- @@ -724,7 +788,6 @@ def add_acceptor_context(self, contextid, sampleid, contextstart, contextend, acceptorreads) - # Sets the donor context object of a variant context def set_donor_context(self, varconid, donor_context): """Adds an already existing donor context to a specified variant context. @@ -738,11 +801,29 @@ def set_donor_context(self, varconid, donor_context): if varconid in self.variant_contexts: self.variant_contexts[varconid].set_donor_context(donor_context) - # Add a newly created donor context to an existing variant context. def add_donor_context(self, contextid, sampleid, contextchrom, contextorigin, contextstart, contextend, donorreads): + """Constructs a donor context from the provided parameters and adds it to a specified variant context. + + Parameters + ---------- + contextid : str + Variant context identifier + sampleid : str + Sample name/identifier + contextchrom : str + Chromosome name the context is located on + contextorigin : int + Variant position that constructed the context + contextstart : int + Donor context leftmost genomic position + contextend : int + Donor context rightmost genomic position + donorreads : list of DonorBamRead + Donor context reads + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].add_donor_context( contextid, sampleid, @@ -751,76 +832,197 @@ def add_donor_context(self, contextid, sampleid, donorreads) # ===METHODS TO ADD UNMAPPED MATE IDS TO ACCEPTOR, DONOR AND VARIANT CONTEXT======= - # Sets the unmapped mate read id for a specified acceptor context. def set_acceptor_context_unmapped_mate_ids(self, contextid, mateids): + """Sets the read identifiers that have unmapped read mates fo a specified acceptor context. + + Parameters + ---------- + contextid : str + Acceptor context to set unmapped read identifiers for + mateids : lit of str + Read identifiers with unmapped mates + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].set_acceptor_context_unmapped_mates(mateids) - # Sets the unmapped mate read id for a specified donor context. def set_donor_context_unmapped_mate_ids(self, contextid, mateids): + """Sets the read identifiers that have unmapped read mates for a specified donor context. + + Parameters + ---------- + contextid : str + Donor context to set unmapped read identifiers for + mateids : list of str + Donor context read identifiers with unmapped mates + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].set_donor_context_unmapped_mates(mateids) - # Sets the acceptor unmapped mate ids for a specified variant context. def set_unmapped_acceptor_mate_ids(self, contextid, mateids): + """Sets the acceptor read identifiers that have unmapped mates for a specified variant context. + + Parameters + ---------- + contextid : str + Variant contexct identifier to set unmapped acceptor read identifers for + mateids : list of str + Variant context acceptor read identifiers with unmapped mates + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].set_unmapped_acceptor_mate_ids(mateids) - # Sets the donor unmapped mate ids for a specified variant context. def set_unmapped_donor_mate_ids(self, contextid, mateids): + """Sets the donor read identifiers that have unmapped mates for a specified variant context. + + Parameters + ---------- + contextid : str + Variant context identifier to set unmapped donor read identifiers for + mateids: list of str + Variant context donor read identifiers with unmapped mates + """ if contextid in self.variant_contexts: self.variant_contexts[contextid].set_unmapped_donor_mate_ids(mateids) # ===METHODS TO GET UNMAPPED MATE IDS OF AN ACCEPTOR, DONOR AND VARIANT CONTEXT============ - # Return the unmapped read mate identifiers of a specified acceptor context. def get_acceptor_context_unmapped_mate_ids(self, contextid): + """Returns the acceptor context read identifiers that have unmapped read mates. + + Parameters + ---------- + contextid : str + Acceptor context identifier + + Returns + ------- + list of str + Acceptor context read identifiers, empty list if context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_acceptor_context_unmapped_mate_ids() return [] - # Returns the unmapped read mate identifiers of a specified donor context. def get_donor_context_unmapped_mate_ids(self, contextid): + """Returns the donor context read identifiers that have unmapped read mates. + + Parameters + ---------- + contextid : str + Donor context identifier + + Returns + ------- + list of str + Donor contexts read identifiers, empty list if context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_donor_context_unmapped_mate_ids() return [] - # Returns the unmapped acceptor read mate identifiers of a specified variant context. def get_unmapped_acceptor_mate_ids(self, contextid): + """Returns the variant context acceptor read identifiers that have unmapped read mates. + + Parameters + ---------- + contextid : str + Variant context identifier + + Returns + ------- + list of str + Variant context acceptor read identiifers, empty list if context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_unmapped_acceptor_read_ids() return [] - # Returns the unmapped donor read mate identifiers of a specified variant context. def get_unmapped_donor_mate_ids(self, contextid): + """Returns the variant context donor read identifiers that have unmapped read mates. + + Parameters + ---------- + contextid : str + + Returns + ------- + list of str + Variant context donor read identifiers, empty list if context does not exist + """ if contextid in self.variant_contexts: return self.variant_contexts[contextid].get_unmapped_donor_read_ids() return [] # ===METHODS TO OBTAIN SOME STATISTICS ABOUT ALL THE CONTEXTS============== - # Returns the average variant context length within this variant context file. def get_average_variant_context_length(self): + """Calculates and returns the mean variant context length. + + The mean length is calculated over all variant contexts in the variant context file. Acceptor and donor contexts + associated with the variant contexts are not used. + + Returns + ------- + float + Average variant context length + """ return statistics.mean([varcon.get_variant_context_length() for varcon in self.variant_contexts.values()]) - # Returns the median variant context length within this variant context file. def get_median_variant_context_length(self): + """Calculates and returns the median variant context length. + + The median length is calculated over all variant contexts in the variant context file. Acceptor and donor + contexts associated with the variant contexts are not used. + + Returns + ------- + float + Median variant context length + """ return statistics.median([varcon.get_variant_context_length() for varcon in self.variant_contexts.values()]) - # Returns the average number of variant context acceptor reads def get_average_variant_context_acceptor_reads(self): + """Calculates and returns the average number of variant context acceptor reads. + + The mean number of acceptor reads is calculated over all variant contexts. + + Returns + ------- + float + Mean number of variant context acceptor reads + """ return statistics.mean([varcon.get_number_of_acceptor_reads() for varcon in self.variant_contexts.values()]) - # Returns the average number of variant context donor reads def get_average_variant_context_donor_reads(self): + """Calculates and returns the average number of variant context donor reads. + + The mean number of donor reads is calculated over all variant contexts. + + Returns + ------- + float + Mean number of variant context donor reads + """ return statistics.mean([varcon.get_number_of_donor_reads() for varcon in self.variant_contexts.values()]) - # Returns the median number of variant context acceptor reads def get_median_variant_context_acceptor_reads(self): + """Calculates and returns the median variant context acceptor reads. + + Returns + ------- + float + Median number of variant context acceptor reads + """ return statistics.median([varcon.get_number_of_acceptor_reads() for varcon in self.variant_contexts.values()]) - # Returns the median number of variant context donor reads def get_median_variant_context_donor_reads(self): + """Calculates and returns the median number of variant context donor reads. + + Returns + ------- + float + Median number of variant context donor reads + """ return statistics.median([varcon.get_number_of_donor_reads() for varcon in self.variant_contexts.values()]) # ===METHODS TO WRITE VARIANT CONTEXT DATA TO A FILE======================= @@ -828,15 +1030,18 @@ def write_variant_context_file(self, outfileloc, vbuuid, samplefilter=None, varconfilter=None, chromfilter=None): """Writes the data saved in the variant contexts to an output file. - outfileloc: + Parameters + ---------- + outfileloc : str Path to write the variant context file to - vbuuid: str + vbuuid : str VaSeBuilder unique identifier that build the VariantContextFile samplefilter : list of str Sample names/identifiers to include varconfilter : list of str Variant contexts to include chromfilter : list of str + Chromosome names to include """ try: with open(outfileloc, "w") as varcon_outfile: @@ -865,9 +1070,21 @@ def write_variant_context_file(self, outfileloc, vbuuid, samplefilter=None, self.vaselogger.warning("Could not write variant contexts to " f"{ioe.filename}") - # Writes the donor contexts used to construct the variant contexts. def write_acceptor_context_file(self, outfileloc, samplefilter=None, contextfilter=None, chromfilter=None): + """Writes the acceptor contexts associated with variants contexts to an output file. + + Parameters + ---------- + outfileloc: str + Path and name to write acceptor contexts to + samplefilter: list of str + Sample names/identifiers to include + contextfilter: list of str + Acceptor contexts to include + chromfilter: list of str + Chromosome names to include + """ try: with open(outfileloc, "w") as varcon_oufile: varcon_oufile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" @@ -893,9 +1110,21 @@ def write_acceptor_context_file(self, outfileloc, samplefilter=None, self.vaselogger.warning("Could not write acceptor contexts to " f"{ioe.filename}") - # Writes the acceptor cotnexts used to construct the variant contexts. def write_donor_context_file(self, outfileloc, samplefilter=None, contextfilter=None, chromfilter=None): + """Writes the donor contexts associated with variant context to an output file. + + Parameters + ---------- + outfileloc : str + Path and name to write the donor contexts to + samplefilter : list of str + Sample names/identifiers to include + contextfilter : list of str + Donor contexts to include + chromfilter : list of str + Chromosome names to include + """ try: with open(outfileloc, "w") as varcon_outfile: varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" @@ -921,8 +1150,16 @@ def write_donor_context_file(self, outfileloc, samplefilter=None, self.vaselogger.warning("Could not write donor contexts to " f"{ioe.filename}") - # Writes some statistics about the acceptor and donor reads identified for each variant context. def write_variant_context_stats(self, statsoutloc): + """Writes basic statistics of the variant contexts to a specified output file. + + Basic statistics include means and medians of read lengths, Q-Score and mapping quality. + + Parameters + ---------- + statsoutloc : str + Path and name to write variant context statistics output file to + """ try: with open(statsoutloc, "w") as varcon_statsfile: varcon_statsfile.write("#ContextId\tAvg_ALen\tAvg_DLen\t" @@ -936,8 +1173,16 @@ def write_variant_context_stats(self, statsoutloc): self.vaselogger.critical("Could not write variant context " f"statistics to {statsoutloc}") - # Writes some statistics about the acceptor and donor reads identified for each variant context. def write_acceptor_context_stats(self, statsoutloc): + """Writes basic statistics of the acceptor contexts to a specified output file. + + Basic statistics include means and medians of read lengths, Q-Score and mapping quality. + + Parameters + ---------- + statsoutloc: str + Path and name to write the acceptor context statistics file to + """ try: with open(statsoutloc, "w") as varcon_statsfile: varcon_statsfile.write("#ContextId\tAvg_ReadLen\tMed_ReadLen\t" @@ -954,6 +1199,15 @@ def write_acceptor_context_stats(self, statsoutloc): # Writes some statistics about the acceptor and donor reads identified for each variant context. def write_donor_context_stats(self, statsoutloc): + """Writes basic statistics of the donor contexts to a specified output file. + + Basic statistics include means and medians of read lengths, Q-Score and mapping quality. + + Parameters + ---------- + statsoutloc : str + Path and name to write the donor context statistics to + """ try: with open(statsoutloc, "w") as varcon_statsfile: varcon_statsfile.write("#ContextId\tAvg_ReadLen\tMed_ReadLen\t" @@ -968,8 +1222,18 @@ def write_donor_context_stats(self, statsoutloc): self.vaselogger.critical("Coud not write donor context statistics " f"to {statsoutloc}") - # Writes the left and right positions to the output file. Left pos for R1 and right pos for R2. def write_left_right_positions(self, typetowrite, outfileloc): + """Writes variant context leftmost and rightmost genomic read positions to an output file. + + The leftmost genomic positions of R1 reads and rightmost genomic positions for R2 reads are written to file. + + Parameters + ---------- + typetowrite : str + Type (acceptor/donor) of file to write + outfileloc : str + Path and name to write the output file to + """ try: with open(outfileloc, "w") as lrpof: lrpof.write("#ContextId\tLeftPos\tRightPos\n") @@ -1000,8 +1264,16 @@ def write_left_right_positions(self, typetowrite, outfileloc): self.vaselogger.warning("Could not write read left positions to " f"output file {outfileloc}") - # Writes the acceptor context left and right positions to the output file. Left pos for R1 and right pos for R2. def write_acceptor_left_right_positions(self, outfileloc): + """Writes acceptor context leftmost and rightmost genomic read positions to an output file. + + The leftmost genomic position of R1 reads and rightmost genomic positions for R2 reads are written to file. + + Parameters + ---------- + outfileloc : str + Path and name to write the output file to + """ try: with open(outfileloc, "w") as lrpof: lrpof.write("#ContextId\tLeftPos\tRightPos\n") @@ -1021,8 +1293,16 @@ def write_acceptor_left_right_positions(self, outfileloc): self.vaselogger.warning("Could not write read left positions to " f"output file {outfileloc}") - # Writes the left and right positions to the output file. Left pos for R1 and right pos for R2. def write_donor_left_right_positions(self, outfileloc): + """Writes donor context leftmost and rightmost genomic read positions to an output file. + + The leftmost genomic position of R1 reads and rightmost genomic positions for R2 reads are written to file. + + Parameters + ---------- + outfileloc : str + Path and name to write the output file to + """ try: with open(outfileloc, "w") as lrpof: lrpof.write("#ContextId\tLeftPos\tRightPos\n") @@ -1038,7 +1318,7 @@ def write_donor_left_right_positions(self, outfileloc): lrpof.write(str(varcon.get_variant_context_id()) + "\t" + ",".join(leftpositions) + "\t" + ",".join(rightpositions) + "\n") - except IOError as ioe: + except IOError: self.vaselogger.warning("Could not write read left positions to " f"output file {outfileloc}") From 0d43cc72d280bd0d5ac935461ae097490bfd3385 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 17 Sep 2019 09:39:32 +0200 Subject: [PATCH 26/90] Add getter methods for new attributes --- DonorBamRead.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/DonorBamRead.py b/DonorBamRead.py index 07db92e..32b9937 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -84,6 +84,15 @@ def get_bam_read_id(self): """ return self.bam_read_id + def get_bam_read_flag(self): + """Returns the read alignment flag. + + Returns + ------- + self.bam_read_flag : str + Read alignment flag + """ + def get_bam_read_pair_number(self): """Returns the pair number of the read. This is 1 for forward, or R1, reads and 2 for reverse, or R2, reads. @@ -138,6 +147,45 @@ def get_bam_read_ref_end(self): return self.bam_read_ref_pos + self.bam_read_length return -1 + def get_bam_read_cigar(self): + """Returns the read alignment CIGAR string + + Returns + ------- + self.bam_read_cigar : str + Read alignment CIGAR string + """ + return self.bam_read_cigar + + def get_bam_read_rnext(self): + """Returns the chromosome name the read mate is located on. + + Returns + ------- + self.bam_read_rnext : str + Chromosome name the read mate is located on + """ + return self.bam_read_rnext + + def get_bam_read_pnext(self): + """Returns the leftmost genomic position of the read mate. + + Returns + ------- + self.bam_read_pnext : ibnt + Leftmost genomic position of the read mate + """ + return self.bam_read_pnext + + def get_bam_read_tlen(self): + """Returns the read alignment TLEN value. + + Returns + ------- + self.bam_read_tlen : int + Read alignment TLEN value + """ + def get_bam_read_sequence(self): """Returns the read sequence as a String. From 80925b2f18b350ed059604a25bb15dbbad48f47b Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 17 Sep 2019 16:11:59 +0200 Subject: [PATCH 27/90] Lets DonorBamRead save all required data to create a BAM read representation --- DonorBamRead.py | 26 ++++++++++++++++++-------- VaSeBuilder.py | 45 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 57 insertions(+), 14 deletions(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index 32b9937..72039c9 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -36,14 +36,16 @@ class DonorBamRead: Read MAPQ value """ - def __init__(self, readid, readpn, readchrom, readstart, - readlen, readseq, readquals, mapqual): + def __init__(self, readid, readflag, readpn, readchrom, readstart, readlen, readcigar, readrnext, readpnext, + readtlen, readseq, readquals, readmapq): """Saves all required data of an aligned read from a BAM or CRAM file. Parameters ---------- readid : str The read identifier + readflag : str + Aligned read bitwise flag readpn : str The read pair number readchrom : str @@ -52,6 +54,14 @@ def __init__(self, readid, readpn, readchrom, readstart, The leftmost genomic position of the mapped read readlen : int The total length of the read + readcigar : str + Aligned read CIGAR string + readrnext : str + Chromsome name the read mate is located on + readpnext : int + Leftmost genomic position of the read mate + readtlen : int + Aligned read TLEN value readseq : str The read sequence reqdquals : str @@ -60,18 +70,18 @@ def __init__(self, readid, readpn, readchrom, readstart, The MAPQ mapping quality """ self.bam_read_id = readid - self.bam_read_flag = "" + self.bam_read_flag = readflag self.bam_read_pairnum = readpn self.bam_read_chrom = readchrom self.bam_read_ref_pos = readstart self.bam_read_length = readlen - self.bam_read_cigar = "" - self.bam_read_rnext = "" - self.bam_read_pnext = "" - self.bam_read_tlen = 0 + self.bam_read_cigar = readcigar + self.bam_read_rnext = readrnext + self.bam_read_pnext = readpnext + self.bam_read_tlen = readtlen self.bam_read_seq = readseq self.bam_read_qual = readquals - self.bam_read_map_qual = mapqual + self.bam_read_map_qual = readmapq # ===METHODS TO GET SAVED DATA FROM THE DONORBAMREAD======================= def get_bam_read_id(self): diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 5858b84..2aba49c 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -698,15 +698,20 @@ def get_variant_reads(self, contextid, variantchrom, variantreads : list of DonorBamRead Fetched reads and their read mates """ + hardclipped_read_num = 0 variantreads = [] rpnext = {} for vread in bamfile.fetch(variantchrom, variantstart, variantend): - variantreads.append(DonorBamRead(vread.query_name, self.get_read_pair_num(vread), vread.reference_name, - vread.reference_start, vread.infer_read_length(), - vread.get_forward_sequence(), + variantreads.append(DonorBamRead(vread.query_name, vread.flag, self.get_read_pair_num(vread), vread.reference_name, + vread.reference_start, vread.infer_read_length(), vread.cigarstring, + vread.next_reference_name, vread.next_reference_start, + vread.template_length, vread.get_forward_sequence(), "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), vread.mapping_quality)) rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] + + if self.read_is_hard_clipped(vread): + hardclipped_read_num += 1 variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) variantreads = self.uniqify_variant_reads(variantreads) return variantreads @@ -790,9 +795,10 @@ def fetch_mate_read(self, rnext, pnext, readid, bamfile): """ for bamread in bamfile.fetch(rnext, pnext, pnext + 1): if bamread.query_name == readid and bamread.reference_start == pnext: - return DonorBamRead(bamread.query_name, self.get_read_pair_num(bamread), bamread.reference_name, - bamread.reference_start, bamread.infer_read_length(), - bamread.get_forward_sequence(), + return DonorBamRead(bamread.query_name, bamread.flag, self.get_read_pair_num(bamread), + bamread.reference_name, bamread.reference_start, bamread.infer_read_length(), + bamread.cigarstring, bamread.next_reference_name, bamread.next_reference_start, + bamread.template_length, bamread.get_forward_sequence(), "".join([chr((x + 33)) for x in bamread.get_forward_qualities()]), bamread.mapping_quality) return None @@ -1851,3 +1857,30 @@ def shuffle_donor_reads(self, donorreads, s=2): random.seed(s) random.shuffle(shuffled_donor_reads) return shuffled_donor_reads + + def write_donor_output_bam(self, bamoutpath, donorreads): + """Writes a set of donor reads as a bam file. + + Parameters + ---------- + bamoutpath : str + Path and name to write the output to + donorreads : list of DonorBamRead + Donor reads to place in the output BAM file + """ + print("Constructing BAM file") + + def read_is_hard_clipped(self, fetchedread): + """Returns whether the provided read is hard-clipped + + Parameters + ---------- + fetchedread : pysam.AlignedSegment + Read fetched from alignment file with pysam + + Returns + ------- + bool + True if read is hardclipped, False if not + """ + return "H" in fetchedread.cigarstring From 9640b7fd3677d94017a89d06e68a95c4cee832e2 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 17 Sep 2019 16:21:34 +0200 Subject: [PATCH 28/90] Add reference end to DonorBamRead --- DonorBamRead.py | 15 ++++++++------- VaSeBuilder.py | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index 72039c9..9dcf5eb 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -20,6 +20,8 @@ class DonorBamRead: Leftmost genomic position of the read bam_read_length : int Read length + bam_read_end_pos : int + Read rightmost position (determined by alignment) bam_read_cigar : str Read CIGAR string bam_read_rnext : str @@ -36,8 +38,8 @@ class DonorBamRead: Read MAPQ value """ - def __init__(self, readid, readflag, readpn, readchrom, readstart, readlen, readcigar, readrnext, readpnext, - readtlen, readseq, readquals, readmapq): + def __init__(self, readid, readflag, readpn, readchrom, readstart, readlen, readend, readcigar, readrnext, + readpnext, readtlen, readseq, readquals, readmapq): """Saves all required data of an aligned read from a BAM or CRAM file. Parameters @@ -75,6 +77,7 @@ def __init__(self, readid, readflag, readpn, readchrom, readstart, readlen, read self.bam_read_chrom = readchrom self.bam_read_ref_pos = readstart self.bam_read_length = readlen + self.bam_read_end_pos = 0 self.bam_read_cigar = readcigar self.bam_read_rnext = readrnext self.bam_read_pnext = readpnext @@ -146,16 +149,14 @@ def get_bam_read_length(self): return self.bam_read_length def get_bam_read_ref_end(self): - """Calculates and returns the rightmost genomic position of the read via leftmost position + read length. + """Returns the rightmost genomic position of the read. Returns ------- - bam_read_length : int + self.bam_read_end_pos : int The rightmost position of the read """ - if self.bam_read_length is not None: - return self.bam_read_ref_pos + self.bam_read_length - return -1 + return self.bam_read_end_pos def get_bam_read_cigar(self): """Returns the read alignment CIGAR string diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 2aba49c..2e805f0 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -705,7 +705,7 @@ def get_variant_reads(self, contextid, variantchrom, variantreads.append(DonorBamRead(vread.query_name, vread.flag, self.get_read_pair_num(vread), vread.reference_name, vread.reference_start, vread.infer_read_length(), vread.cigarstring, vread.next_reference_name, vread.next_reference_start, - vread.template_length, vread.get_forward_sequence(), + vread.template_length, vread.reference_end, vread.get_forward_sequence(), "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), vread.mapping_quality)) rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] @@ -798,7 +798,7 @@ def fetch_mate_read(self, rnext, pnext, readid, bamfile): return DonorBamRead(bamread.query_name, bamread.flag, self.get_read_pair_num(bamread), bamread.reference_name, bamread.reference_start, bamread.infer_read_length(), bamread.cigarstring, bamread.next_reference_name, bamread.next_reference_start, - bamread.template_length, bamread.get_forward_sequence(), + bamread.template_length, bamread.reference_end, bamread.get_forward_sequence(), "".join([chr((x + 33)) for x in bamread.get_forward_qualities()]), bamread.mapping_quality) return None From 07e65e0395996e99d7831b7439f569062ebb43d4 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 17 Sep 2019 16:44:07 +0200 Subject: [PATCH 29/90] Dd counting of reads with hardclipped bases when fetching reads and mates --- DonorBamRead.py | 12 ++++++++++++ VaSeBuilder.py | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/DonorBamRead.py b/DonorBamRead.py index 9dcf5eb..2784447 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -241,6 +241,18 @@ def get_mapping_qual(self): """ return self.bam_read_map_qual + def read_has_hardclipped_bases(self): + """Returns whether the read has hardclipped bases. + + The CIGAR string of the read is checked whether it contains an 'H' (Hardclipped bases). + + Returns + ------- + bool + True if read CIGAR contains 'H', False if not + """ + return "H" in self.bam_read_cigar + # ===METHOD TO GET STATISTICS DATA FROM THE DONORBAMREAD=================== def get_average_qscore(self): """Calculates and returns the mean Q-Score of the read. diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 2e805f0..fc5a064 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -712,6 +712,7 @@ def get_variant_reads(self, contextid, variantchrom, if self.read_is_hard_clipped(vread): hardclipped_read_num += 1 + self.vaselogger.debug(f"Fetched {hardclipped_read_num} reads with hardclipped bases") variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) variantreads = self.uniqify_variant_reads(variantreads) return variantreads @@ -739,14 +740,18 @@ def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umat variantreadlist : list of DonorBamRead Updated list of reads and the added read mates """ + hardclipped_read_num = 0 for read1 in rpnextmap.values(): materead = self.fetch_mate_read(*read1, bamfile) if materead is not None: + if materead.read_has_hardclipped_bases(): + hardclipped_read_num += 1 variantreadlist.append(materead) else: if write_unm: self.vaselogger.debug(f"Could not find mate for read {read1[2]} ; mate is likely unmapped.") umatelist.append(read1[2]) + self.vaselogger.debug(f"Fetched {hardclipped_read_num} read mates with hardclipped bases") return variantreadlist def uniqify_variant_reads(self, variantreads): From 0c4808511d886ee4e74c0f2de2d5829fc2977c19 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 17 Sep 2019 16:56:34 +0200 Subject: [PATCH 30/90] Add counts for duplicate and secondary reads when fetching first time --- VaSeBuilder.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index fc5a064..7a45e18 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -699,6 +699,9 @@ def get_variant_reads(self, contextid, variantchrom, Fetched reads and their read mates """ hardclipped_read_num = 0 + duplicate_read_num = 0 + secondary_read_num = 0 + variantreads = [] rpnext = {} for vread in bamfile.fetch(variantchrom, variantstart, variantend): @@ -712,6 +715,10 @@ def get_variant_reads(self, contextid, variantchrom, if self.read_is_hard_clipped(vread): hardclipped_read_num += 1 + if vread.is_duplicate: + duplicate_read_num += 1 + if vread.is_secondary: + secondary_read_num += 1 self.vaselogger.debug(f"Fetched {hardclipped_read_num} reads with hardclipped bases") variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) variantreads = self.uniqify_variant_reads(variantreads) From 8dcd1c8ca75fc3e3cfd6bd4df36aab763d316b15 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 18 Sep 2019 10:39:54 +0200 Subject: [PATCH 31/90] Save the provided readend value --- DonorBamRead.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index 2784447..8c28131 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -77,7 +77,7 @@ def __init__(self, readid, readflag, readpn, readchrom, readstart, readlen, read self.bam_read_chrom = readchrom self.bam_read_ref_pos = readstart self.bam_read_length = readlen - self.bam_read_end_pos = 0 + self.bam_read_end_pos = readend self.bam_read_cigar = readcigar self.bam_read_rnext = readrnext self.bam_read_pnext = readpnext From dd606cf4d2828e91ad4c5ea91f9b460ce12dde43 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 18 Sep 2019 12:19:26 +0200 Subject: [PATCH 32/90] Fix DonorBamRead construction error, add extra check for hardclipped bases --- VaSeBuilder.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 7a45e18..2d7d815 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -705,10 +705,11 @@ def get_variant_reads(self, contextid, variantchrom, variantreads = [] rpnext = {} for vread in bamfile.fetch(variantchrom, variantstart, variantend): - variantreads.append(DonorBamRead(vread.query_name, vread.flag, self.get_read_pair_num(vread), vread.reference_name, - vread.reference_start, vread.infer_read_length(), vread.cigarstring, - vread.next_reference_name, vread.next_reference_start, - vread.template_length, vread.reference_end, vread.get_forward_sequence(), + variantreads.append(DonorBamRead(vread.query_name, vread.flag, self.get_read_pair_num(vread), + vread.reference_name, vread.reference_start, vread.infer_read_length(), + vread.reference_end, vread.cigarstring, vread.next_reference_name, + vread.next_reference_start, + vread.template_length, vread.get_forward_sequence(), "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), vread.mapping_quality)) rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] @@ -809,8 +810,9 @@ def fetch_mate_read(self, rnext, pnext, readid, bamfile): if bamread.query_name == readid and bamread.reference_start == pnext: return DonorBamRead(bamread.query_name, bamread.flag, self.get_read_pair_num(bamread), bamread.reference_name, bamread.reference_start, bamread.infer_read_length(), - bamread.cigarstring, bamread.next_reference_name, bamread.next_reference_start, - bamread.template_length, bamread.reference_end, bamread.get_forward_sequence(), + bamread.reference_end, bamread.cigarstring, bamread.next_reference_name, + bamread.next_reference_start, + bamread.template_length, bamread.get_forward_sequence(), "".join([chr((x + 33)) for x in bamread.get_forward_qualities()]), bamread.mapping_quality) return None @@ -1893,6 +1895,8 @@ def read_is_hard_clipped(self, fetchedread): Returns ------- bool - True if read is hardclipped, False if not + True if read is hardclipped, False if not or read has no cigar string """ - return "H" in fetchedread.cigarstring + if fetchedread.cigarstring is not None: + return "H" in fetchedread.cigarstring + return False From 938f3317e2031db3621b2ea5255e11bc5e002309 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 18 Sep 2019 12:46:08 +0200 Subject: [PATCH 33/90] Add extra hardclipping check for DonorBamRead --- DonorBamRead.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index 8c28131..00a118f 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -251,7 +251,9 @@ def read_has_hardclipped_bases(self): bool True if read CIGAR contains 'H', False if not """ - return "H" in self.bam_read_cigar + if self.bam_read_cigar is not None: + return "H" in self.bam_read_cigar + return False # ===METHOD TO GET STATISTICS DATA FROM THE DONORBAMREAD=================== def get_average_qscore(self): From fa1543434975de472e59dc41f9cb8a68394d022f Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 18 Sep 2019 13:06:09 +0200 Subject: [PATCH 34/90] Add extra check when making stops in determine context --- VaSeBuilder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 2d7d815..4fcc8d9 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -872,7 +872,7 @@ def determine_context(self, contextreads, contextorigin, contextchr): stops = [conread.get_bam_read_ref_end() for conread in contextreads - if conread.get_bam_read_chrom() == contextchr] + if conread.get_bam_read_chrom() == contextchr and conread.get_bam_read_ref_end() is not None] # Number of mates filtered for mapping to different chr. num_diff_chr = len(contextreads) - len(starts) From b8b268a4acc7d38e68fc34e691ab1f48683bef02 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 18 Sep 2019 16:55:06 +0200 Subject: [PATCH 35/90] Small update to use bvcs methods --- VaSeBuilder.py | 22 +++++++++++++--------- VariantContext.py | 11 +++++++++++ vase.py | 11 +++++++---- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 4fcc8d9..7392671 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1279,7 +1279,6 @@ def write_bed_file(self, variantcontextdata, bedoutloc): except IOError: self.vaselogger.warning(f"Could not write variant context data to BED file: {bedoutloc}") - # Checks that thet sequence names are the same def check_sequence_names(self, referencefile, alignmentfile): """Checks and returns whether the chromosome names in the genome reference and alignment file are the same. @@ -1645,7 +1644,7 @@ def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath variantcontexts = VariantContextFile() try: - acceptorbamfile = pysam.AlignmentFile(acceptorbamloc, reference_name=reference_loc) + acceptorbamfile = pysam.AlignmentFile(acceptorbamloc, reference_filename=reference_loc) except IOError: self.vaselogger.critical("Could not open Acceptor BAM/CRAM") exit() @@ -1679,7 +1678,7 @@ def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath # Processes a sample def bvcs_process_sample(self, sampleid, variantcontextfile, abamfile, dbamfileloc, referenceloc, samplevariants): try: - donorbamfile = pysam.AlignmentFile(dbamfileloc, reference_name=referenceloc) + donorbamfile = pysam.AlignmentFile(dbamfileloc, reference_filename=referenceloc) except IOError: self.vaselogger.warning(f"Could not open {dbamfileloc} ; Skipping {sampleid}") return @@ -1691,6 +1690,12 @@ def bvcs_process_sample(self, sampleid, variantcontextfile, abamfile, dbamfilelo if not variantcontext: self.vaselogger.info(f"Could not establish variant context ; Skipping.") continue + # If this widest context overlaps an existing variant context, skip it. + if self.contexts.context_collision(variantcontext.get_context()): + self.vaselogger.debug(f"Variant context {variantcontext.get_variant_context_id()} overlaps with an" + f"already existing variant context; Skipping.") + continue + variantcontextfile.add_existing_variant_context(variantcontext) # Processes a sample variant by establishing the contexts. @@ -1727,13 +1732,12 @@ def bvcs_establish_variant_context(self, sampleid, variantcontextfile, variantid self.vaselogger.debug(f"Variant context {variantid} overlaps with an already existing context; Skipping.") return None - vcontext_dreads = self.get_variant_reads(variantid, variantchrom, *vcontext_window, dbamfile, write_unm, - unmapped_dlist) - vcontext_areads = self.get_variant_reads(variantid, variantchrom, *vcontext_window, abamfile, write_unm, - unmapped_alist) + vcontext_dreads = self.get_variant_reads(variantid, vcontext_window[0], vcontext_window[2], vcontext_window[3], + dbamfile, write_unm, unmapped_dlist) + vcontext_areads = self.get_variant_reads(variantid, vcontext_window[0], vcontext_window[2], vcontext_window[3], + abamfile, write_unm, unmapped_alist) variant_context = VariantContext(variantid, sampleid, *vcontext_window, vcontext_areads, vcontext_dreads, - acontext, - dcontext) + acontext, dcontext) return variant_context # Establishes an acceptor/donor context by fetching reads (and their mates) overlapping directly with the variant diff --git a/VariantContext.py b/VariantContext.py index 443c9a1..ef0d0c5 100644 --- a/VariantContext.py +++ b/VariantContext.py @@ -81,6 +81,17 @@ def __init__(self, varconid, sampleid, self.unmapped_donor_mate_ids = [] # ===METHODS TO OBTAIN DATA FROM THE VARIANT CONTEXT DATA================== + def get_context(self): + """Returns the essential data of the variant context. + + Returns + ------- + list of str and int + Variant context window + """ + return [self.variant_context_chrom, self.variant_context_origin, self.variant_context_start, + self.variant_context_end] + def get_variant_context_id(self): """Returns the variant context identifier. diff --git a/vase.py b/vase.py index 049ecce..a30b621 100644 --- a/vase.py +++ b/vase.py @@ -293,10 +293,13 @@ def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): vcf_file_map = vbscan.scan_vcf_files(paramcheck.get_valid_vcf_filelist()) bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist()) sample_id_list = vbscan.get_complete_sample_ids() - varconfile = vaseb.build_varcon_set(sample_id_list, vcf_file_map, bam_file_map, - paramcheck.get_acceptor_bam(), paramcheck.get_out_dir_location(), - paramcheck.get_reference_file_location(), - paramcheck.get_variant_context_out_location(), variantfilter) + # varconfile = vaseb.build_varcon_set(sample_id_list, vcf_file_map, bam_file_map, + # paramcheck.get_acceptor_bam(), paramcheck.get_out_dir_location(), + # paramcheck.get_reference_file_location(), + # paramcheck.get_variant_context_out_location(), variantfilter) + varconfile = vaseb.bvcs(sample_id_list, vcf_file_map, bam_file_map, paramcheck.get_acceptor_bam(), + paramcheck.get_out_dir_location(), paramcheck.get_reference_file_location(), + paramcheck.get_variant_context_out_location(), variantfilter) # Check for modes D,F,P if "D" in runmode: vaseb.run_d_mode(varconfile, paramcheck.get_fastq_out_location()) From 94ab311b6bc67b97f3a10c7d3466d88784640984 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 19 Sep 2019 13:21:03 +0200 Subject: [PATCH 36/90] Updates to make bvcs methods work --- VaSeBuilder.py | 142 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 137 insertions(+), 5 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 7392671..4a0ba7a 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1639,6 +1639,34 @@ def run_x_mode(self, sampleidlist, donorvcfs, donorbams, acceptorbam, genomerefe # =====SPLITTING THE BUILD_VARCON_SET() INTO MULTIPLE SMALLER METHODS===== def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath, reference_loc, varcon_outpath, variantlist): + """Builds and returns a variant context file. + + The variant context file is build from the provided variants and acceptor and donors alignment files. + + Parameters + ---------- + sampleidlist : list of str + Sample name/identifier + vcfsamplemap : dict + Variant files per sample + bamsamplemap : dict + Alignment files per sample + acceptorbamloc : str + Path to alignment file to use as acceptor + outpath : str + Path to folder to write output files to + reference_loc : str + Path to the genomic reference fasta file + varcon_outpath : str + Path and name to write variant context file to + variantlist : dict + Variants to use per sample + + Returns + ------- + variantcontexts : VariantContextFile + Established variant contexts + """ donor_vcfs_used = [] donor_bams_used = [] variantcontexts = VariantContextFile() @@ -1674,9 +1702,26 @@ def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath # Checks whether the program is running on debug and, if so, write some extra output files. if self.vaselogger.getEffectiveLevel() == 10: self.write_optional_output_files(outpath, variantcontexts) + return variantcontexts - # Processes a sample def bvcs_process_sample(self, sampleid, variantcontextfile, abamfile, dbamfileloc, referenceloc, samplevariants): + """Processes a sample and adds variant contexts to a variant context file. + + Parameters + ---------- + sampleid : str + Sample name/identifier + variantcontextfile : VariantContextFile + Variant context file that saves variant contexts + abamfile: pysam.AlignmentFile + Already opened pysam AlignmentFile to use as acceptor + dbamfileloc: str + Path to the alignment file to use as donor + referenceloc: str + Path to the genomic reference fasta file + samplevariants : list of VcfVariants + Variants to process for the specified sample + """ try: donorbamfile = pysam.AlignmentFile(dbamfileloc, reference_filename=referenceloc) except IOError: @@ -1690,16 +1735,38 @@ def bvcs_process_sample(self, sampleid, variantcontextfile, abamfile, dbamfilelo if not variantcontext: self.vaselogger.info(f"Could not establish variant context ; Skipping.") continue + # If this widest context overlaps an existing variant context, skip it. if self.contexts.context_collision(variantcontext.get_context()): self.vaselogger.debug(f"Variant context {variantcontext.get_variant_context_id()} overlaps with an" f"already existing variant context; Skipping.") continue - - variantcontextfile.add_existing_variant_context(variantcontext) + variantcontextfile.add_existing_variant_context(variantcontext.get_variant_context_id(), variantcontext) # Processes a sample variant by establishing the contexts. def bvcs_process_variant(self, sampleid, variantcontextfile, samplevariant, abamfile, dbamfile, write_unm=False): + """Processes a variant and returns the established variant context. + + Parameters + ---------- + sampleid : str + Sample name/identifier + variantcontextfile: VariantContextFile + Variant context file that saves the variant contexts + samplevariant : VcfVariant + Variant to process and for which to establish an accpetor, donor and variant context + abamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile to use as acceptor + dbamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile to use as donor + write_unm : bool + Whether to save read identifiers with unmapped read mates + + Returns + ------- + vcontext : VariantContext or None + The established variant context, None if context could not be established + """ # Establish the donor context variantid = samplevariant.get_variant_id() variantchrom = samplevariant.get_variant_chrom() @@ -1723,6 +1790,39 @@ def bvcs_process_variant(self, sampleid, variantcontextfile, samplevariant, abam # Establishes a variant context from an acceptor and donor context and fetches acceptor and donor reads again. def bvcs_establish_variant_context(self, sampleid, variantcontextfile, variantid, variantchrom, variantpos, acontext, dcontext, abamfile, dbamfile, write_unm=False): + """Establishes and returns a variant context. + + The variant context window is established based on the acceptor and donor contexts. Reads and mates overlapping + with the variant context are also fetched and added to the variant context. + + Parameters + ---------- + sampleid : str + Sample name/identifier + variantcontextfile : VariantContextFile + Variant context file that saves variant contexts + variantid : str + Variant identifier + variantchrom : str + Chromosome the + variantpos: int + Leftmost genomic position of the variant + acontext : OverlapContext + Established acceptor context + dcontext : OverlapContext + Established donor context + abamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile used as acceptor + dbamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile used as donor + write_unm : bool + Whether to save identifiers of reads with unmapped mates + + Returns + ------- + variant_context : VariantContext + Established variant context with all associated data + """ unmapped_alist = [] unmapped_dlist = [] @@ -1738,12 +1838,44 @@ def bvcs_establish_variant_context(self, sampleid, variantcontextfile, variantid abamfile, write_unm, unmapped_alist) variant_context = VariantContext(variantid, sampleid, *vcontext_window, vcontext_areads, vcontext_dreads, acontext, dcontext) + + # Set the variant context unmapped read ids + variant_context.set_unmapped_acceptor_mate_ids(unmapped_alist) + variant_context.set_unmapped_donor_mate_ids(unmapped_dlist) return variant_context # Establishes an acceptor/donor context by fetching reads (and their mates) overlapping directly with the variant def bvcs_establish_context(self, sampleid, variantid, variantchrom, variantpos, searchwindow, bamfile, write_unm=False, fallback_window=None): + """Establishes and returns an acceptor/donor context. + + The context window is established from fetched reads and their mates overlapping with the variant. + + Parameters + ---------- + sampleid : str + Sample name/identifier + variantid : str + Variant indentifier + variantchrom : str + Chromosome name the variant is located on + variantpos : int + Leftmost genomic position of the variant + searchwindow: list of int + Sreach window to use for fetching reads + bamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile + write_unm: bool + Save the read identifiers with unmapped mates + fallback_window : list + Window to set if no window could be set/determined + + Returns + ------- + OverlapContext or None + Acceptor/donor context if context is established, None if not + """ unmappedlist = [] # Fetch context reads and establish the context window @@ -1811,7 +1943,7 @@ def bvcs_write_output_files(self, outpath, varcon_outpath, variantcontextfile, v """ # Write the variant context file itself self.vaselogger.info(f"Writing variant contexts to {varcon_outpath}") - variantcontextfile.write_variant_context_file(varcon_outpath) + variantcontextfile.write_variant_context_file(varcon_outpath, self.creation_id) # Write the variant context statistics file self.vaselogger.info(f"Writing variant context statistics to {outpath}varconstats.txt") @@ -1819,7 +1951,7 @@ def bvcs_write_output_files(self, outpath, varcon_outpath, variantcontextfile, v # Write the variant contexts as a BED file self.vaselogger.info(f"Writing the variant contexts to a BED file") - self.write_bed_file(variantcontextfile.get_variant_contexts()) + self.write_bed_file(variantcontextfile.get_variant_contexts(), f"{outpath}variantcontexts.bed") # Write a listfile of the used variant (VCF/BCF) files self.vaselogger.info(f"Writing the used donor variant files per sample to {outpath}donor_variant_files.txt") From 208a4024db81b7c66f57b408758797bd7424b2df Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 23 Sep 2019 14:33:48 +0200 Subject: [PATCH 37/90] Add debug log messages --- VaSeBuilder.py | 44 +++++++++++++++++++++++++- VaSeUtils/SubsetVcfByVariantContext.py | 11 +++++++ 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 VaSeUtils/SubsetVcfByVariantContext.py diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 4a0ba7a..3373dec 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1060,7 +1060,7 @@ def is_required_read(self, bamread, fr): return bamread.is_read2() def set_fastq_out_path(self, outpath, fr, lnum): - """ + """Sets and returns the fastq output path and filename. Parameters ---------- @@ -1773,18 +1773,46 @@ def bvcs_process_variant(self, sampleid, variantcontextfile, samplevariant, abam variantpos = samplevariant.get_variant_pos() varianttype = samplevariant.get_variant_type() + self.vaselogger.debug(f"Processing variant {variantid}.") + self.debug_msg("vw", variantid) + self.vaselogger.debug(f"Variant {variantid} determined to be {varianttype}") searchwindow = self.determine_read_search_window(varianttype, samplevariant) + self.vaselogger.debug(f"Search window determined to be {variantchrom}:{searchwindow[0]+1}-{searchwindow[1]}") + + # Check whether the variant overlaps with an existing variant context + if variantcontextfile.variant_is_in_context(varianttype, variantchrom, *searchwindow): + self.vaselogger.debug(f"Variant {variantid} is located in an existing context.") + return None + # Determine the donor context. + self.debug_msg("dc", variantid) + t0 = time.time() dcontext = self.bvcs_establish_context(sampleid, variantid, variantchrom, variantpos, searchwindow, dbamfile, write_unm) if not dcontext: self.vaselogger.info(f"Could not establish donor context ; Skipping variant {variantid}") return None + self.debug_msg("dc", variantid, t0) + self.vaselogger.debug(f"Donor context determined to be {dcontext.get_context_chrom()}:" + f"{dcontext.get_context_start()}-{dcontext.get_context_end()}") + # Determine the acceptor context. + self.debug_msg("ac", variantid) + t0 = time.time() acontext = self.bvcs_establish_context(sampleid, variantid, variantchrom, variantpos, searchwindow, abamfile, write_unm, dcontext.get_context()) + self.debug_msg("ac", variantid, t0) + self.vaselogger.debug(f"Acceptor context determined to be {acontext.get_context_chrom()}:" + f"{acontext.get_context_start()}-{acontext.get_context_end()}") + + # Determin the variant context. + self.debug_msg("cc", variantid) + t0 = time.time() vcontext = self.bvcs_establish_variant_context(sampleid, variantcontextfile, variantid, variantchrom, variantpos, acontext, dcontext, abamfile, dbamfile, write_unm) + self.debug_msg("cc", variantid, t0) + self.vaselogger.debug(f"Combined context determined to be {vcontext.get_variant_context_chrom()}:" + f"{vcontext.get_variant_context_start()}:{vcontext.get_variant_context_end()}") return vcontext # Establishes a variant context from an acceptor and donor context and fetches acceptor and donor reads again. @@ -1832,10 +1860,20 @@ def bvcs_establish_variant_context(self, sampleid, variantcontextfile, variantid self.vaselogger.debug(f"Variant context {variantid} overlaps with an already existing context; Skipping.") return None + # Gather variant context donor reads. + self.debug_msg("cdr", variantid) + t0 = time.time() vcontext_dreads = self.get_variant_reads(variantid, vcontext_window[0], vcontext_window[2], vcontext_window[3], dbamfile, write_unm, unmapped_dlist) + self.debug_msg("cdr", variantid, t0) + + # Gather variant context acceptor reads. + self.debug_msg("car", variantid) + t0 = time.time() vcontext_areads = self.get_variant_reads(variantid, vcontext_window[0], vcontext_window[2], vcontext_window[3], abamfile, write_unm, unmapped_alist) + self.debug_msg("cdr", variantid, t0) + variant_context = VariantContext(variantid, sampleid, *vcontext_window, vcontext_areads, vcontext_dreads, acontext, dcontext) @@ -1879,13 +1917,17 @@ def bvcs_establish_context(self, sampleid, variantid, variantchrom, variantpos, unmappedlist = [] # Fetch context reads and establish the context window + self.vaselogger.debug("Fetching reads.") + t0 = time.time() context_reads = self.get_variant_reads(variantid, variantchrom, *searchwindow, bamfile, write_unm, unmappedlist) + self.vaselogger.debug(f"Fetching reads took {time.time() - t0} seconds") context_window = self.determine_context(context_reads, variantpos, variantchrom) # Check whether the context_window is valid and, if not, whether a fallback window has been set if not context_window: if fallback_window: context_window = fallback_window + self.vaselogger.debug("Context window is set to the fall back window") else: return None diff --git a/VaSeUtils/SubsetVcfByVariantContext.py b/VaSeUtils/SubsetVcfByVariantContext.py new file mode 100644 index 0000000..9168293 --- /dev/null +++ b/VaSeUtils/SubsetVcfByVariantContext.py @@ -0,0 +1,11 @@ +import logging +from VariantContextFile import VariantContextFile + +class SubsetVcfByVariantContext: + def __init__(self): + print("aap") + + def main(self, varconinloc): + varconfile = VariantContextFile(varconinloc) + + From 41c11b453eb3dab94310b3be819af8b3126c73b3 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 23 Sep 2019 14:42:35 +0200 Subject: [PATCH 38/90] Add some more debug log messages --- VaSeBuilder.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 3373dec..7670c1a 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1921,13 +1921,17 @@ def bvcs_establish_context(self, sampleid, variantid, variantchrom, variantpos, t0 = time.time() context_reads = self.get_variant_reads(variantid, variantchrom, *searchwindow, bamfile, write_unm, unmappedlist) self.vaselogger.debug(f"Fetching reads took {time.time() - t0} seconds") + + if not context_reads: + self.vaselogger.debug("No reads were found.") context_window = self.determine_context(context_reads, variantpos, variantchrom) # Check whether the context_window is valid and, if not, whether a fallback window has been set if not context_window: + self.vaselogger.debug("Context window could not be determined.") if fallback_window: context_window = fallback_window - self.vaselogger.debug("Context window is set to the fall back window") + self.vaselogger.debug("Context window is set to the fall back window.") else: return None From 24f4ab6b4caa5133c4c09b63b4589ab473747ea9 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 23 Sep 2019 15:42:24 +0200 Subject: [PATCH 39/90] Update example config files, add one for debug and one for variantlist --- config/examples/example_amode.cfg | 6 +++--- config/examples/example_dcmode.cfg | 1 + config/examples/example_dmode.cfg | 1 + config/examples/example_fcmode.cfg | 1 + config/examples/example_fmode.cfg | 1 + config/examples/example_fmode_debug.cfg | 11 +++++++++++ config/examples/example_fmode_variantlist.cfg | 12 ++++++++++++ config/examples/example_pcmode.cfg | 2 +- config/examples/example_pmode.cfg | 2 +- config/examples/example_xcmode.cfg | 2 +- config/examples/example_xmode.cfg | 2 +- 11 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 config/examples/example_fmode_debug.cfg create mode 100644 config/examples/example_fmode_variantlist.cfg diff --git a/config/examples/example_amode.cfg b/config/examples/example_amode.cfg index fbc71c0..90fed3e 100644 --- a/config/examples/example_amode.cfg +++ b/config/examples/example_amode.cfg @@ -1,8 +1,8 @@ -#Example config file for running VaSeBuilder in A-mode -RUNMODE=A +#Example config file for running VaSeBuilder in AC-mode +RUNMODE=AC TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq VARCONIN=/testdata/variantcontext/varcon.txt DONORFASTQS=/testdata/donorfastq/donorfastq_list.txt OUT=/testdata/outdir -LOGFILE=/testdata/outdir/vb.log +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_dcmode.cfg b/config/examples/example_dcmode.cfg index c50f543..f9b1724 100644 --- a/config/examples/example_dcmode.cfg +++ b/config/examples/example_dcmode.cfg @@ -6,3 +6,4 @@ ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fasta VARCONIN=/testdata/variantcontext/varcon.txt OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_dmode.cfg b/config/examples/example_dmode.cfg index 41c201d..26b667b 100644 --- a/config/examples/example_dmode.cfg +++ b/config/examples/example_dmode.cfg @@ -5,3 +5,4 @@ DONORBAM= ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fasta OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_fcmode.cfg b/config/examples/example_fcmode.cfg index 244766b..fd6584a 100644 --- a/config/examples/example_fcmode.cfg +++ b/config/examples/example_fcmode.cfg @@ -8,3 +8,4 @@ TEMPLATEFQ2=/testdata/templatefastq/acceptor_R1.fq VARCONIN=/testdata/variantcontext/varcon.txt REFERENCE=/testdata/reference/genome_ref.fasta OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_fmode.cfg b/config/examples/example_fmode.cfg index 5c7268c..3f2c6f8 100644 --- a/config/examples/example_fmode.cfg +++ b/config/examples/example_fmode.cfg @@ -7,3 +7,4 @@ TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq REFERENCE=/testdata/reference/genome_ref.fasta OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log \ No newline at end of file diff --git a/config/examples/example_fmode_debug.cfg b/config/examples/example_fmode_debug.cfg new file mode 100644 index 0000000..0a3371f --- /dev/null +++ b/config/examples/example_fmode_debug.cfg @@ -0,0 +1,11 @@ +#Example config file for running VaSeBuilder in F-mode with debug set on +RUNMODE=F +DONORVCF=/testdata/donordata/vcffile_list.txt +DONORBAM=/testdata/donordata/bamfile_list.txt +ACCEPTORBAM=/testdata/acceptor/acceptor.bam +TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq +TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq +REFERENCE=/testdata/reference/genome_ref.fasta +OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log +DEBUG=T \ No newline at end of file diff --git a/config/examples/example_fmode_variantlist.cfg b/config/examples/example_fmode_variantlist.cfg new file mode 100644 index 0000000..f7b133d --- /dev/null +++ b/config/examples/example_fmode_variantlist.cfg @@ -0,0 +1,12 @@ +#Example config file for running VaSeBuilder in F-mode +#This example has a provided list of variants to use for the run +RUNMODE=F +DONORVCF=/testdata/donordata/vcffile_list.txt +DONORBAM=/testdata/donordata/bamfile_list.txt +ACCEPTORBAM=/testdata/acceptor/acceptor.bam +TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq +TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq +REFERENCE=/testdata/reference/genome_ref.fasta +OUT=/testdata/outdir +LOG=/testdata/outdir/vb.log +VARIANTLIST=/testdata/variants/variantlist.txt diff --git a/config/examples/example_pcmode.cfg b/config/examples/example_pcmode.cfg index 828d906..668ab65 100644 --- a/config/examples/example_pcmode.cfg +++ b/config/examples/example_pcmode.cfg @@ -6,4 +6,4 @@ ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fasta VARCONIN=/testdata/variantcontext/varcon.txt OUT=/testdata/outdir -LOGFILE=/testdata/outdir/vb.log +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_pmode.cfg b/config/examples/example_pmode.cfg index 81c1651..39377dd 100644 --- a/config/examples/example_pmode.cfg +++ b/config/examples/example_pmode.cfg @@ -5,4 +5,4 @@ DONORBAM=/testdata/donordata/bamfile_list.txt ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fa OUT=/testdata/outdir -LOGFILE=/testdata/outdir/vb.log +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_xcmode.cfg b/config/examples/example_xcmode.cfg index aa5854b..3bcdb66 100644 --- a/config/examples/example_xcmode.cfg +++ b/config/examples/example_xcmode.cfg @@ -6,4 +6,4 @@ ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fa VARCONIN=/testdata/variantcontext/varcon.txt OUT=/testdata/outdir -LOGFILE=/testdata/outdir/vb.log +LOG=/testdata/outdir/vb.log diff --git a/config/examples/example_xmode.cfg b/config/examples/example_xmode.cfg index 207199d..8ae6adc 100644 --- a/config/examples/example_xmode.cfg +++ b/config/examples/example_xmode.cfg @@ -5,4 +5,4 @@ DONORBAM=/testdata/donordata/bamfile_list.txt ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fa OUT=/testdata/outdir -LOGFILE=/testdata/outdir/vb.log +LOG=/testdata/outdir/vb.log From cfaa49a88e5fd67fe3eaab948a9e795ba936a423 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 24 Sep 2019 11:21:50 +0200 Subject: [PATCH 40/90] Uopdate example config files, add input file description md document --- config/examples/example_dmode.cfg | 2 +- config/examples/example_fcmode.cfg | 4 +-- config/examples/example_fmode.cfg | 4 +-- docs/input_formats.md | 54 ++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 docs/input_formats.md diff --git a/config/examples/example_dmode.cfg b/config/examples/example_dmode.cfg index 26b667b..92a332d 100644 --- a/config/examples/example_dmode.cfg +++ b/config/examples/example_dmode.cfg @@ -1,7 +1,7 @@ #Example config file for running VaSeBuilder in D-mode RUNMODE=D DONORVCF=/testdata/donordata/vcffiles_list.txt -DONORBAM= +DONORBAM=/testdata/donordata/bamfiles_list.txt ACCEPTORBAM=/testdata/acceptor/acceptor.bam REFERENCE=/testdata/reference/genome_ref.fasta OUT=/testdata/outdir diff --git a/config/examples/example_fcmode.cfg b/config/examples/example_fcmode.cfg index fd6584a..532c21b 100644 --- a/config/examples/example_fcmode.cfg +++ b/config/examples/example_fcmode.cfg @@ -3,8 +3,8 @@ RUNMODE=FC DONORVCF=/testdata/donordata/vcffile_list.txt DONORBAM=/testdata/donordata/bamfile_list.txt ACCEPTORBAM=/testdata/acceptor/acceptor.bam -TEMPLATEFQ1=/testdata/templatefastq/acceptor_R2.fq -TEMPLATEFQ2=/testdata/templatefastq/acceptor_R1.fq +TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1_L1.fq , /testdata/templatefastq/acceptor_R1_L2.fq +TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2_L1.fq , /testdata/templatefastq/acceptor_R2_L2.fq VARCONIN=/testdata/variantcontext/varcon.txt REFERENCE=/testdata/reference/genome_ref.fasta OUT=/testdata/outdir diff --git a/config/examples/example_fmode.cfg b/config/examples/example_fmode.cfg index 3f2c6f8..65015c4 100644 --- a/config/examples/example_fmode.cfg +++ b/config/examples/example_fmode.cfg @@ -3,8 +3,8 @@ RUNMODE=F DONORVCF=/testdata/donordata/vcffile_list.txt DONORBAM=/testdata/donordata/bamfile_list.txt ACCEPTORBAM=/testdata/acceptor/acceptor.bam -TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1.fq -TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2.fq +TEMPLATEFQ1=/testdata/templatefastq/acceptor_R1_L1.fq , /testdata/templatefastq/acceptor_R1_L2.fq +TEMPLATEFQ2=/testdata/templatefastq/acceptor_R2_L1.fq , /testdata/templatefastq/acceptor_R2_L2.fq REFERENCE=/testdata/reference/genome_ref.fasta OUT=/testdata/outdir LOG=/testdata/outdir/vb.log \ No newline at end of file diff --git a/docs/input_formats.md b/docs/input_formats.md new file mode 100644 index 0000000..c6654b2 --- /dev/null +++ b/docs/input_formats.md @@ -0,0 +1,54 @@ +# VaSeBuilder input file formats + +## About input files +To run VaSeBuilder specific VaSeBuilder input files might need to be provided depending on the selected run mode. + + +### Donor variant and donor alignment list files + + +### Donor fastq list file +A donor fastq list file needs to be provided when running VaSeBuilder AC-mode. Currently (September 23rd, 2019) this +file requires two columns without a header for. The first column should contain the paths to R1 files, the second column + the paths to R2 files. + + +sample1_R1.fq  sample1_R2.fq
+sample2_R1.fq  sample2_R2.fq + + +### Variant list file + + +### Variant context file +A VaSeBuilder variant context file can serve as an input file when running AC, DC, FC or PC mode. Please see the output +files description for the variant context file. + + +### Configuration file +Configuration files allow users to save the VaSeBuilder run parameters and values in a text file. This makes rerunning +analyses on the same data, as well as sharing the run parameters much easier. Parameters and values can be specified as +PARAMETERNAME=value. Parameter names do not need to be capitalized, entries such as parametername = value are also +valid. Parameters templatefq1 and templatefq2 can have multiple values separated by a comma: +templatefq1 = testdata/fastqs/acceptor1_1.fq , testdata/fastqs. (The space around the comma is optional) +Comments explaining or describing the use of the configuration file can be added by starting a line with #. + +Valid configuration file parameter names: +* __RUNMODE:__ The run mode of VaSeBuilder to use +* __DONORVCF:__ Path to file with list of donor variant files to use +* __DONORBAM:__ Path to file with list of donor alignment files to use +* __ACCEPTORBAM:__ Path the alignment file to use as acceptor +* __TEMPLATEFQ1:__ Path(s) to R1 fastq file(s) to use as template for building the validation set +* __TEMPLATEFQ2:__ Path(s) to R2 fastq file(s) to use as template for building the validation set +* __REFERENCE:__ Path to the fasta genome reference +* __OUT:__ Path to directory to write output files to +* __LOG:__ Path to write log file to +* __DEBUG:__ Run debug mode (valid values: T, True, true, TRUE, 1). +* __VARIANTLIST:__ Path to file containing variants to use +* __VARCONIN:__ Path to a variant context file +* __FASTQOUT:__ Name prefix for validation fastq files +* __VARCON:__ Name prefix for variant context output file + + +To run VaSeBuilder with a configuration file use: python vase.py –c path/to/con fig/file.cfg +Please see the example config files in VaSeBuilder/config/examples for examples per run mode From 9270dd28e69429eaed0152766fedf65dd9cf0c25 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 24 Sep 2019 14:07:22 +0200 Subject: [PATCH 41/90] Update input formats and add output formats md file --- docs/input_formats.md | 20 ++++++++----- docs/output_formats.md | 64 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 docs/output_formats.md diff --git a/docs/input_formats.md b/docs/input_formats.md index c6654b2..a062637 100644 --- a/docs/input_formats.md +++ b/docs/input_formats.md @@ -5,24 +5,31 @@ To run VaSeBuilder specific VaSeBuilder input files might need to be provided de ### Donor variant and donor alignment list files +The donor variant list file should contain a list of paths to a variant, VCF or BCF, file with one path per file line. +The donor alignment list file should be similar and contain one path to a donor alignment file per file line. +The files listed will be used as donor variant and alignments files respectively when constructing variant contexts. +

### Donor fastq list file A donor fastq list file needs to be provided when running VaSeBuilder AC-mode. Currently (September 23rd, 2019) this -file requires two columns without a header for. The first column should contain the paths to R1 files, the second column - the paths to R2 files. +file requires two columns without a header. The first column should contain the paths to R1 files, the second column + the paths to R2 files: +_/path/to/sample1_R1.fq  /path/to/sample1_R2.fq
+/path/to/sample2_R1.fq  /path/to/sample2_R2.fq
+/path/to/sample3_R1.fq  /path/to/sample3_R2.fq_ -sample1_R1.fq  sample1_R2.fq
-sample2_R1.fq  sample2_R2.fq +Each row in this file therefore represents a single sample.

### Variant list file + ### Variant context file A VaSeBuilder variant context file can serve as an input file when running AC, DC, FC or PC mode. Please see the output -files description for the variant context file. +files description for the variant context file.

### Configuration file @@ -33,7 +40,7 @@ valid. Parameters templatefq1 and templatefq2 can have multiple values separated templatefq1 = testdata/fastqs/acceptor1_1.fq , testdata/fastqs. (The space around the comma is optional) Comments explaining or describing the use of the configuration file can be added by starting a line with #. -Valid configuration file parameter names: +
Valid configuration file parameter names: * __RUNMODE:__ The run mode of VaSeBuilder to use * __DONORVCF:__ Path to file with list of donor variant files to use * __DONORBAM:__ Path to file with list of donor alignment files to use @@ -49,6 +56,5 @@ Valid configuration file parameter names: * __FASTQOUT:__ Name prefix for validation fastq files * __VARCON:__ Name prefix for variant context output file - To run VaSeBuilder with a configuration file use: python vase.py –c path/to/con fig/file.cfg Please see the example config files in VaSeBuilder/config/examples for examples per run mode diff --git a/docs/output_formats.md b/docs/output_formats.md new file mode 100644 index 0000000..95a745f --- /dev/null +++ b/docs/output_formats.md @@ -0,0 +1,64 @@ +# VaSeBuilder output file formats + +## Standard output files +VaSeBuilder runs output several output files containing the essential data about the run. Some files are only outputted +when running specific modes. + +### Donor alignment files +A list of paths to donor alignment (BAM/CRAM) files used in building the variant contexts. + +### Donor variant files +A list of paths to variant (VCF/BCF) files used in building the variant contexts. + +### Log file +Contains a log of steps and activities performed by VaSeBuilder. More info is added to the log when debug is activated. + +### P-mode link file +When VaSeBuilder is run in P-mode, an extra file is created consisting of three columns + +### Validation fastq files +Validation fastq files are valid fastq based on the provided template fastq files with acceptor and donor reads +exchanged based on the variant contexts. + +### Variant context file +The variant context file is a tab separated file containing the essential data of the variant contexts in 13 columns. +The context data is preceded by a header starting with a #.
+ +Variant context file columns: +* __ContextId:__ The identifier of the context (consists of the chromosome name and variant positions connected with +an '_') +* __DonorSample:__ The sample name/identifier of the variant from which the context is constructed. +* __Chrom:__ The name of the chromosome the context is located on. +* __Origin:__ This is the genomic position of the donor variant the acceptor, donor and variant context +* __Start:__ The starting, or leftmost genomic, position of the variant context. +* __End:__ The ending, or rightmost genomic, position of the variant context. +* __AcceptorContextLength:__ The length of the acceptor context that constructed the variant context. +* __DonorContextLength:__ The length of the donor context that constructed the variant context. +* __AcceptorReads:__ The number of acceptor reads and mates overlapping with the variant context. +* __DonorReads:__ The number of donor reads and mates overlapping with the variant context. +* __ADratio:__ The ratio of acceptor reads over donor reads that will be exchanged. +* __AcceptorReadIds:__ The read identifiers (one per mate pair) overlapping with the variant context. +* __DonorReadsIds:__ The read identifiers (one per mate pair) overlapping with the variant context. + +The first line in each variant context file is the uuid of the VaSeBuilder run that constructed the variant contexts. +
+ +### Variant context statistics file + +### Variant context bed file + + + +## Debug output files +### Acceptor contexts file +### Acceptor context read positions file +### Acceptor context statistics file +### Acceptor context unmapped read mates file +### Donor contexts file +### Donor context read positions file +### Donor context statistics file +### Donor context unmapped read mates file +### Variant context acceptor read positions file +### Variant context donor read positions file +### Variant context unmapped acceptor read mates file +### Variant context unmapped donor read mate file From ea27136e428f48744435d328bf41112906d346b3 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 24 Sep 2019 14:23:57 +0200 Subject: [PATCH 42/90] Small changes to input formats --- docs/input_formats.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/input_formats.md b/docs/input_formats.md index a062637..b51764e 100644 --- a/docs/input_formats.md +++ b/docs/input_formats.md @@ -1,7 +1,8 @@ # VaSeBuilder input file formats ## About input files -To run VaSeBuilder specific VaSeBuilder input files might need to be provided depending on the selected run mode. +To run VaSeBuilder, specific VaSeBuilder input files might need to be provided depending on the selected run mode.
+
### Donor variant and donor alignment list files @@ -24,12 +25,20 @@ Each row in this file therefore represents a single sample.

### Variant list file +A variant list file can be provided as a filter to indicate which donor variants to use in the VaSeBuilder run. This is +most useful when only certain variants from the donors are required but the donor variant files contain many variants. +Variants not specified in the variant list will therefore be skipped. +The variant list file needs at least three columns: sample name/identifier, chromosome name, variant position: +_Sample1  1  101
+Sample2  2  202
_ + +(In the future two more columns might be required: variant reference allele, variant alternative allele)

### Variant context file A VaSeBuilder variant context file can serve as an input file when running AC, DC, FC or PC mode. Please see the output -files description for the variant context file.

+files description for the variant context file for it's structure.

### Configuration file @@ -37,7 +46,7 @@ Configuration files allow users to save the VaSeBuilder run parameters and value analyses on the same data, as well as sharing the run parameters much easier. Parameters and values can be specified as PARAMETERNAME=value. Parameter names do not need to be capitalized, entries such as parametername = value are also valid. Parameters templatefq1 and templatefq2 can have multiple values separated by a comma: -templatefq1 = testdata/fastqs/acceptor1_1.fq , testdata/fastqs. (The space around the comma is optional) +templatefq1 = testdata/fastqs/acceptor1_1.fq , testdata/fastqs/acceptor1_2.fq. (The space around the comma is optional) Comments explaining or describing the use of the configuration file can be added by starting a line with #.
Valid configuration file parameter names: @@ -56,5 +65,5 @@ Comments explaining or describing the use of the configuration file can be added * __FASTQOUT:__ Name prefix for validation fastq files * __VARCON:__ Name prefix for variant context output file -To run VaSeBuilder with a configuration file use: python vase.py –c path/to/con fig/file.cfg +To run VaSeBuilder with a configuration file use: python vase.py –c path/to/config/file.cfg Please see the example config files in VaSeBuilder/config/examples for examples per run mode From b45b0bb2804bd45605f28fbe17b81768cba78077 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 24 Sep 2019 15:43:42 +0200 Subject: [PATCH 43/90] Addd more documentation on output files --- docs/output_formats.md | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/docs/output_formats.md b/docs/output_formats.md index 95a745f..c408697 100644 --- a/docs/output_formats.md +++ b/docs/output_formats.md @@ -4,25 +4,50 @@ VaSeBuilder runs output several output files containing the essential data about the run. Some files are only outputted when running specific modes. + + ### Donor alignment files A list of paths to donor alignment (BAM/CRAM) files used in building the variant contexts. + + ### Donor variant files A list of paths to variant (VCF/BCF) files used in building the variant contexts. + + ### Log file Contains a log of steps and activities performed by VaSeBuilder. More info is added to the log when debug is activated. +The log file contains four fields separated by a tab: Date and time, name of the logger (VaSe_Logger), log level, +message. The first two lines of the log always display the issued command to run VaSeBuilder and the VaSeBuilder uuid +and creation date and time A VaSeBuilder log file therefore follows the format:
+ +YYYY-MM-DD  HH:MM:SS,SSS  INFO  python vase.py -c test.cfg
+YYYY-MM-DD  HH:MM:SS,SSS  INFO  VaSeBuilder: {uuid} ; YYYY-MM-DD HH:MM:SS,SSS
+YYYY-MM-DD  HH:MM:SS,SSS  INFO  Running VaSeBuilder in F-mode +

### P-mode link file -When VaSeBuilder is run in P-mode, an extra file is created consisting of three columns +When VaSeBuilder is run in P-mode, an extra file is created consisting of three columns: + +_contextid_1  /path/to/fastq_R1.fq  /path/to/fastq_R1.fq
+contextid_2  /path/to/fastq_R2.fq  /path/to/fastq_R2.fq_
+ +Each line represents a variant context. The first line in the P-mode link file is the uuid of the VaSeBuilder. This +is used to the variant context file and the fastq files for a certain P-mode run.

+ ### Validation fastq files Validation fastq files are valid fastq based on the provided template fastq files with acceptor and donor reads -exchanged based on the variant contexts. +exchanged based on the variant contexts. Currently (September 24th, 2019) donor reads are added at the end of the +validation fastq files.

+ + ### Variant context file The variant context file is a tab separated file containing the essential data of the variant contexts in 13 columns. -The context data is preceded by a header starting with a #.
+The context data is preceded by a header starting with a #. The default output name for the variant context file is +_varcon.txt_.

Variant context file columns: * __ContextId:__ The identifier of the context (consists of the chromosome name and variant positions connected with @@ -41,12 +66,20 @@ an '_') * __DonorReadsIds:__ The read identifiers (one per mate pair) overlapping with the variant context. The first line in each variant context file is the uuid of the VaSeBuilder run that constructed the variant contexts. -
+

+ ### Variant context statistics file +The variant context statistics file contains some statistics about the reads and mates overlapping with the variant +contexts.

+ ### Variant context bed file +A BED file with the variant context entries in four columns: Chromosome name, context start, context end, context +identifier. The output name for the BED file is variantcontexts.bed. The resulting file looks like:
+_1  100  1000  1_500
+2  200  2000  2_1000_

## Debug output files From 8918b18864ccb99dabed394944c248ca841e0d91 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 24 Sep 2019 16:52:07 +0200 Subject: [PATCH 44/90] Small updates to input and out format md files --- docs/input_formats.md | 3 +- docs/output_formats.md | 69 ++++++++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/docs/input_formats.md b/docs/input_formats.md index b51764e..55d003d 100644 --- a/docs/input_formats.md +++ b/docs/input_formats.md @@ -9,7 +9,8 @@ To run VaSeBuilder, specific VaSeBuilder input files might need to be provided d The donor variant list file should contain a list of paths to a variant, VCF or BCF, file with one path per file line. The donor alignment list file should be similar and contain one path to a donor alignment file per file line. The files listed will be used as donor variant and alignments files respectively when constructing variant contexts. -

+Only paths to the variant/alignment files should be provided, not the index files. These are assumed to be in the same +directory with the same name, i.e.: sample.bam & sample.bam.bai

### Donor fastq list file diff --git a/docs/output_formats.md b/docs/output_formats.md index c408697..714a38c 100644 --- a/docs/output_formats.md +++ b/docs/output_formats.md @@ -2,18 +2,20 @@ ## Standard output files VaSeBuilder runs output several output files containing the essential data about the run. Some files are only outputted -when running specific modes. +when running specific modes.

### Donor alignment files A list of paths to donor alignment (BAM/CRAM) files used in building the variant contexts. - +

### Donor variant files -A list of paths to variant (VCF/BCF) files used in building the variant contexts. - +A list of paths to variant (VCF/BCF) files used in building the variant contexts. A list could be:

+_/path/to/donor1.bam
+/path/to/donor2.cram
+/path/to/donor3.bam_

### Log file @@ -33,7 +35,7 @@ When VaSeBuilder is run in P-mode, an extra file is created consisting of three _contextid_1  /path/to/fastq_R1.fq  /path/to/fastq_R1.fq
contextid_2  /path/to/fastq_R2.fq  /path/to/fastq_R2.fq_
-Each line represents a variant context. The first line in the P-mode link file is the uuid of the VaSeBuilder. This +Each line represents a variant context. The first line in the P-mode link file is the uuid of the VaSeBuilder run. This is used to the variant context file and the fastq files for a certain P-mode run.

@@ -45,9 +47,10 @@ validation fastq files.

### Variant context file -The variant context file is a tab separated file containing the essential data of the variant contexts in 13 columns. -The context data is preceded by a header starting with a #. The default output name for the variant context file is -_varcon.txt_.

+The variant context file is a tab separated file containing the essential data of the variant contexts in 13 columns. +This file is one of the important output files as it contains the windows created to search reads and which acceptor +reads are exchanged for which donor reads. The context data is preceded by a header starting with a #. The default +output name for the variant context file is _varcon.txt_.

Variant context file columns: * __ContextId:__ The identifier of the context (consists of the chromosome name and variant positions connected with @@ -65,8 +68,15 @@ an '_') * __AcceptorReadIds:__ The read identifiers (one per mate pair) overlapping with the variant context. * __DonorReadsIds:__ The read identifiers (one per mate pair) overlapping with the variant context. -The first line in each variant context file is the uuid of the VaSeBuilder run that constructed the variant contexts. -

+The first line in each variant context file is the uuid of the VaSeBuilder run that constructed the variant contexts. A +variant context file looks like:
+ +_#VBUUID: {uuid}
+\#ContextId  DonorSample  Chrom  Origin  Start  End  AcceptorContextLength + DonorContextLength  AcceptorReads  DonorReads  ADratio  AcceptorReadsIds + DonorReadIds
+1_100  Sample1  1  100  50  150  75  75  2  2  1.0 + aReadId1,aReadId2  dReadId1,dReadId2

_ ### Variant context statistics file @@ -83,15 +93,30 @@ _1  100  1000  1_500
## Debug output files -### Acceptor contexts file -### Acceptor context read positions file -### Acceptor context statistics file -### Acceptor context unmapped read mates file -### Donor contexts file -### Donor context read positions file -### Donor context statistics file -### Donor context unmapped read mates file -### Variant context acceptor read positions file -### Variant context donor read positions file -### Variant context unmapped acceptor read mates file -### Variant context unmapped donor read mate file +When debug is set to True multiple extra files are outputted as well providing more in depth information.

+ + +### Acceptor/Donor contexts file +The acceptor/donor context file is essentially a variant context but for acceptor or donor contexts respectively. The +acceptor/donor context file differs in the number of columns and only has two additional columns after ContextId - End, + namely NumOfReads and ReadIds. +

+ +### Acceptor/Donor context read positions file + + +### Acceptor/Donor context statistics file + + +### Acceptor/Donor context unmapped read mates file + + +### Variant context acceptor/donor read positions file + + +### Variant context unmapped acceptor/donor read mates file +Records the identifiers of reads that have an unmapped read mate in a tab separated manner. The file contains three +columns: Context identifier, sample name/identifier, read identifiers.

+ +_#ContextId  SampleId  ReadIds
+1_100 Sample1  uReadId1,uReadId2,uReadId3_ From e586155a5a1da5244fd1cd820dfab3994d597073 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 25 Sep 2019 15:24:27 +0200 Subject: [PATCH 45/90] More to output format doc --- docs/output_formats.md | 64 +++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/docs/output_formats.md b/docs/output_formats.md index 714a38c..b440231 100644 --- a/docs/output_formats.md +++ b/docs/output_formats.md @@ -7,15 +7,19 @@ when running specific modes.

### Donor alignment files -A list of paths to donor alignment (BAM/CRAM) files used in building the variant contexts. +A list of paths to donor alignment (BAM/CRAM) files used in building the variant contexts. A list could be:
+_/path/to/donor1.bam
+/path/to/donor2.cram
+/path/to/donor3.bam_

### Donor variant files -A list of paths to variant (VCF/BCF) files used in building the variant contexts. A list could be:

-_/path/to/donor1.bam
-/path/to/donor2.cram
-/path/to/donor3.bam_

+A list of paths to donor variant (VCF/BCF) files used in building the variant contexts. A list could be:

+_/path/to/donor1.vcf
+/path/to/donor2.bcf
+/path/to/donor3.vcf_ +

### Log file @@ -24,13 +28,15 @@ The log file contains four fields separated by a tab: Date and time, name of the message. The first two lines of the log always display the issued command to run VaSeBuilder and the VaSeBuilder uuid and creation date and time A VaSeBuilder log file therefore follows the format:
-YYYY-MM-DD  HH:MM:SS,SSS  INFO  python vase.py -c test.cfg
+_YYYY-MM-DD  HH:MM:SS,SSS  INFO  python vase.py -c test.cfg
YYYY-MM-DD  HH:MM:SS,SSS  INFO  VaSeBuilder: {uuid} ; YYYY-MM-DD HH:MM:SS,SSS
-YYYY-MM-DD  HH:MM:SS,SSS  INFO  Running VaSeBuilder in F-mode +YYYY-MM-DD  HH:MM:SS,SSS  INFO  Running VaSeBuilder in F-mode_

+ ### P-mode link file -When VaSeBuilder is run in P-mode, an extra file is created consisting of three columns: +When VaSeBuilder is run in P-mode, donor fastq files are outputted for each variant context. To link the variant +context file and the outputted donor fastq files, an extra file is created consisting of three columns:
_contextid_1  /path/to/fastq_R1.fq  /path/to/fastq_R1.fq
contextid_2  /path/to/fastq_R2.fq  /path/to/fastq_R2.fq_
@@ -42,8 +48,8 @@ is used to the variant context file and the fastq files for a certain P-mode run ### Validation fastq files Validation fastq files are valid fastq based on the provided template fastq files with acceptor and donor reads exchanged based on the variant contexts. Currently (September 24th, 2019) donor reads are added at the end of the -validation fastq files.

- +validation fastq files. +

### Variant context file @@ -76,7 +82,8 @@ _#VBUUID: {uuid}
 DonorContextLength  AcceptorReads  DonorReads  ADratio  AcceptorReadsIds  DonorReadIds
1_100  Sample1  1  100  50  150  75  75  2  2  1.0 - aReadId1,aReadId2  dReadId1,dReadId2

_ + aReadId1,aReadId2  dReadId1,dReadId2_ +

### Variant context statistics file @@ -93,25 +100,48 @@ _1  100  1000  1_500
## Debug output files -When debug is set to True multiple extra files are outputted as well providing more in depth information.

+When debug is set to True multiple extra files are outputted as well providing more in depth information. +

### Acceptor/Donor contexts file -The acceptor/donor context file is essentially a variant context but for acceptor or donor contexts respectively. The -acceptor/donor context file differs in the number of columns and only has two additional columns after ContextId - End, - namely NumOfReads and ReadIds. -

+The acceptor/donor context file is essentially a variant context file but for acceptor or donor contexts respectively. +The acceptor/donor context file differs in the number of columns and only has two additional columns after ContextId - +End, namely NumOfReads and ReadIds. -### Acceptor/Donor context read positions file +_VBUUID: {uuid}
+\#ContextId  DonorSample  Chrom  Origin  Start  End  AcceptorContextLength + DonorContextLength  NumOfReads  ReadIds
+1_100  Sample1  1  100  50  150  2  dReadId1,dReadId2_ +

### Acceptor/Donor context statistics file +Similar to the variant context statistics file, the acceptor/donor context statistics file contains basic statistics of +the reads overlapping with acceptor or donor contexts. +

+ + +### Acceptor/Donor context read positions file +The read positions contains the leftmost genomic positions of all R1 reads and rightmost genomic positions of all R2 +reads in all acceptor or donor contexts. These positions can be plotted, if needed, to gain more insight into the outer +ranges of the reads associated with a variant context. The file contains three columns: context identifier, left +positions, right positions. Each leftmost and rightmost position is separated by a comma.

+_#ContextId  LeftPos  RightPos
+1_100  25,50,75  75,50,25_ +

### Acceptor/Donor context unmapped read mates file +Reads overlapping with the acceptor/donor contexts with unmapped read mates encountered during a VaSeBuilder run are +written the the acceptor_unmapped.txt or donor_unmapped.txt +

### Variant context acceptor/donor read positions file +Much like the acceptor/donor read positions the variant context acceptor positions and variant context donor positions +files save the leftmost R1 and rightmost R2 acceptor and donor read positions respectively. +

### Variant context unmapped acceptor/donor read mates file From 7fbae5b877e450dd70f1f54b47b88bfefd4a7b69 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 25 Sep 2019 17:02:23 +0200 Subject: [PATCH 46/90] Further descriptions to input and output formats --- docs/input_formats.md | 12 +++++--- docs/output_formats.md | 63 ++++++++++++++++++++++++++---------------- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/docs/input_formats.md b/docs/input_formats.md index 55d003d..5c6f83f 100644 --- a/docs/input_formats.md +++ b/docs/input_formats.md @@ -10,7 +10,8 @@ The donor variant list file should contain a list of paths to a variant, VCF or The donor alignment list file should be similar and contain one path to a donor alignment file per file line. The files listed will be used as donor variant and alignments files respectively when constructing variant contexts. Only paths to the variant/alignment files should be provided, not the index files. These are assumed to be in the same -directory with the same name, i.e.: sample.bam & sample.bam.bai

+directory with the same name, i.e.: sample.bam & sample.bam.bai +

### Donor fastq list file @@ -22,7 +23,8 @@ _/path/to/sample1_R1.fq  /path/to/sample1_R2.fq
/path/to/sample2_R1.fq  /path/to/sample2_R2.fq
/path/to/sample3_R1.fq  /path/to/sample3_R2.fq_ -Each row in this file therefore represents a single sample.

+Each row in this file therefore represents a single sample. +

### Variant list file @@ -34,12 +36,14 @@ The variant list file needs at least three columns: sample name/identifier, chro _Sample1  1  101
Sample2  2  202
_ -(In the future two more columns might be required: variant reference allele, variant alternative allele)

+(In the future two more columns might be required: variant reference allele, variant alternative allele) +

### Variant context file A VaSeBuilder variant context file can serve as an input file when running AC, DC, FC or PC mode. Please see the output -files description for the variant context file for it's structure.

+files description for the variant context file for it's structure. +

### Configuration file diff --git a/docs/output_formats.md b/docs/output_formats.md index b440231..1b825ac 100644 --- a/docs/output_formats.md +++ b/docs/output_formats.md @@ -7,7 +7,8 @@ when running specific modes.

### Donor alignment files -A list of paths to donor alignment (BAM/CRAM) files used in building the variant contexts. A list could be:
+A list of paths to donor alignment (BAM/CRAM) files used in building the variant contexts. A list could be: +

_/path/to/donor1.bam
/path/to/donor2.cram
/path/to/donor3.bam_ @@ -36,17 +37,19 @@ YYYY-MM-DD  HH:MM:SS,SSS  INFO  Running VaSeBuilder in F-mode_ ### P-mode link file When VaSeBuilder is run in P-mode, donor fastq files are outputted for each variant context. To link the variant -context file and the outputted donor fastq files, an extra file is created consisting of three columns:
- +context file and the outputted donor fastq files, an extra file is created consisting of three columns: +

_contextid_1  /path/to/fastq_R1.fq  /path/to/fastq_R1.fq
-contextid_2  /path/to/fastq_R2.fq  /path/to/fastq_R2.fq_
+contextid_2  /path/to/fastq_R2.fq  /path/to/fastq_R2.fq_ +
Each line represents a variant context. The first line in the P-mode link file is the uuid of the VaSeBuilder run. This -is used to the variant context file and the fastq files for a certain P-mode run.

+is used to the variant context file and the fastq files for a certain P-mode run. +

### Validation fastq files -Validation fastq files are valid fastq based on the provided template fastq files with acceptor and donor reads +Validation fastq files are valid fastq files based on the provided template fastq files with acceptor and donor reads exchanged based on the variant contexts. Currently (September 24th, 2019) donor reads are added at the end of the validation fastq files.

@@ -56,7 +59,7 @@ validation fastq files. The variant context file is a tab separated file containing the essential data of the variant contexts in 13 columns. This file is one of the important output files as it contains the windows created to search reads and which acceptor reads are exchanged for which donor reads. The context data is preceded by a header starting with a #. The default -output name for the variant context file is _varcon.txt_.

+output name for the variant context file is _varcon.txt_.
Variant context file columns: * __ContextId:__ The identifier of the context (consists of the chromosome name and variant positions connected with @@ -88,15 +91,23 @@ _#VBUUID: {uuid}
### Variant context statistics file The variant context statistics file contains some statistics about the reads and mates overlapping with the variant -contexts.

+contexts. Per variant context the read statistics consists, in order, of the average and median read length, q-score +and mapq values for acceptor and donor reads overlapping with the variant context. +

+_#ContextId  Avg_ALen  Avg_DLen  Med_ALen  Med_DLen  Avg_AQual  Avg_DQual  Med_AQual + Med_DQual  Avg_AMapQ  Avg_DMapQ  Med_AMapQ  Med_DMapQ
+1_100  151  151  151.0  151.0  36.5  35.6  37.0  38.0  54.6  55.7 + 60.0  60.0_ +

### Variant context bed file A BED file with the variant context entries in four columns: Chromosome name, context start, context end, context -identifier. The output name for the BED file is variantcontexts.bed. The resulting file looks like:
+identifier. The output name for the BED file is _variantcontexts.bed_. The resulting file looks like:
_1  100  1000  1_500
-2  200  2000  2_1000_

+2  200  2000  2_1000_ +

## Debug output files @@ -104,6 +115,20 @@ When debug is set to True multiple extra files are outputted as well providing m

+### Variant context acceptor/donor read positions file +Much like the acceptor/donor read positions the variant context acceptor positions and variant context donor positions +files save the leftmost R1 and rightmost R2 acceptor and donor read positions respectively. +

+ + +### Variant context unmapped acceptor/donor read mates file +Records the identifiers of reads that have an unmapped read mate in a tab separated manner. The file contains three +columns: Context identifier, sample name/identifier, read identifiers.

+ +_#ContextId  SampleId  ReadIds
+1_100 Sample1  uReadId1,uReadId2,uReadId3_ + + ### Acceptor/Donor contexts file The acceptor/donor context file is essentially a variant context file but for acceptor or donor contexts respectively. The acceptor/donor context file differs in the number of columns and only has two additional columns after ContextId - @@ -120,6 +145,10 @@ _VBUUID: {uuid}
Similar to the variant context statistics file, the acceptor/donor context statistics file contains basic statistics of the reads overlapping with acceptor or donor contexts.

+_#ContextId &empsp;Avg_ReadLen  MedReadLen  AvgReadQual  Med_ReadQual  AvgReadMapQ + MedReadMapQ
+1_100  151  151.0  37.9  39.0  57.1  60.0_ +

### Acceptor/Donor context read positions file @@ -136,17 +165,3 @@ _#ContextId  LeftPos  RightPos
Reads overlapping with the acceptor/donor contexts with unmapped read mates encountered during a VaSeBuilder run are written the the acceptor_unmapped.txt or donor_unmapped.txt

- - -### Variant context acceptor/donor read positions file -Much like the acceptor/donor read positions the variant context acceptor positions and variant context donor positions -files save the leftmost R1 and rightmost R2 acceptor and donor read positions respectively. -

- - -### Variant context unmapped acceptor/donor read mates file -Records the identifiers of reads that have an unmapped read mate in a tab separated manner. The file contains three -columns: Context identifier, sample name/identifier, read identifiers.

- -_#ContextId  SampleId  ReadIds
-1_100 Sample1  uReadId1,uReadId2,uReadId3_ From 5972bc5c25718755d40cf4774e8154b5561e3f6e Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 27 Sep 2019 15:45:13 +0200 Subject: [PATCH 47/90] Add method to semi randomly distribute donor reads, add extra seed parameters, add method to check template fastq size --- VaSeBuilder.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- vase.py | 2 ++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 7670c1a..4e2dd8c 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2028,7 +2028,7 @@ def write_pmode_linkfile(self, outpath, context_fqs_links): except IOError: self.vaselogger.warning(f"Could not write P-mode link file {plinkloc} for run {self.creation_id}") - def shuffle_donor_reads(self, donorreads, s=2): + def shuffle_donor_read_identifiers(self, donorreads, s=2): """Shuffles and returns a list of donor read identifiers. Prior to shuffling the donor read identifiers, the provided donor read list is copied to preserve the original. @@ -2040,7 +2040,7 @@ def shuffle_donor_reads(self, donorreads, s=2): donorreads : Donor read identifiers to shuffle s: int - Seed to set for shuffling + Seed to set for shuffling (default = 2) Returns ------- @@ -2054,6 +2054,33 @@ def shuffle_donor_reads(self, donorreads, s=2): random.shuffle(shuffled_donor_reads) return shuffled_donor_reads + def shuffle_donor_add_positions(self, num_of_template_reads, num_of_donor_reads, s=2): + """Shuffle positions to + + Parameters + ---------- + num_of_template_reads : int + Number of template reads in the acceptor + num_of_donor_reads : int + Number of donor reads to be added + s : int + Seed to set for shuffling (default = 2) + + Returns + ------- + shuffled_add_positions : list of int + Shuffled positions in fastq file to add donor reads to + """ + # Establish the number of possible entries + shuffled_add_positions = list(range(0, num_of_template_reads, 1)) + + # Semi randomly select a number of possible donor read insert positions + random.seed(s) + self.vaselogger.debug(f"Semi random donor add positions seed set to {s}") + random.shuffle(shuffled_add_positions) + return shuffled_add_positions + + def write_donor_output_bam(self, bamoutpath, donorreads): """Writes a set of donor reads as a bam file. @@ -2082,3 +2109,22 @@ def read_is_hard_clipped(self, fetchedread): if fetchedread.cigarstring is not None: return "H" in fetchedread.cigarstring return False + + def check_template_size(self, templatefqloc): + """Returns the number of + + Parameters + ---------- + templatefqloc : str + Path to acceptor fastq file to check + + Returns + ------- + int + Number of read entries in the template fastq file + """ + line_count = 0 + with open(templatefqloc, "r") as templatefq: + for fileline in templatefq: + line_count += 1 + return int(line_count/4) diff --git a/vase.py b/vase.py index a30b621..58d60d9 100644 --- a/vase.py +++ b/vase.py @@ -165,6 +165,8 @@ def get_vase_parameters(self): vase_argpars.add_argument("-dq", "--donorfastqs", dest="donorfastqs", help="Location to donor fastq list file") vase_argpars.add_argument("-c", "--config", dest="configfile", help="Supply a config file") + vase_argpars.add_argument("-s", "--seed", dest="seed", + help="Set seed for semi randomly distributing donor reads") vase_args = vars(vase_argpars.parse_args()) return vase_args From a5b3368cbb6261b7c8e9d0a291debbd2717f3923 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 30 Sep 2019 15:48:06 +0200 Subject: [PATCH 48/90] Start implementation of semi random donor read distribution for F-mode, add checks in ParamCheck for seed parameter --- ParamChecker.py | 23 ++- VaSeBuilder.py | 377 +++++++++++++++++++++++++++++++++--------------- vase.py | 17 ++- 3 files changed, 296 insertions(+), 121 deletions(-) diff --git a/ParamChecker.py b/ParamChecker.py index f26e2de..e272c92 100644 --- a/ParamChecker.py +++ b/ParamChecker.py @@ -56,6 +56,7 @@ def __init__(self): self.runmode = "" self.varconin = "" self.donorfqlist = "" + self.random_seed = 2 self.required_mode_parameters = {"AC": ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin", "out"], "D": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"], @@ -69,7 +70,7 @@ def __init__(self): "PC": ["runmode", "donorvcf", "donorbam", "out", "reference", "varconin"], "X": ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"]} - self.optional_parameters = ["fastqout", "varcon", "variantlist"] + self.optional_parameters = ["fastqout", "varcon", "variantlist", "seed"] def required_parameters_set(self, runmode, vase_arg_vals): """Checks and returns whether all required run mode parameters have been set. @@ -344,7 +345,7 @@ def check_parameters(self, vase_arg_vals): self.vaselogger.warning("Variant list parameter used but supplied variant list file " f"{vase_arg_vals[param]} does not exist") - # Checks if the provided donor fastq list file. + # Checks if the provided donor fastq list file exists. if param == "donorfastqs": if vase_arg_vals[param] is not None: if os.path.isfile(vase_arg_vals[param]): @@ -352,6 +353,14 @@ def check_parameters(self, vase_arg_vals): else: self.vaselogger.warning("Donor fastqs parameter used but supplied donorfastq list file" f"{vase_arg_vals[param]} does not exist") + + # Checks if the provided seed value is an integer or float + if param == "seed": + if vase_arg_vals[param] is not None: + if type(vase_arg_vals[param]) is int or type(vase_arg_vals[param]) is float: + self.random_seed = vase_arg_vals[param] + else: + self.random_seed = 2 return True # Only checks whether the required runmode parameters are ok using 'check_parameters()' @@ -583,6 +592,16 @@ def get_runmode(self): """ return self.runmode + def get_random_seed_value(self): + """Returns the random seed value. + + Returns + ------- + self.random_seed : int or float + Value to use as seed for semi random distribution of donor reads of validation fastq files + """ + return self.random_seed + def get_variantcontext_infile(self): """Returns path to the variant context input file. diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 4e2dd8c..853ce53 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -38,8 +38,8 @@ def __init__(self, vaseid): self.creation_id = str(vaseid) self.creation_time = datetime.now() self.vaselogger.info( - f"VaSeBuilder: {self.creation_id} ; {self.creation_time}" - ) + f"VaSeBuilder: {self.creation_id} ; {self.creation_time}" + ) # VariantContextFile that saves the acceptor, donor, and variant contexts with their associated data. self.contexts = VariantContextFile() @@ -115,9 +115,9 @@ def build_varcon_set(self, sampleidlist, # Open the acceptor bam, or exit if this fails. try: acceptorbamfile = pysam.AlignmentFile( - acceptorbamloc, - reference_filename=reference_loc - ) + acceptorbamloc, + reference_filename=reference_loc + ) except IOError: self.vaselogger.critical("Could not open acceptor BAM file. " "Exitting.") @@ -134,9 +134,9 @@ def build_varcon_set(self, sampleidlist, if sampleid in variant_list: sample_variant_filter = variant_list[sampleid] samplevariants = self.get_sample_vcf_variants( - vcfsamplemap[sampleid], - sample_variant_filter - ) + vcfsamplemap[sampleid], + sample_variant_filter + ) # Skip this sample if all of its variants got filtered out. if not samplevariants: @@ -148,9 +148,9 @@ def build_varcon_set(self, sampleidlist, # for this sample, or skip this samples if this fails. try: bamfile = pysam.AlignmentFile( - bamsamplemap[sampleid], - reference_filename=reference_loc - ) + bamsamplemap[sampleid], + reference_filename=reference_loc + ) self.vaselogger.debug("Opened BAM file " f"{bamsamplemap[sampleid]}") except IOError: @@ -196,13 +196,13 @@ def build_varcon_set(self, sampleidlist, self.debug_msg("dr", variantid) t0 = time.time() donor_context_reads = ( - self.get_variant_reads(variantid, - vchr, - *searchwindow, - bamfile, - True, - donor_unmapped) - ) + self.get_variant_reads(variantid, + vchr, + *searchwindow, + bamfile, + True, + donor_unmapped) + ) self.debug_msg("dr", variantid, t0) # If no donor reads were found at this position, @@ -211,7 +211,7 @@ def build_varcon_set(self, sampleidlist, self.vaselogger.info( f"No reads found for variant {variantid}" f" in donor {sampleid}; Skipping." - ) + ) continue # Determine the donor context based on @@ -220,7 +220,7 @@ def build_varcon_set(self, sampleidlist, t0 = time.time() donor_context = ( self.determine_context(donor_context_reads, vpos, vchr) - ) + ) self.debug_msg("dc", variantid, t0) # === ACCEPTOR ============================================ @@ -229,24 +229,24 @@ def build_varcon_set(self, sampleidlist, self.debug_msg("ar", variantid) t0 = time.time() acceptor_context_reads = ( - self.get_variant_reads(variantid, - vchr, - *searchwindow, - acceptorbamfile, - True, - acceptor_unmapped) - ) + self.get_variant_reads(variantid, + vchr, + *searchwindow, + acceptorbamfile, + True, + acceptor_unmapped) + ) self.debug_msg("ar", variantid, t0) # Only perform the following if no reads were # found in the acceptor at the variant location. if not acceptor_context_reads: self.vaselogger.warning( - f"No reads found for variant {variantid} in " - "acceptor. Acceptor and donor sequencing may " - "have been performed with different methods. " - "Proceeding anyway." - ) + f"No reads found for variant {variantid} in " + "acceptor. Acceptor and donor sequencing may " + "have been performed with different methods. " + "Proceeding anyway." + ) # Create a dummy list of reads. acceptor_context_reads = None # Temporarily set acceptor context equal to donor. @@ -263,7 +263,7 @@ def build_varcon_set(self, sampleidlist, self.determine_context(acceptor_context_reads, vpos, vchr) - ) + ) self.debug_msg("ac", variantid, t0) # === COMBINED CONTEXT ==================================== @@ -273,20 +273,20 @@ def build_varcon_set(self, sampleidlist, self.debug_msg("cc", variantid) t0 = time.time() variant_context = ( - self.determine_largest_context(vpos, - acceptor_context, - donor_context) - ) + self.determine_largest_context(vpos, + acceptor_context, + donor_context) + ) self.debug_msg("cc", variantid, t0) # If this widest context overlaps an # existing variant context, skip it. if self.contexts.context_collision(variant_context): self.vaselogger.debug( - f"Variant context {variantid} overlaps " - "with an already existing variant context; " - "Skipping." - ) + f"Variant context {variantid} overlaps " + "with an already existing variant context; " + "Skipping." + ) continue # Obtain all donor reads overlapping the @@ -294,14 +294,14 @@ def build_varcon_set(self, sampleidlist, self.debug_msg("cdr", variantid) t0 = time.time() variant_context_donor_reads = ( - self.get_variant_reads(variantid, - variant_context[0], - variant_context[2], - variant_context[3], - bamfile, - True, - varcon_unmapped_d) - ) + self.get_variant_reads(variantid, + variant_context[0], + variant_context[2], + variant_context[3], + bamfile, + True, + varcon_unmapped_d) + ) self.debug_msg("cdr", variantid, t0) # Obtain all acceptor reads overlapping the @@ -309,14 +309,14 @@ def build_varcon_set(self, sampleidlist, self.debug_msg("car", variantid) t0 = time.time() variant_context_acceptor_reads = ( - self.get_variant_reads(variantid, - variant_context[0], - variant_context[2], - variant_context[3], - acceptorbamfile, - True, - varcon_unmapped_a) - ) + self.get_variant_reads(variantid, + variant_context[0], + variant_context[2], + variant_context[3], + acceptorbamfile, + True, + varcon_unmapped_a) + ) self.debug_msg("car", variantid, t0) # If still no acceptor reads were found in the combined @@ -327,45 +327,45 @@ def build_varcon_set(self, sampleidlist, # Add the combined, donor, and acceptor contexts along # with their reads to the current list of contexts. self.contexts.add_variant_context( - variantid, - sampleid, - *variant_context, - variant_context_acceptor_reads, - variant_context_donor_reads - ) + variantid, + sampleid, + *variant_context, + variant_context_acceptor_reads, + variant_context_donor_reads + ) self.contexts.add_donor_context( - variantid, - sampleid, - *donor_context, - donor_context_reads - ) + variantid, + sampleid, + *donor_context, + donor_context_reads + ) self.contexts.add_acceptor_context( - variantid, - sampleid, - *acceptor_context, - acceptor_context_reads - ) + variantid, + sampleid, + *acceptor_context, + acceptor_context_reads + ) # Add the read identifiers of reads with # unmapped mates. self.contexts.set_unmapped_acceptor_mate_ids( - variantid, - varcon_unmapped_a - ) + variantid, + varcon_unmapped_a + ) self.contexts.set_unmapped_donor_mate_ids( - variantid, - varcon_unmapped_d - ) + variantid, + varcon_unmapped_d + ) self.contexts.set_acceptor_context_unmapped_mate_ids( - variantid, - acceptor_unmapped - ) + variantid, + acceptor_unmapped + ) self.contexts.set_donor_context_unmapped_mate_ids( - variantid, - donor_unmapped - ) + variantid, + donor_unmapped + ) except IOError: self.vaselogger.warning("Could not obtain BAM reads from " @@ -469,7 +469,7 @@ def build_donor_from_varcon(self, varc_file, bamsamplemap, reference_loc): context.variant_context_start, context.variant_context_end, bamfile) - ) + ) # Close the sample's BAM file when done. bamfile.close() return @@ -505,8 +505,8 @@ def build_validation_set(self, run_mode, acceptor_bam, if "F" in run_mode: # Set up a set of all acceptor fastq reads to skip. skip_list = set( - self.contexts.get_all_variant_context_acceptor_read_ids() - ) + self.contexts.get_all_variant_context_acceptor_read_ids() + ) for i, fq_i in zip(["1", "2"], [fq1_in, fq2_in]): # Write the fastq files. @@ -662,9 +662,9 @@ def determine_indel_read_range(self, variantpos, variantref, variantalts): """ searchstart = variantpos searchstop = variantpos + max( - max([len(x) for x in variantref.split(",")]), - max([len(x) for x in variantalts]) - ) + max([len(x) for x in variantref.split(",")]), + max([len(x) for x in variantalts]) + ) return [searchstart, searchstop] def get_variant_reads(self, contextid, variantchrom, @@ -1093,7 +1093,7 @@ def get_creation_id(self): # Returns the date the current VaSeBuilder object has been made. # def get_creation_date(self): - # return self.creation_date + # return self.creation_date def get_creation_time(self): """Returns the date and time the current VaSeBuilder object has been made. @@ -1188,22 +1188,22 @@ def write_optional_output_files(self, outpath, contextfile): self.vaselogger.debug("Writing acceptor context statistics to " f"{outpath}acceptorcontextstats.txt") contextfile.write_acceptor_context_stats( - f"{outpath}acceptorcontextstats.txt" - ) + f"{outpath}acceptorcontextstats.txt" + ) self.vaselogger.debug("Writing acceptor context read identifiers with " "unmapped mates to" f"mates to {outpath}acceptor_unmapped.txt") contextfile.write_acceptor_unmapped_mates( - f"{outpath}acceptor_unmapped.txt" - ) + f"{outpath}acceptor_unmapped.txt" + ) self.vaselogger.debug("Writing left and right most read positions of " "each acceptor context to " f"{outpath}acceptor_positions.txt") contextfile.write_acceptor_left_right_positions( - f"{outpath}acceptor_positions.txt" - ) + f"{outpath}acceptor_positions.txt" + ) # Write the optional donor context files; donor contexts, # read ids with unmapped mate and left/right positions. @@ -1224,8 +1224,8 @@ def write_optional_output_files(self, outpath, contextfile): "of each donor context to " f"{outpath}donor_positions.txt") contextfile.write_donor_left_right_positions( - f"{outpath}donor_positions.txt" - ) + f"{outpath}donor_positions.txt" + ) # Write the optional variant context files; acceptor & donor # unmapped mates and left/right positions. @@ -1233,33 +1233,33 @@ def write_optional_output_files(self, outpath, contextfile): "identifiers with unmapped mates to " f"{outpath}varcon_unmapped_acceptor.txt") contextfile.write_reads_with_unmapped_mate( - "acceptor", - f"{outpath}varcon_unmapped_acceptor.txt" - ) + "acceptor", + f"{outpath}varcon_unmapped_acceptor.txt" + ) self.vaselogger.debug("Writing variant context donor read identifiers " "with unmapped mates to " f"{outpath}varcon_unmapped_donor.txt") contextfile.write_reads_with_unmapped_mate( - "donor", - f"{outpath}varcon_unmapped_donor.txt" - ) + "donor", + f"{outpath}varcon_unmapped_donor.txt" + ) self.vaselogger.debug("Writing variant context left and right most " "read positions of acceptor reads to " f"{outpath}varcon_positions_acceptor.txt") contextfile.write_left_right_positions( - "acceptor", - f"{outpath}varcon_positions_acceptor.txt" - ) + "acceptor", + f"{outpath}varcon_positions_acceptor.txt" + ) self.vaselogger.debug("Writing variant context left and right most " "read positions of donor reads to " f"{outpath}varcon_positions_donor.txt") contextfile.write_left_right_positions( - "donor", - f"{outpath}varcon_positions_donor.txt" - ) + "donor", + f"{outpath}varcon_positions_donor.txt" + ) def write_bed_file(self, variantcontextdata, bedoutloc): """Writes variant contexts as a BED file @@ -2068,18 +2068,31 @@ def shuffle_donor_add_positions(self, num_of_template_reads, num_of_donor_reads, Returns ------- - shuffled_add_positions : list of int + add_positions : list of int Shuffled positions in fastq file to add donor reads to """ # Establish the number of possible entries shuffled_add_positions = list(range(0, num_of_template_reads, 1)) - # Semi randomly select a number of possible donor read insert positions random.seed(s) self.vaselogger.debug(f"Semi random donor add positions seed set to {s}") - random.shuffle(shuffled_add_positions) - return shuffled_add_positions - + add_positions = [] + + # Check whether the number of donor reads to add exceeds the number of acceptor reads in the template. + if num_of_donor_reads > num_of_template_reads: + self.vaselogger.debug("Found more donor reads to add than") + random_sample_size = num_of_donor_reads + while random_sample_size > 0: + pos_to_add = [] + if random_sample_size >= num_of_template_reads: + pos_to_add = random.sample(shuffled_add_positions, num_of_template_reads) + else: + pos_to_add = random.sample(shuffled_add_positions, random_sample_size) + add_positions.extend(pos_to_add) + random_sample_size = random_sample_size - num_of_template_reads + return add_positions + add_positions = random.sample(shuffled_add_positions, num_of_donor_reads) + return add_positions def write_donor_output_bam(self, bamoutpath, donorreads): """Writes a set of donor reads as a bam file. @@ -2128,3 +2141,133 @@ def check_template_size(self, templatefqloc): for fileline in templatefq: line_count += 1 return int(line_count/4) + + def build_fastq_v2(self, acceptorfq_filepaths, acceptorreads_toskip, donor_context_reads, forward_or_reverse, + vasefq_outpath): + """Builds and writes a set of validation fastq files. + + A set of validation fastq files is build using a set of template/acceptor fastq files. Acceptor reads will be + filtered from these file via read identifier and donor reads will be added. Donor readsd reads will be added at + semi random positions. The random.seed() method is used to ensure reproducibility as using the same data (in + the same order) and the same seed results in the same shuffle. + + Parameters + ---------- + acceptorfq_filepaths : list of str + Paths to template fastq files to use + acceptorreads_toskip : list of str + Identifiers of acceptor reads to skip + donor_context_reads : list of tuple + Donor reads to add to the validation fastq files + forward_or_reverse : str + Write forward ('1') or reverse ('2') + vasefq_outpath : + Path to write VaSeBuilder validation fastq files to + """ + # Split all donor reads to add over the template fastq files + donor_read_ids = [x[0] for x in donor_context_reads] + distributed_read_ids = self.divide_donorfastqs_over_acceptors(donor_read_ids, len(acceptorfq_filepaths)) + + # Iterate over the R1/R2 fastq in files to use as templates for the + for x in range(0, len(acceptorfq_filepaths)): + # Collect the donor reads to write. + add_donor_ids = distributed_read_ids[x] + add_donor_reads = [x for x in donor_context_reads if x[0] in add_donor_ids] + + # Write the new VaSe FastQ file. + vasefq_outname = self.set_fastq_out_path(vasefq_outpath, forward_or_reverse, x + 1) + self.vaselogger.debug(f"Set FastQ output path to: {vasefq_outname}") + self.write_vase_fastq_v2(acceptorfq_filepaths[x], vasefq_outname, + acceptorreads_toskip, add_donor_reads, + forward_or_reverse) + + # Builds a new FastQ file to be used for validation. + def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, + acceptorreads_toskip, donorbamreaddata, + fr): + """Creates and writes a single VaSeBuilder validation fastq file. + + Parameters + ---------- + acceptor_infq : str + Path to + fastq_outpath : str + Path to write VaSeBuilder vaildation fastq file to + acceptorreads_toskip : list of str + Acceptor read identifiers to exclude from the validation fastq file + donorbamreaddata : list of tuple + Donor reads to writes + fr : str + Forward('1') or reverse ('2') fastq file + """ + try: + fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb")) + self.vaselogger.debug(f"Opened template FastQ: {acceptor_infq}") + + cur_line_index = 0 # Current read position in the template fastq + cur_add_index = 0 # Current read postion in the validation fastq + + # Determine where to semi randomly add the donor reads in the fastq + num_of_template_reads = self.check_template_size(acceptor_infq) + donor_add_positions = self.shuffle_donor_add_positions(num_of_template_reads, len(donorbamreaddata)) + donor_reads_to_addpos = self.link_donor_addpos_reads(donor_add_positions, donorbamreaddata) + + # Open the template fastq and write filtered data to a new fastq.gz file. + fqgz_infile = io.BufferedReader(gzip.open(acceptor_infq, "rb")) + for fileline in fqgz_infile: + + # Check if we are located at a read identifier. + if fileline.startswith(b"@"): + if fileline.decode("utf-8").split()[0][1:] not in acceptorreads_toskip: + fqgz_outfile.write(fileline) + fqgz_outfile.write(next(fqgz_infile)) + fqgz_outfile.write(next(fqgz_infile)) + fqgz_outfile.write(next(fqgz_infile)) + cur_add_index += 1 + + # Check if we need to add a donor read at the current position + if cur_line_index in donor_reads_to_addpos: + for donorread in donor_reads_to_addpos[cur_line_index]: + if donorread[1] == fr: + fqlines = ("@" + str(donorread[0]) + "\n" + + str(donorread[2]) + "\n" + + "+\n" + + str(donorread[3])) + fqgz_outfile.write(fqlines.encode("utf-8")) + cur_add_index += 1 + self.vaselogger.debug(f"Donor read added at {cur_add_index}") + cur_line_index += 1 + fqgz_infile.close() + + fqgz_outfile.flush() + fqgz_outfile.close() + + except IOError as ioe: + if ioe.filename == acceptor_infq: + self.vaselogger.critical("The supplied template FastQ file " + "could not be found.") + if ioe.filename == fastq_outpath: + self.vaselogger.critical("A FastQ file could not be written " + "to the provided output location.") + exit() + + def link_donor_addpos_reads(self, donor_addpos, donor_reads): + """Links and returns donor add positions and donor reads. + + Parameters + ---------- + donor_addpos : list of int + Positions in validation fastq to add donor reads at + donor_reads : list of tuple + Donor reads to add to validation fastq + + Returns + ------- + add_posread_link : dict of list + """ + add_posread_link = {} + for addpos, dread in zip(donor_addpos, donor_reads): + if addpos not in add_posread_link: + add_posread_link[addpos] = [] + add_posread_link[addpos].append(dread) + return add_posread_link diff --git a/vase.py b/vase.py index 58d60d9..ee997e7 100644 --- a/vase.py +++ b/vase.py @@ -165,7 +165,7 @@ def get_vase_parameters(self): vase_argpars.add_argument("-dq", "--donorfastqs", dest="donorfastqs", help="Location to donor fastq list file") vase_argpars.add_argument("-c", "--config", dest="configfile", help="Supply a config file") - vase_argpars.add_argument("-s", "--seed", dest="seed", + vase_argpars.add_argument("-s", "--seed", dest="seed", default=2, help="Set seed for semi randomly distributing donor reads") vase_args = vars(vase_argpars.parse_args()) return vase_args @@ -264,8 +264,21 @@ def read_donor_fastq_list_file(self, donorfq_listfileloc): return donor_fastqs def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): - """Selects and runs the selected runmode. + """Selects and runs the selected run mode. + Depending on the specified run mode either a full validation set of fastq files or fastq files containing only + donor data are produced. If the run mode contains a 'C' an existing variant context file will be read, + otherwise it will be build. + + Parameters + ---------- + runmode : str + + vaseb : VaSeBuilder + VaSeBuilder object that will perform the actions + paramcheck : ParamChecker + Utility tha checks whether the parameters are ok + variantfilter """ vbscan = VcfBamScanner() varconfile = None # Declare the varconfile variable so we can use it in C and no-C. From f6f5fc912d9f450095ea048e283a98d4671f521e Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 3 Oct 2019 11:51:33 +0200 Subject: [PATCH 49/90] Add test method for semi random donor read distribution, add VaSeBuilder uuid to all output files (issue #62) --- VaSeBuilder.py | 135 +++++++++++++++++++++++++----------------- VariantContextFile.py | 57 ++++++++++-------- vase.py | 5 ++ 3 files changed, 119 insertions(+), 78 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 853ce53..02463ce 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1021,6 +1021,17 @@ def write_vase_fastq(self, acceptor_infq, fastq_outpath, exit() def build_donor_fq(self, donorbamreaddata, fr, fastq_outpath): + """Build and writes a fastq file containing only donor reads. + + Parameters + ---------- + donorbamreaddata : list of tuple + Donor reads to write to fastq file + fr : str + Write forward ('1') or reverse ('2') fastq file + fastq_outpath : str + Path and name to write donor fastq file to + """ # Write the new VaSe FastQ file. # vasefq_outname = self.set_fastq_out_path(fastq_outpath, fr, 1) fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb")) @@ -1143,7 +1154,7 @@ def get_searchchrom_prefix(self, bamfile): # ===METHODS TO WRITE OUTPUT FILES========================================= def write_used_donor_files(self, outfileloc, filesamplemap, - used_donor_files): + used_donor_files, vbuuid): """Writes the donor alignment or variant files used in constructing variant contexts to an output file. Parameters @@ -1157,9 +1168,10 @@ def write_used_donor_files(self, outfileloc, filesamplemap, """ try: with open(outfileloc, "w") as outfile: + outfile.write(f"VBUUID: {vbuuid}") outfile.write("#SampleId\tDonorFile\n") for sampleid, samplefile in filesamplemap.items(): - if (samplefile in used_donor_files): + if samplefile in used_donor_files: outfile.write(f"{sampleid}\t{samplefile}\n") except IOError as ioe: self.vaselogger.critical("Could not write used donor files to " @@ -1183,74 +1195,54 @@ def write_optional_output_files(self, outpath, contextfile): # read ids with unmapped mate and left/right positions. self.vaselogger.debug("Writing acceptor contexts to " f"{outpath}acceptorcontexts.txt") - contextfile.write_acceptor_context_file(f"{outpath}acceptorcontexts.txt") + contextfile.write_acceptor_context_file(f"{outpath}acceptorcontexts.txt", self.creation_id) self.vaselogger.debug("Writing acceptor context statistics to " f"{outpath}acceptorcontextstats.txt") - contextfile.write_acceptor_context_stats( - f"{outpath}acceptorcontextstats.txt" - ) + contextfile.write_acceptor_context_stats(f"{outpath}acceptorcontextstats.txt", self.creation_id) self.vaselogger.debug("Writing acceptor context read identifiers with " "unmapped mates to" f"mates to {outpath}acceptor_unmapped.txt") - contextfile.write_acceptor_unmapped_mates( - f"{outpath}acceptor_unmapped.txt" - ) + contextfile.write_acceptor_unmapped_mates(f"{outpath}acceptor_unmapped.txt", self.creation_id) self.vaselogger.debug("Writing left and right most read positions of " "each acceptor context to " f"{outpath}acceptor_positions.txt") - contextfile.write_acceptor_left_right_positions( - f"{outpath}acceptor_positions.txt" - ) + contextfile.write_acceptor_left_right_positions(f"{outpath}acceptor_positions.txt", self.creation_id) - # Write the optional donor context files; donor contexts, - # read ids with unmapped mate and left/right positions. + # Write the optional donor context files; donor contexts, read ids with unmapped mate and left/right positions. self.vaselogger.debug("Writing donor contexts to " f"{outpath}donorcontexts.txt") - contextfile.write_donor_context_file(f"{outpath}donorcontexts.txt") + contextfile.write_donor_context_file(f"{outpath}donorcontexts.txt", self.creation_id) self.vaselogger.debug("Writing donor context statistics to " f"{outpath}donorcontextstats.txt") - contextfile.write_donor_context_stats(f"{outpath}donorcontextstats.txt") + contextfile.write_donor_context_stats(f"{outpath}donorcontextstats.txt", self.creation_id) self.vaselogger.debug("Writing donor context read identifiers " - "with unmapped mates to " - f"{outpath}donor_unmapped.txt") - contextfile.write_donor_unmapped_mates(f"{outpath}donor_unmapped.txt") + f"with unmapped mates to {outpath}donor_unmapped.txt") + contextfile.write_donor_unmapped_mates(f"{outpath}donor_unmapped.txt", self.creation_id) self.vaselogger.debug("Writing left and right most read positions " - "of each donor context to " - f"{outpath}donor_positions.txt") - contextfile.write_donor_left_right_positions( - f"{outpath}donor_positions.txt" - ) + f"of each donor context to {outpath}donor_positions.txt") + contextfile.write_donor_left_right_positions(f"{outpath}donor_positions.txt", self.creation_id) - # Write the optional variant context files; acceptor & donor - # unmapped mates and left/right positions. + # Write the optional variant context files; acceptor & donor unmapped mates and left/right positions. self.vaselogger.debug("Writing variant context acceptor read " - "identifiers with unmapped mates to " - f"{outpath}varcon_unmapped_acceptor.txt") - contextfile.write_reads_with_unmapped_mate( - "acceptor", - f"{outpath}varcon_unmapped_acceptor.txt" - ) + f"identifiers with unmapped mates to {outpath}varcon_unmapped_acceptor.txt") + contextfile.write_reads_with_unmapped_mate("acceptor", f"{outpath}varcon_unmapped_acceptor.txt", + self.creation_id) self.vaselogger.debug("Writing variant context donor read identifiers " - "with unmapped mates to " - f"{outpath}varcon_unmapped_donor.txt") - contextfile.write_reads_with_unmapped_mate( - "donor", - f"{outpath}varcon_unmapped_donor.txt" - ) + f"with unmapped mates to {outpath}varcon_unmapped_donor.txt") + contextfile.write_reads_with_unmapped_mate("donor", f"{outpath}varcon_unmapped_donor.txt", self.creation_id) self.vaselogger.debug("Writing variant context left and right most " - "read positions of acceptor reads to " - f"{outpath}varcon_positions_acceptor.txt") + f"read positions of acceptor reads to {outpath}varcon_positions_acceptor.txt") contextfile.write_left_right_positions( "acceptor", - f"{outpath}varcon_positions_acceptor.txt" + f"{outpath}varcon_positions_acceptor.txt", self.creation_id ) self.vaselogger.debug("Writing variant context left and right most " @@ -1258,10 +1250,10 @@ def write_optional_output_files(self, outpath, contextfile): f"{outpath}varcon_positions_donor.txt") contextfile.write_left_right_positions( "donor", - f"{outpath}varcon_positions_donor.txt" + f"{outpath}varcon_positions_donor.txt", self.creation_id ) - def write_bed_file(self, variantcontextdata, bedoutloc): + def write_bed_file(self, variantcontextdata, bedoutloc, vbuuid): """Writes variant contexts as a BED file Parameters @@ -1273,6 +1265,7 @@ def write_bed_file(self, variantcontextdata, bedoutloc): """ try: with open(bedoutloc, "w") as bedoutfile: + bedoutfile.write(f"VBUUID: {vbuuid}") for varcon in variantcontextdata: bedoutfile.write(f"{varcon.get_variant_context_chrom()}\t{varcon.get_variant_context_start()}\t" f"{varcon.get_variant_context_end()}\t{varcon.get_variant_context_id()}\n") @@ -1415,7 +1408,7 @@ def divide_donorfastqs_over_acceptors(self, list_of_donorfqs, num_of_acceptors): # BUILDS A SET OF R1/R2 VALIDATION FASTQS WITH ALREADY EXISTING DONOR FASTQS def build_fastqs_from_donors(self, acceptor_fqin, donor_fqin, acceptor_reads_toexclude, forward_reverse, outpath): - """Builds a set of R1/R2 validatioin fastq files using already existing donor fastq files. + """Builds a set of R1/R2 validation fastq files using already existing donor fastq files. The provided acceptor fastq files are filtered and used as template for the valdiation fastq files. Already existing fastq files are added. @@ -1562,12 +1555,13 @@ def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out): add_list = variantcontextfile.get_all_variant_context_donor_reads() self.vaselogger.info("Writing FastQ files.") skip_list = set(variantcontextfile.get_all_variant_context_acceptor_read_ids()) + #skip_list = list(set(variantcontextfile.get_all_variant_context_acceptor_read_ids())) for i, fq_i in zip(["1", "2"], [fq1_in, fq2_in]): # Write the fastq files. self.vaselogger.info(f"Start writing the R{i} FastQ files.") fq_starttime = time.time() - self.build_fastq(fq_i, skip_list, add_list, i, fq_out) + self.build_fastq_v2(fq_i, skip_list, add_list, i, fq_out) self.vaselogger.info(f"Wrote all R{i} FastQ files.") self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.") self.vaselogger.info("Finished writing FastQ files.") @@ -1695,6 +1689,11 @@ def bvcs(self, sampleidlist, vcfsamplemap, bamsamplemap, acceptorbamloc, outpath donor_bams_used.append(vcfsamplemap[sampleid]) donor_vcfs_used.append(bamsamplemap[sampleid]) + # Check if there are no variant contexts. + if variantcontexts.get_number_of_contexts() <= 0: + self.vaselogger.info("No variant contexts were created. No output files will be written.") + return None + # Write the output variant context output data self.bvcs_write_output_files(outpath, varcon_outpath, variantcontexts, vcfsamplemap, bamsamplemap, donor_vcfs_used, donor_bams_used) @@ -1993,19 +1992,21 @@ def bvcs_write_output_files(self, outpath, varcon_outpath, variantcontextfile, v # Write the variant context statistics file self.vaselogger.info(f"Writing variant context statistics to {outpath}varconstats.txt") - variantcontextfile.write_variant_context_stats(f"{outpath}varconstats.txt") + variantcontextfile.write_variant_context_stats(f"{outpath}varconstats.txt", self.creation_id) # Write the variant contexts as a BED file self.vaselogger.info(f"Writing the variant contexts to a BED file") - self.write_bed_file(variantcontextfile.get_variant_contexts(), f"{outpath}variantcontexts.bed") + self.write_bed_file(variantcontextfile.get_variant_contexts(), f"{outpath}variantcontexts.bed", + self.creation_id) # Write a listfile of the used variant (VCF/BCF) files self.vaselogger.info(f"Writing the used donor variant files per sample to {outpath}donor_variant_files.txt") - self.write_used_donor_files(f"{outpath}donorvcfs.txt", vcfsamplemap, used_donor_vcfs) + self.write_used_donor_files(f"{outpath}donorvcfs.txt", vcfsamplemap, used_donor_vcfs, self.creation_id) # Write a listfile of the used alignment (BAM/CRAM) files self.vaselogger.info(f"Writing the used donor alignment files per sample to {outpath}donor_alignment_files.txt") - self.write_used_donor_files(f"{outpath}donor_alignment_files.txt", bamsamplemap, used_donor_bams) + self.write_used_donor_files(f"{outpath}donor_alignment_files.txt", bamsamplemap, used_donor_bams, + self.creation_id) def write_pmode_linkfile(self, outpath, context_fqs_links): """Writes the link from variant context identifier to fastq output files for P-mode. @@ -2124,7 +2125,9 @@ def read_is_hard_clipped(self, fetchedread): return False def check_template_size(self, templatefqloc): - """Returns the number of + """Returns the number of reads in the template fastq file. + + The returned number is divided by 4 as each read entry consists of four lines. Parameters ---------- @@ -2137,7 +2140,7 @@ def check_template_size(self, templatefqloc): Number of read entries in the template fastq file """ line_count = 0 - with open(templatefqloc, "r") as templatefq: + with gzip.open(templatefqloc, "r") as templatefq: for fileline in templatefq: line_count += 1 return int(line_count/4) @@ -2166,13 +2169,19 @@ def build_fastq_v2(self, acceptorfq_filepaths, acceptorreads_toskip, donor_conte """ # Split all donor reads to add over the template fastq files donor_read_ids = [x[0] for x in donor_context_reads] + donor_read_ids.sort() + donor_read_ids = list(set(donor_read_ids)) + self.vaselogger.debug(f"Donor read ids: {donor_read_ids}") distributed_read_ids = self.divide_donorfastqs_over_acceptors(donor_read_ids, len(acceptorfq_filepaths)) + self.vaselogger.debug(f"Distributed read ids: {distributed_read_ids}") # Iterate over the R1/R2 fastq in files to use as templates for the for x in range(0, len(acceptorfq_filepaths)): # Collect the donor reads to write. add_donor_ids = distributed_read_ids[x] + self.vaselogger.debug(f"Distributed read ids: {add_donor_ids}") add_donor_reads = [x for x in donor_context_reads if x[0] in add_donor_ids] + self.vaselogger.debug(f"Will add {len(add_donor_ids)} donor reads") # Write the new VaSe FastQ file. vasefq_outname = self.set_fastq_out_path(vasefq_outpath, forward_or_reverse, x + 1) @@ -2202,18 +2211,22 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, """ try: fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb")) - self.vaselogger.debug(f"Opened template FastQ: {acceptor_infq}") + self.vaselogger.debug(f"Writing data to validation fastq {fastq_outpath}") cur_line_index = 0 # Current read position in the template fastq - cur_add_index = 0 # Current read postion in the validation fastq + cur_add_index = 0 # Current read position in the validation fastq # Determine where to semi randomly add the donor reads in the fastq num_of_template_reads = self.check_template_size(acceptor_infq) + self.vaselogger.debug(f"Template has {num_of_template_reads} reads") donor_add_positions = self.shuffle_donor_add_positions(num_of_template_reads, len(donorbamreaddata)) + self.vaselogger.debug(f"Add positions for {fastq_outpath} = {donor_add_positions}") donor_reads_to_addpos = self.link_donor_addpos_reads(donor_add_positions, donorbamreaddata) + self.vaselogger.debug(f"Read to add pos for {fastq_outpath} = {donor_reads_to_addpos}") # Open the template fastq and write filtered data to a new fastq.gz file. fqgz_infile = io.BufferedReader(gzip.open(acceptor_infq, "rb")) + self.vaselogger.debug(f"Opened template FastQ: {acceptor_infq}") for fileline in fqgz_infile: # Check if we are located at a read identifier. @@ -2224,18 +2237,22 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, fqgz_outfile.write(next(fqgz_infile)) fqgz_outfile.write(next(fqgz_infile)) cur_add_index += 1 + else: + self.vaselogger.debug(f"Skipping acceptor read {fileline}") # Check if we need to add a donor read at the current position if cur_line_index in donor_reads_to_addpos: + self.vaselogger.debug(f"{cur_line_index} is in list of positions to add donor") for donorread in donor_reads_to_addpos[cur_line_index]: if donorread[1] == fr: fqlines = ("@" + str(donorread[0]) + "\n" + str(donorread[2]) + "\n" + "+\n" + str(donorread[3])) + self.vaselogger.debug(f"Added donor read {donorread[0]}/{donorread[1]} at " + f"{cur_line_index}") fqgz_outfile.write(fqlines.encode("utf-8")) cur_add_index += 1 - self.vaselogger.debug(f"Donor read added at {cur_add_index}") cur_line_index += 1 fqgz_infile.close() @@ -2264,10 +2281,20 @@ def link_donor_addpos_reads(self, donor_addpos, donor_reads): Returns ------- add_posread_link : dict of list + Donor reads to add per add position """ add_posread_link = {} for addpos, dread in zip(donor_addpos, donor_reads): if addpos not in add_posread_link: add_posread_link[addpos] = [] add_posread_link[addpos].append(dread) + self.vaselogger.debug(f"Added {dread[0]}/{dread[1]} at insert pos {addpos}") + return add_posread_link + + def link_donor_addpos_reads_v2(self, donor_addpos, donor_read_ids, donor_reads): + add_posread_link = {} + for addpos, dread_id in zip(donor_addpos, donor_read_ids): + if addpos not in add_posread_link: + add_posread_link[addpos] = [] + add_posread_link[addpos].append([x for x in donor_reads if x[0] == dread_id]) return add_posread_link diff --git a/VariantContextFile.py b/VariantContextFile.py index d1eda0e..6702186 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -1070,7 +1070,7 @@ def write_variant_context_file(self, outfileloc, vbuuid, samplefilter=None, self.vaselogger.warning("Could not write variant contexts to " f"{ioe.filename}") - def write_acceptor_context_file(self, outfileloc, samplefilter=None, + def write_acceptor_context_file(self, outfileloc, vbuuid, samplefilter=None, contextfilter=None, chromfilter=None): """Writes the acceptor contexts associated with variants contexts to an output file. @@ -1086,9 +1086,10 @@ def write_acceptor_context_file(self, outfileloc, samplefilter=None, Chromosome names to include """ try: - with open(outfileloc, "w") as varcon_oufile: - varcon_oufile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" - "Start\tEnd\tNumOfReads\tReadIds\n") + with open(outfileloc, "w") as varcon_outfile: + varcon_outfile.write(f"VBUUID: {vbuuid}") + varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" + "Start\tEnd\tNumOfReads\tReadIds\n") for varcon in self.variant_contexts.values(): samplepass = self.passes_filter( varcon.get_variant_context_sample(), @@ -1103,14 +1104,12 @@ def write_acceptor_context_file(self, outfileloc, samplefilter=None, chromfilter ) if samplepass and varconpass and chrompass: - varcon_oufile.write( - varcon.get_acceptor_context().to_string() + "\n" - ) + varcon_outfile.write(varcon.get_acceptor_context().to_string() + "\n") except IOError as ioe: self.vaselogger.warning("Could not write acceptor contexts to " f"{ioe.filename}") - def write_donor_context_file(self, outfileloc, samplefilter=None, + def write_donor_context_file(self, outfileloc, vbuuid, samplefilter=None, contextfilter=None, chromfilter=None): """Writes the donor contexts associated with variant context to an output file. @@ -1127,6 +1126,7 @@ def write_donor_context_file(self, outfileloc, samplefilter=None, """ try: with open(outfileloc, "w") as varcon_outfile: + varcon_outfile.write(f"VBUUID: {vbuuid}") varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" "Start\tEnd\tNumOfReads\tReadIds\n") for varcon in self.variant_contexts.values(): @@ -1150,7 +1150,7 @@ def write_donor_context_file(self, outfileloc, samplefilter=None, self.vaselogger.warning("Could not write donor contexts to " f"{ioe.filename}") - def write_variant_context_stats(self, statsoutloc): + def write_variant_context_stats(self, statsoutloc, vbuuid): """Writes basic statistics of the variant contexts to a specified output file. Basic statistics include means and medians of read lengths, Q-Score and mapping quality. @@ -1162,6 +1162,7 @@ def write_variant_context_stats(self, statsoutloc): """ try: with open(statsoutloc, "w") as varcon_statsfile: + varcon_statsfile.write(f"VBUUID: {vbuuid}") varcon_statsfile.write("#ContextId\tAvg_ALen\tAvg_DLen\t" "Med_ALen\tMed_DLen\tAvg_AQual\t" "Avg_DQual\tMed_AQual\tMed_DQual\t" @@ -1173,7 +1174,7 @@ def write_variant_context_stats(self, statsoutloc): self.vaselogger.critical("Could not write variant context " f"statistics to {statsoutloc}") - def write_acceptor_context_stats(self, statsoutloc): + def write_acceptor_context_stats(self, statsoutloc, vbuuid): """Writes basic statistics of the acceptor contexts to a specified output file. Basic statistics include means and medians of read lengths, Q-Score and mapping quality. @@ -1185,6 +1186,7 @@ def write_acceptor_context_stats(self, statsoutloc): """ try: with open(statsoutloc, "w") as varcon_statsfile: + varcon_statsfile.write(f"VBUUID: {vbuuid}") varcon_statsfile.write("#ContextId\tAvg_ReadLen\tMed_ReadLen\t" "Avg_ReadQual\tMed_ReadQual\t" "Avg_ReadMapQ\tMed_ReadMapQ\n") @@ -1198,7 +1200,7 @@ def write_acceptor_context_stats(self, statsoutloc): f"statistics to {statsoutloc}") # Writes some statistics about the acceptor and donor reads identified for each variant context. - def write_donor_context_stats(self, statsoutloc): + def write_donor_context_stats(self, statsoutloc, vbuuid): """Writes basic statistics of the donor contexts to a specified output file. Basic statistics include means and medians of read lengths, Q-Score and mapping quality. @@ -1210,6 +1212,7 @@ def write_donor_context_stats(self, statsoutloc): """ try: with open(statsoutloc, "w") as varcon_statsfile: + varcon_statsfile.write(f"VBUUID: {vbuuid}") varcon_statsfile.write("#ContextId\tAvg_ReadLen\tMed_ReadLen\t" "Avg_ReadQual\tMed_ReadQual\t" "Avg_ReadMapQ\tMed_ReadMapQ\n") @@ -1222,7 +1225,7 @@ def write_donor_context_stats(self, statsoutloc): self.vaselogger.critical("Coud not write donor context statistics " f"to {statsoutloc}") - def write_left_right_positions(self, typetowrite, outfileloc): + def write_left_right_positions(self, typetowrite, outfileloc, vbuuid): """Writes variant context leftmost and rightmost genomic read positions to an output file. The leftmost genomic positions of R1 reads and rightmost genomic positions for R2 reads are written to file. @@ -1236,6 +1239,7 @@ def write_left_right_positions(self, typetowrite, outfileloc): """ try: with open(outfileloc, "w") as lrpof: + lrpof.write(f"VBUUID: {vbuuid}") lrpof.write("#ContextId\tLeftPos\tRightPos\n") for varcon in self.variant_contexts.values(): leftpositions, rightpositions = [], [] @@ -1264,7 +1268,7 @@ def write_left_right_positions(self, typetowrite, outfileloc): self.vaselogger.warning("Could not write read left positions to " f"output file {outfileloc}") - def write_acceptor_left_right_positions(self, outfileloc): + def write_acceptor_left_right_positions(self, outfileloc, vbuuid): """Writes acceptor context leftmost and rightmost genomic read positions to an output file. The leftmost genomic position of R1 reads and rightmost genomic positions for R2 reads are written to file. @@ -1276,6 +1280,7 @@ def write_acceptor_left_right_positions(self, outfileloc): """ try: with open(outfileloc, "w") as lrpof: + lrpof.write(f"VBUUID: {vbuuid}") lrpof.write("#ContextId\tLeftPos\tRightPos\n") for varcon in self.variant_contexts.values(): leftpositions = [ @@ -1293,7 +1298,7 @@ def write_acceptor_left_right_positions(self, outfileloc): self.vaselogger.warning("Could not write read left positions to " f"output file {outfileloc}") - def write_donor_left_right_positions(self, outfileloc): + def write_donor_left_right_positions(self, outfileloc, vbuuid): """Writes donor context leftmost and rightmost genomic read positions to an output file. The leftmost genomic position of R1 reads and rightmost genomic positions for R2 reads are written to file. @@ -1305,6 +1310,7 @@ def write_donor_left_right_positions(self, outfileloc): """ try: with open(outfileloc, "w") as lrpof: + lrpof.write(f"VBUUID: {vbuuid}") lrpof.write("#ContextId\tLeftPos\tRightPos\n") for varcon in self.variant_contexts.values(): leftpositions = [ @@ -1324,7 +1330,7 @@ def write_donor_left_right_positions(self, outfileloc): # Writes the identifiers of reads that have unmapped mates per # sample to a file. Samples are all donors and the ?template?. - def write_reads_with_unmapped_mate(self, typetowrite, umfileloc): + def write_reads_with_unmapped_mate(self, typetowrite, umfileloc, vbuuid): """Writes variant context read identifiers with unmapped mates to a specified output file. Parameters @@ -1335,17 +1341,18 @@ def write_reads_with_unmapped_mate(self, typetowrite, umfileloc): Path to write the output file to """ try: - with open(umfileloc, "w") as umFile: - umFile.write("#ContextId\tSampleId\tReadIds\n") + with open(umfileloc, "w") as umfile: + umfile.write(f"VBUUID: {vbuuid}") + umfile.write("#ContextId\tSampleId\tReadIds\n") for varcon in self.variant_contexts.values(): if typetowrite == "acceptor": - umFile.write( + umfile.write( varcon.get_variant_context_id() + "\t" + str(varcon.get_variant_context_sample()) + "\t" + ";".join(varcon.get_unmapped_acceptor_mate_ids()) ) if typetowrite == "donor": - umFile.write( + umfile.write( varcon.get_variant_context_id() + "\t" + str(varcon.get_variant_context_sample()) + "\t" + ";".join(varcon.get_unmapped_donor_mate_ids()) @@ -1355,7 +1362,7 @@ def write_reads_with_unmapped_mate(self, typetowrite, umfileloc): "reads with unmapped mates to " f"{umfileloc}") - def write_acceptor_unmapped_mates(self, umfileloc): + def write_acceptor_unmapped_mates(self, umfileloc, vbuuid): """Writes the acceptor context read identifiers that have unmapped mates to an output file. Parameters @@ -1365,6 +1372,7 @@ def write_acceptor_unmapped_mates(self, umfileloc): """ try: with open(umfileloc, "w") as umfile: + umfile.write(f"VBUUID: {vbuuid}") umfile.write("#ContextId\tSampleId\tReadIds\n") for varcon in self.variant_contexts.values(): acccon = varcon.get_acceptor_context() @@ -1376,7 +1384,7 @@ def write_acceptor_unmapped_mates(self, umfileloc): "reads with unmapped mates to " f"{umfileloc}") - def write_donor_unmapped_mates(self, umfileloc): + def write_donor_unmapped_mates(self, umfileloc, vbuuid): """Writes the donor context read identifiers that have unmapped mates to an output file. Parameters @@ -1385,11 +1393,12 @@ def write_donor_unmapped_mates(self, umfileloc): Path to write the output file to """ try: - with open(umfileloc, "w") as umFile: - umFile.write("#ContextId\tSampleId\tReadIds\n") + with open(umfileloc, "w") as umfile: + umfile.write(f"VBUUID: {vbuuid}") + umfile.write("#ContextId\tSampleId\tReadIds\n") for varcon in self.variant_contexts.values(): doncon = varcon.get_donor_context() - umFile.write(str(doncon.get_context_id()) + "\t" + umfile.write(str(doncon.get_context_id()) + "\t" + str(doncon.get_sample_id()) + "\t" + ";".join(doncon.get_unmapped_read_mate_ids())) except IOError: diff --git a/vase.py b/vase.py index ee997e7..bc6abb9 100644 --- a/vase.py +++ b/vase.py @@ -315,6 +315,11 @@ def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): varconfile = vaseb.bvcs(sample_id_list, vcf_file_map, bam_file_map, paramcheck.get_acceptor_bam(), paramcheck.get_out_dir_location(), paramcheck.get_reference_file_location(), paramcheck.get_variant_context_out_location(), variantfilter) + + # Check whether contexts were created + if varconfile is None: + return + # Check for modes D,F,P if "D" in runmode: vaseb.run_d_mode(varconfile, paramcheck.get_fastq_out_location()) From 6af8f53d70db7dc9a0d0f92dee435c3db52b5b97 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 3 Oct 2019 13:23:18 +0200 Subject: [PATCH 50/90] Add method to add one variant context file to another --- VariantContextFile.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/VariantContextFile.py b/VariantContextFile.py index 6702186..d144f6c 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -1427,3 +1427,18 @@ def compare(self, othervarconfile, contextfilter=None): diffs = self.variant_contexts[contextid].compare(othervarconfile.get_variant_context(contextid)) varcondiffs[contextid] = diffs return varcondiffs + + def add_variant_context_file(self, variantcontextfile): + """Adds another VariantContextFile to the existing one. + + Variant contexts from the second file overlapping with variant contexts from the current file will not be added. + + Parameters + ---------- + variantcontextfile : VariantContextFile + Variant context file to add to the current + """ + for contextid, varcon in variantcontextfile.get_variant_contexts(True): + if contextid not in self.variant_contexts: + if not self.context_collision(varcon.get_context()): + self.variant_contexts[contextid] = varcon From b2af106745373f1ffdd79ef5ea9509c35b41e8ad Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 3 Oct 2019 13:52:49 +0200 Subject: [PATCH 51/90] Add small script to combine provided variant context files into one large variant context file --- combine_varcons.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 combine_varcons.py diff --git a/combine_varcons.py b/combine_varcons.py new file mode 100644 index 0000000..3d3c293 --- /dev/null +++ b/combine_varcons.py @@ -0,0 +1,44 @@ +import os +import argparse +from VariantContextFile import VariantContextFile + + +# Returns the command line parameters +def get_parameters(): + combine_varcon_params = argparse.ArgumentParser() + combine_varcon_params.add_argument("-l", "--listfile", dest="listfile", + help="Listfile containing locations to variant context files to combine") + combine_varcon_params.add_argument("-i", "--infiles", dest="infiles", + help="Variant context files to combine.") + combine_varcon_params.add_argument("-o", "--out", dest="outfile") + combine_varcon_args = vars(combine_varcon_params.parse_args()) + return combine_varcon_args + + +# Reads the variant context list file +def read_varcon_list_file(varcon_listfile): + varcon_files = [] + try: + with open(varcon_listfile, 'r') as vclf: + for fileline in vclf: + if os.path.isfile(fileline.strip()): + varcon_files.append(fileline.strip()) + except IOError: + print(f"Could not read variant context list file: {varcon_listfile}") + finally: + return varcon_files + + +# Do the actual work +cli_params = get_parameters() +varcons_to_combine = [] +if cli_params['listfile'] is not None: + varcons_to_combine = read_varcon_list_file(cli_params['listfile']) +else: + varcons_to_combine = [infile for infile in cli_params['infiles'] if os.path.isfile(infile)] + +main_varconfile = VariantContextFile(varcons_to_combine[0]) +for varconloc in varcons_to_combine[1:]: + add_varconfile = VariantContextFile(varconloc) + main_varconfile.add_variant_context_file(add_varconfile) +main_varconfile.write_variant_context_file(cli_params['outfile'], "") From 8508cb70cfa7ae1653c41f97f1bb602b96bbf0eb Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 3 Oct 2019 15:56:59 +0200 Subject: [PATCH 52/90] Change combine_varcons to be able to combine variant context files to be combined with VariantContextFile --- VariantContextFile.py | 2 +- combine_varcons.py | 45 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/VariantContextFile.py b/VariantContextFile.py index d144f6c..cf446e9 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -1438,7 +1438,7 @@ def add_variant_context_file(self, variantcontextfile): variantcontextfile : VariantContextFile Variant context file to add to the current """ - for contextid, varcon in variantcontextfile.get_variant_contexts(True): + for contextid, varcon in variantcontextfile.get_variant_contexts(True).items(): if contextid not in self.variant_contexts: if not self.context_collision(varcon.get_context()): self.variant_contexts[contextid] = varcon diff --git a/combine_varcons.py b/combine_varcons.py index 3d3c293..713a1cb 100644 --- a/combine_varcons.py +++ b/combine_varcons.py @@ -8,7 +8,7 @@ def get_parameters(): combine_varcon_params = argparse.ArgumentParser() combine_varcon_params.add_argument("-l", "--listfile", dest="listfile", help="Listfile containing locations to variant context files to combine") - combine_varcon_params.add_argument("-i", "--infiles", dest="infiles", + combine_varcon_params.add_argument("-i", "--infiles", dest="infiles", nargs="+", help="Variant context files to combine.") combine_varcon_params.add_argument("-o", "--out", dest="outfile") combine_varcon_args = vars(combine_varcon_params.parse_args()) @@ -29,6 +29,44 @@ def read_varcon_list_file(varcon_listfile): return varcon_files +def combine_varcons(varconfiles): + varcondata = {} + for varconfileloc in varconfiles: + varcondata = process_varconfile(varconfileloc, varcondata) + return varcondata + + +def process_varconfile(varconfileloc, varcondata): + with open(varconfileloc, 'r') as vrcfile: + for fileline in vrcfile: + if not fileline.startswith("#"): + varconline = fileline.strip() + filelinedata = fileline.strip().split("\t") + + if not detect_collision(filelinedata, varcondata): + varcondata[filelinedata[0]] = varconline + return varcondata + + +def detect_collision(varconentry, varcondatamap): + if varconentry[0] in varcondatamap: + return True + else: + for varcon in varcondatamap.values(): + if varcon[2] == varconentry[2]: + if int(varcon[4]) <= int(varconentry[5]) and int(varconentry[4]) <= int(varcon[5]): + return True + return False + + +def write_new_variantcontextfile(outfileloc, varcondatamap): + with open(outfileloc, 'w') as vrcoutfile: + vrcoutfile.write("#ContextId\tDonorSample\tChrom\tOrigin\tStart\tEnd\tAcceptorContextLength\t" + "DonorContextLength\tAcceptorReads\tDonorReads\tADratio\tAcceptorReadsIds\tDonorReadIds\n") + for varcondataentry in varcondatamap.values(): + vrcoutfile.write(f"{varcondataentry}\n") + + # Do the actual work cli_params = get_parameters() varcons_to_combine = [] @@ -37,8 +75,13 @@ def read_varcon_list_file(varcon_listfile): else: varcons_to_combine = [infile for infile in cli_params['infiles'] if os.path.isfile(infile)] +varcondata = combine_varcons(varcons_to_combine) +write_new_variantcontextfile(cli_params['outfile'], varcondata) + +""" main_varconfile = VariantContextFile(varcons_to_combine[0]) for varconloc in varcons_to_combine[1:]: add_varconfile = VariantContextFile(varconloc) main_varconfile.add_variant_context_file(add_varconfile) main_varconfile.write_variant_context_file(cli_params['outfile'], "") +""" From b8a7830c491636ff9a782f65a53e4ffb098816ae Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 3 Oct 2019 15:59:13 +0200 Subject: [PATCH 53/90] Remove unused code in combine_varcons --- combine_varcons.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/combine_varcons.py b/combine_varcons.py index 713a1cb..368f58c 100644 --- a/combine_varcons.py +++ b/combine_varcons.py @@ -1,6 +1,5 @@ import os import argparse -from VariantContextFile import VariantContextFile # Returns the command line parameters @@ -78,10 +77,3 @@ def write_new_variantcontextfile(outfileloc, varcondatamap): varcondata = combine_varcons(varcons_to_combine) write_new_variantcontextfile(cli_params['outfile'], varcondata) -""" -main_varconfile = VariantContextFile(varcons_to_combine[0]) -for varconloc in varcons_to_combine[1:]: - add_varconfile = VariantContextFile(varconloc) - main_varconfile.add_variant_context_file(add_varconfile) -main_varconfile.write_variant_context_file(cli_params['outfile'], "") -""" From 1133e2fca2407fe2bd45fbe513e2913ae44a7112 Mon Sep 17 00:00:00 2001 From: medinatd Date: Fri, 4 Oct 2019 14:56:07 +0200 Subject: [PATCH 54/90] Added method to fetch primary alignments of clipped secondaries. --- VaSeBuilder.py | 50 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 02463ce..21ac03b 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -702,9 +702,27 @@ def get_variant_reads(self, contextid, variantchrom, duplicate_read_num = 0 secondary_read_num = 0 + read_objects = [] variantreads = [] + clipped_reads = [] rpnext = {} + for vread in bamfile.fetch(variantchrom, variantstart, variantend): + if vread.is_duplicate: + duplicate_read_num += 1 + if vread.is_secondary: + secondary_read_num += 1 + if self.read_is_hard_clipped(vread): + hardclipped_read_num += 1 + clipped_reads.append(vread) + continue + read_objects.append(vread) + + self.vaselogger.debug(f"Fetched {hardclipped_read_num} reads with hardclipped bases") + for clipped in clipped_reads: + read_objects.append(self.fetch_primary_from_secondary(clipped, bamfile)) + + for vread in read_objects: variantreads.append(DonorBamRead(vread.query_name, vread.flag, self.get_read_pair_num(vread), vread.reference_name, vread.reference_start, vread.infer_read_length(), vread.reference_end, vread.cigarstring, vread.next_reference_name, @@ -714,17 +732,35 @@ def get_variant_reads(self, contextid, variantchrom, vread.mapping_quality)) rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] - if self.read_is_hard_clipped(vread): - hardclipped_read_num += 1 - if vread.is_duplicate: - duplicate_read_num += 1 - if vread.is_secondary: - secondary_read_num += 1 - self.vaselogger.debug(f"Fetched {hardclipped_read_num} reads with hardclipped bases") variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) variantreads = self.uniqify_variant_reads(variantreads) return variantreads + def fetch_primary_from_secondary(self, secondary_read, bamfile): + """Fetches and returns the primary alignment of a read based on the position recorded in its SA tag. + + The SA tag is read from a provided Pysam read object. Then, reads at the recorded alignment position are + fetched. The first non-secondary alignment with a matching read ID and pair number is returned. + + Parameters + ---------- + secondary_read : pysam.AlignedSegment + Secondary alignment whose primary alignment will be located + bamfile : pysam.AlignmentFile + Already opened pysam AlignmentFile + + Returns + ------- + primary_read : pysam.AlignedSegment + Primary alignment with the same read ID and pair number as the input secondary alignment. + """ + primary_locus = secondary_read.get_tag("SA").split(",")[0:2] + for fetched in bamfile.fetch(primary_locus[0], int(primary_locus[1]) - 1, int(primary_locus[1])): + if (fetched.query_name == secondary_read.query_name) and (fetched.is_read1 == secondary_read.is_read1) and not fetched.is_secondary: + primary_read = fetched + break + return primary_read + def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umatelist=None): """Fetches and returns read mates for a set of reads. From 8e105284e62b045adf039328cadc7e6a3d49b79d Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 4 Oct 2019 16:40:34 +0200 Subject: [PATCH 55/90] Semi randomly R1 and R2 reads at the same donor read add position --- VaSeBuilder.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 02463ce..f3b3365 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2187,13 +2187,13 @@ def build_fastq_v2(self, acceptorfq_filepaths, acceptorreads_toskip, donor_conte vasefq_outname = self.set_fastq_out_path(vasefq_outpath, forward_or_reverse, x + 1) self.vaselogger.debug(f"Set FastQ output path to: {vasefq_outname}") self.write_vase_fastq_v2(acceptorfq_filepaths[x], vasefq_outname, - acceptorreads_toskip, add_donor_reads, + acceptorreads_toskip, add_donor_reads, add_donor_ids, forward_or_reverse) # Builds a new FastQ file to be used for validation. def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, acceptorreads_toskip, donorbamreaddata, - fr): + donor_readids, fr): """Creates and writes a single VaSeBuilder validation fastq file. Parameters @@ -2221,7 +2221,8 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, self.vaselogger.debug(f"Template has {num_of_template_reads} reads") donor_add_positions = self.shuffle_donor_add_positions(num_of_template_reads, len(donorbamreaddata)) self.vaselogger.debug(f"Add positions for {fastq_outpath} = {donor_add_positions}") - donor_reads_to_addpos = self.link_donor_addpos_reads(donor_add_positions, donorbamreaddata) + donor_reads_to_addpos = self.link_donor_addpos_reads_v2(donor_add_positions, donor_readids, + donorbamreaddata) self.vaselogger.debug(f"Read to add pos for {fastq_outpath} = {donor_reads_to_addpos}") # Open the template fastq and write filtered data to a new fastq.gz file. @@ -2250,7 +2251,7 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, + "+\n" + str(donorread[3])) self.vaselogger.debug(f"Added donor read {donorread[0]}/{donorread[1]} at " - f"{cur_line_index}") + f"{cur_add_index}") fqgz_outfile.write(fqlines.encode("utf-8")) cur_add_index += 1 cur_line_index += 1 @@ -2296,5 +2297,5 @@ def link_donor_addpos_reads_v2(self, donor_addpos, donor_read_ids, donor_reads): for addpos, dread_id in zip(donor_addpos, donor_read_ids): if addpos not in add_posread_link: add_posread_link[addpos] = [] - add_posread_link[addpos].append([x for x in donor_reads if x[0] == dread_id]) + add_posread_link[addpos].extend([x for x in donor_reads if x[0] == dread_id]) return add_posread_link From 02dbf72fa09bd9c7ed65e56ecf896d449cf9f115 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 7 Oct 2019 11:33:26 +0200 Subject: [PATCH 56/90] Add newline to entries of unmapped read mate files --- VariantContextFile.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/VariantContextFile.py b/VariantContextFile.py index cf446e9..d353c1f 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -1349,13 +1349,13 @@ def write_reads_with_unmapped_mate(self, typetowrite, umfileloc, vbuuid): umfile.write( varcon.get_variant_context_id() + "\t" + str(varcon.get_variant_context_sample()) + "\t" - + ";".join(varcon.get_unmapped_acceptor_mate_ids()) + + ";".join(varcon.get_unmapped_acceptor_mate_ids() + "\n") ) if typetowrite == "donor": umfile.write( varcon.get_variant_context_id() + "\t" + str(varcon.get_variant_context_sample()) + "\t" - + ";".join(varcon.get_unmapped_donor_mate_ids()) + + ";".join(varcon.get_unmapped_donor_mate_ids() + "\n") ) except IOError: self.vaselogger.warning("Could not write read identifiers of " @@ -1378,7 +1378,7 @@ def write_acceptor_unmapped_mates(self, umfileloc, vbuuid): acccon = varcon.get_acceptor_context() umfile.write(str(acccon.get_context_id()) + "\t" + str(acccon.get_sample_id()) + "\t" - + ";".join(acccon.get_unmapped_read_mate_ids())) + + ";".join(acccon.get_unmapped_read_mate_ids()) + "\n") except IOError: self.vaselogger.warning("Could not write read identifiers of " "reads with unmapped mates to " @@ -1400,7 +1400,7 @@ def write_donor_unmapped_mates(self, umfileloc, vbuuid): doncon = varcon.get_donor_context() umfile.write(str(doncon.get_context_id()) + "\t" + str(doncon.get_sample_id()) + "\t" - + ";".join(doncon.get_unmapped_read_mate_ids())) + + ";".join(doncon.get_unmapped_read_mate_ids())+"\n") except IOError: self.vaselogger.warning("Could not write read identifiers of " "reads with unmapped mates to " From 0c5c886dde2295a7ee4ff6629b4502cf9d21bb52 Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 11:33:35 +0200 Subject: [PATCH 57/90] Debug message to count FQs to be written. --- VaSeBuilder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 21ac03b..cb525d2 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1073,6 +1073,7 @@ def build_donor_fq(self, donorbamreaddata, fr, fastq_outpath): fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb")) donorbamreaddata.sort(key=lambda dbr: dbr[0], reverse=False) + print(len(donorbamreaddata)) for bamread in donorbamreaddata: # Check if the BAM read is R1 or R2. if bamread[1] == fr: From 16fee5e1063fb8eb32f95e48e35926f3bb239c3f Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 7 Oct 2019 12:14:03 +0200 Subject: [PATCH 58/90] Add newline when adding donor read qualities line in write_vase_fastq_v2() --- VaSeBuilder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index f3b3365..22d7f5c 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2249,7 +2249,7 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, fqlines = ("@" + str(donorread[0]) + "\n" + str(donorread[2]) + "\n" + "+\n" - + str(donorread[3])) + + str(donorread[3]) + "\n") self.vaselogger.debug(f"Added donor read {donorread[0]}/{donorread[1]} at " f"{cur_add_index}") fqgz_outfile.write(fqlines.encode("utf-8")) From 6392090e22aad421fa76811bc962a70d6663d9cf Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 12:23:20 +0200 Subject: [PATCH 59/90] Debug msg. --- VaSeBuilder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index cb525d2..f19b6a9 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -818,6 +818,7 @@ def uniqify_variant_reads(self, variantreads): if id_pair not in checklist: unique_variantreads.append(fetched) checklist.append(id_pair) + print(len(unique_variantreads)) return unique_variantreads def fetch_mate_read(self, rnext, pnext, readid, bamfile): From dc59af8d29dafd15b1e053f2067db5b542c86551 Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 12:25:18 +0200 Subject: [PATCH 60/90] Debug msg. --- VaSeBuilder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index f19b6a9..28a81e9 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -811,6 +811,7 @@ def uniqify_variant_reads(self, variantreads): unique_variantreads : list of DonorBamRead Set of reads with each read occuring only once """ + print(len(variantreads)) unique_variantreads = [] checklist = [] for fetched in variantreads: From c46b91fe6f26b949016973d9b646d1df72fa3d77 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 7 Oct 2019 12:28:08 +0200 Subject: [PATCH 61/90] Add # and newline prior and after VBUUID --- VariantContextFile.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/VariantContextFile.py b/VariantContextFile.py index d353c1f..44a0c22 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -1087,7 +1087,7 @@ def write_acceptor_context_file(self, outfileloc, vbuuid, samplefilter=None, """ try: with open(outfileloc, "w") as varcon_outfile: - varcon_outfile.write(f"VBUUID: {vbuuid}") + varcon_outfile.write(f"#VBUUID: {vbuuid}\n") varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" "Start\tEnd\tNumOfReads\tReadIds\n") for varcon in self.variant_contexts.values(): @@ -1126,7 +1126,7 @@ def write_donor_context_file(self, outfileloc, vbuuid, samplefilter=None, """ try: with open(outfileloc, "w") as varcon_outfile: - varcon_outfile.write(f"VBUUID: {vbuuid}") + varcon_outfile.write(f"#VBUUID: {vbuuid}\n") varcon_outfile.write("#ContextId\tDonorSample\tChrom\tOrigin\t" "Start\tEnd\tNumOfReads\tReadIds\n") for varcon in self.variant_contexts.values(): @@ -1162,7 +1162,7 @@ def write_variant_context_stats(self, statsoutloc, vbuuid): """ try: with open(statsoutloc, "w") as varcon_statsfile: - varcon_statsfile.write(f"VBUUID: {vbuuid}") + varcon_statsfile.write(f"#VBUUID: {vbuuid}\n") varcon_statsfile.write("#ContextId\tAvg_ALen\tAvg_DLen\t" "Med_ALen\tMed_DLen\tAvg_AQual\t" "Avg_DQual\tMed_AQual\tMed_DQual\t" @@ -1186,7 +1186,7 @@ def write_acceptor_context_stats(self, statsoutloc, vbuuid): """ try: with open(statsoutloc, "w") as varcon_statsfile: - varcon_statsfile.write(f"VBUUID: {vbuuid}") + varcon_statsfile.write(f"#VBUUID: {vbuuid}\n") varcon_statsfile.write("#ContextId\tAvg_ReadLen\tMed_ReadLen\t" "Avg_ReadQual\tMed_ReadQual\t" "Avg_ReadMapQ\tMed_ReadMapQ\n") @@ -1212,7 +1212,7 @@ def write_donor_context_stats(self, statsoutloc, vbuuid): """ try: with open(statsoutloc, "w") as varcon_statsfile: - varcon_statsfile.write(f"VBUUID: {vbuuid}") + varcon_statsfile.write(f"#VBUUID: {vbuuid}\n") varcon_statsfile.write("#ContextId\tAvg_ReadLen\tMed_ReadLen\t" "Avg_ReadQual\tMed_ReadQual\t" "Avg_ReadMapQ\tMed_ReadMapQ\n") @@ -1239,7 +1239,7 @@ def write_left_right_positions(self, typetowrite, outfileloc, vbuuid): """ try: with open(outfileloc, "w") as lrpof: - lrpof.write(f"VBUUID: {vbuuid}") + lrpof.write(f"#VBUUID: {vbuuid}") lrpof.write("#ContextId\tLeftPos\tRightPos\n") for varcon in self.variant_contexts.values(): leftpositions, rightpositions = [], [] @@ -1280,7 +1280,7 @@ def write_acceptor_left_right_positions(self, outfileloc, vbuuid): """ try: with open(outfileloc, "w") as lrpof: - lrpof.write(f"VBUUID: {vbuuid}") + lrpof.write(f"#VBUUID: {vbuuid}\n") lrpof.write("#ContextId\tLeftPos\tRightPos\n") for varcon in self.variant_contexts.values(): leftpositions = [ @@ -1310,7 +1310,7 @@ def write_donor_left_right_positions(self, outfileloc, vbuuid): """ try: with open(outfileloc, "w") as lrpof: - lrpof.write(f"VBUUID: {vbuuid}") + lrpof.write(f"#VBUUID: {vbuuid}\n") lrpof.write("#ContextId\tLeftPos\tRightPos\n") for varcon in self.variant_contexts.values(): leftpositions = [ @@ -1342,7 +1342,7 @@ def write_reads_with_unmapped_mate(self, typetowrite, umfileloc, vbuuid): """ try: with open(umfileloc, "w") as umfile: - umfile.write(f"VBUUID: {vbuuid}") + umfile.write(f"#VBUUID: {vbuuid}\n") umfile.write("#ContextId\tSampleId\tReadIds\n") for varcon in self.variant_contexts.values(): if typetowrite == "acceptor": @@ -1372,7 +1372,7 @@ def write_acceptor_unmapped_mates(self, umfileloc, vbuuid): """ try: with open(umfileloc, "w") as umfile: - umfile.write(f"VBUUID: {vbuuid}") + umfile.write(f"#VBUUID: {vbuuid}\n") umfile.write("#ContextId\tSampleId\tReadIds\n") for varcon in self.variant_contexts.values(): acccon = varcon.get_acceptor_context() @@ -1394,7 +1394,7 @@ def write_donor_unmapped_mates(self, umfileloc, vbuuid): """ try: with open(umfileloc, "w") as umfile: - umfile.write(f"VBUUID: {vbuuid}") + umfile.write(f"#VBUUID: {vbuuid}\n") umfile.write("#ContextId\tSampleId\tReadIds\n") for varcon in self.variant_contexts.values(): doncon = varcon.get_donor_context() From 85c1779844cd62a17e3b037cb499b449ee04b8b5 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 7 Oct 2019 12:57:17 +0200 Subject: [PATCH 62/90] Fix for some newlines to debug output files --- VariantContextFile.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/VariantContextFile.py b/VariantContextFile.py index 44a0c22..ea90b38 100644 --- a/VariantContextFile.py +++ b/VariantContextFile.py @@ -1349,14 +1349,14 @@ def write_reads_with_unmapped_mate(self, typetowrite, umfileloc, vbuuid): umfile.write( varcon.get_variant_context_id() + "\t" + str(varcon.get_variant_context_sample()) + "\t" - + ";".join(varcon.get_unmapped_acceptor_mate_ids() + "\n") - ) + + ";".join(varcon.get_unmapped_acceptor_mate_ids()) + + "\n") if typetowrite == "donor": umfile.write( varcon.get_variant_context_id() + "\t" + str(varcon.get_variant_context_sample()) + "\t" - + ";".join(varcon.get_unmapped_donor_mate_ids() + "\n") - ) + + ";".join(varcon.get_unmapped_donor_mate_ids()) + + "\n") except IOError: self.vaselogger.warning("Could not write read identifiers of " "reads with unmapped mates to " From 3ed95502bd99fa53d2cee21803571a67e260f934 Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 15:35:49 +0200 Subject: [PATCH 63/90] Altered method for fetching mates to check which mates are missing first. --- VaSeBuilder.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 28a81e9..bdaeaba 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -705,7 +705,9 @@ def get_variant_reads(self, contextid, variantchrom, read_objects = [] variantreads = [] clipped_reads = [] - rpnext = {} + list_r1 = [] + list_r2 = [] + #rpnext = {} for vread in bamfile.fetch(variantchrom, variantstart, variantend): if vread.is_duplicate: @@ -723,6 +725,25 @@ def get_variant_reads(self, contextid, variantchrom, read_objects.append(self.fetch_primary_from_secondary(clipped, bamfile)) for vread in read_objects: + if vread.is_read1(): + list_r1.append(vread) + elif vread.is_read2(): + list_r2.append(vread) + + list_r1_ids = [x.query_name for x in list_r1] + list_r2_ids = [x.query_name for x in list_r2] + + for r1 in list_r1: + if r1.query_name not in list_r2_ids: + list_r2.append(self.fetch_mate_read(r1.next_reference_name, r1.next_reference_start, + r1.query_name, bamfile)) + for r2 in list_r2: + if r2.query_name not in list_r1_ids: + list_r1.append(self.fetch_mate_read(r2.next_reference_name, r2.next_reference_start, + r2.query_name, bamfile)) + print(len(list_r1), len(list_r2)) + + for vread in list_r1.extend(list_r2): variantreads.append(DonorBamRead(vread.query_name, vread.flag, self.get_read_pair_num(vread), vread.reference_name, vread.reference_start, vread.infer_read_length(), vread.reference_end, vread.cigarstring, vread.next_reference_name, @@ -730,10 +751,10 @@ def get_variant_reads(self, contextid, variantchrom, vread.template_length, vread.get_forward_sequence(), "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), vread.mapping_quality)) - rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] + # rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] - variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) - variantreads = self.uniqify_variant_reads(variantreads) + #variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) + #variantreads = self.uniqify_variant_reads(variantreads) return variantreads def fetch_primary_from_secondary(self, secondary_read, bamfile): @@ -756,7 +777,9 @@ def fetch_primary_from_secondary(self, secondary_read, bamfile): """ primary_locus = secondary_read.get_tag("SA").split(",")[0:2] for fetched in bamfile.fetch(primary_locus[0], int(primary_locus[1]) - 1, int(primary_locus[1])): - if (fetched.query_name == secondary_read.query_name) and (fetched.is_read1 == secondary_read.is_read1) and not fetched.is_secondary: + if ((fetched.query_name == secondary_read.query_name) + and (fetched.is_read1 == secondary_read.is_read1) + and not fetched.is_secondary): primary_read = fetched break return primary_read From 5b48bf5684e472cec36f1ea5a83dd293b80b9e4c Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 15:40:17 +0200 Subject: [PATCH 64/90] Fixed pysam call. --- VaSeBuilder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index bdaeaba..1f7d819 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -725,9 +725,9 @@ def get_variant_reads(self, contextid, variantchrom, read_objects.append(self.fetch_primary_from_secondary(clipped, bamfile)) for vread in read_objects: - if vread.is_read1(): + if vread.is_read1: list_r1.append(vread) - elif vread.is_read2(): + elif vread.is_read2: list_r2.append(vread) list_r1_ids = [x.query_name for x in list_r1] From 3072e18b71e56d51a56898740ea5b828bf868972 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Mon, 7 Oct 2019 15:45:01 +0200 Subject: [PATCH 65/90] Add two temporary methods for AC-mode semi random read distribution --- VaSeBuilder.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 22d7f5c..dae5f4b 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2299,3 +2299,72 @@ def link_donor_addpos_reads_v2(self, donor_addpos, donor_read_ids, donor_reads): add_posread_link[addpos] = [] add_posread_link[addpos].extend([x for x in donor_reads if x[0] == dread_id]) return add_posread_link + + def read_donor_fastq(self, donor_fastq, forward_reverse, donor_read_data): + """Reads and saves the donor fastq reads. + + Parameters + ---------- + donor_fastq : str + Path to the donor fastq file to read + forward_reverse : str + Reading forward (R1) or reverse (R2) donor read data + donor_read_data : dict + Dictionary saving the donor read data + + Returns + ------- + donor_read_data : dict + Updated dictionary containing donor read data + """ + try: + with gzip.open(donor_fastq, 'rt') as dfqfile: + for dfqfileline in dfqfile: + if dfqfileline.startswith("@"): + dfq_readid = dfqfileline.strip() + dfq_readseq = next(dfqfile).strip() + dfq_optline = next(dfqfile).strip() + dfq_readqual = next(dfqfile).strip() + if dfqfileline not in donor_read_data: + donor_read_data[dfq_readid] = [] + donor_read_data[dfq_readid].append((dfq_readid, forward_reverse, dfq_readseq, dfq_readqual)) + except IOError: + self.vaselogger.debug(f"Could not read donor fastq file {donor_fastq}") + finally: + return donor_read_data + + # Temporary new AC-mode method to test semi random read distribution + def run_ac_mode_v2(self, afq1_in, afq2_in, dfqs, varconfile, outpath): + self.vaselogger.info("Running VaSeBuilder A-mode") + # Split the donor fastqs into an R1 and R2 group + r1_dfqs = [dfq[0] for dfq in dfqs] + r2_dfqs = [dfq[1] for dfq in dfqs] + + # Get the list of acceptor reads to exclude + skip_list = varconfile.get_all_variant_context_acceptor_read_ids() + skip_list.sort() + + # Read all the donor read fastq data + donor_reads = {} + for fr_i, dfq_i in zip(["1", "2"], [r1_dfqs, r2_dfqs]): + for x in range(len(dfq_i)): + donor_reads = self.read_donor_fastq(dfq_i[x], fr_i, donor_reads) + + # Divide the donor reads equally over the template fastq files + donor_read_ids = list(donor_reads.keys()) + donor_read_ids.sort() + + # Distribute the read identifiers over the fastq files + distributed_donor_read_ids = self.divide_donorfastqs_over_acceptors(donor_read_ids, len(afq1_in)) + + # Iterate over the acceptor fastq files + for fr_i, afq_i in zip(["1", "2"], [afq1_in, afq2_in]): + self.build_fastqs_from_donors_v2(afq_i, skip_list, distributed_donor_read_ids, donor_reads, fr_i, outpath) + + # Temporary new build_fastqs_from_donors() to test semi random read distribution in AC-mode + def build_fastqs_from_donors_v2(self, acceptor_fqsin, acceptor_reads_to_exclude, distributed_donor_reads, + donor_reads, forward_reverse, outpath): + for x in range(len(acceptor_fqsin)): + donor_reads_to_add = [donor_reads[y] for y in distributed_donor_reads[x]] + self.write_vase_fastq_v2(acceptor_fqsin[x], outpath, acceptor_reads_to_exclude, donor_reads_to_add, + distributed_donor_reads[x], forward_reverse) From 503f81ecb4a5324763c187502c131ee645d9472a Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 15:47:01 +0200 Subject: [PATCH 66/90] Changed mate fetch to return pysam object instead of DBR object. --- VaSeBuilder.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 1f7d819..a158262 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -869,13 +869,7 @@ def fetch_mate_read(self, rnext, pnext, readid, bamfile): """ for bamread in bamfile.fetch(rnext, pnext, pnext + 1): if bamread.query_name == readid and bamread.reference_start == pnext: - return DonorBamRead(bamread.query_name, bamread.flag, self.get_read_pair_num(bamread), - bamread.reference_name, bamread.reference_start, bamread.infer_read_length(), - bamread.reference_end, bamread.cigarstring, bamread.next_reference_name, - bamread.next_reference_start, - bamread.template_length, bamread.get_forward_sequence(), - "".join([chr((x + 33)) for x in bamread.get_forward_qualities()]), - bamread.mapping_quality) + return bamread return None # Filters the donor reads to keep only reads that occur twice. From ebfd9646d0967142609c052259539e43d8b4f990 Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 15:54:39 +0200 Subject: [PATCH 67/90] Changed extend to addition for lists for iteration. --- VaSeBuilder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index a158262..ccedc45 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -743,7 +743,7 @@ def get_variant_reads(self, contextid, variantchrom, r2.query_name, bamfile)) print(len(list_r1), len(list_r2)) - for vread in list_r1.extend(list_r2): + for vread in list_r1 + list_r2: variantreads.append(DonorBamRead(vread.query_name, vread.flag, self.get_read_pair_num(vread), vread.reference_name, vread.reference_start, vread.infer_read_length(), vread.reference_end, vread.cigarstring, vread.next_reference_name, From aa74641208087f2349843b09b33376e424e2944f Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 16:10:50 +0200 Subject: [PATCH 68/90] Added pair number check to mate fetching. --- VaSeBuilder.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index ccedc45..b29f1fa 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -707,7 +707,7 @@ def get_variant_reads(self, contextid, variantchrom, clipped_reads = [] list_r1 = [] list_r2 = [] - #rpnext = {} + # rpnext = {} for vread in bamfile.fetch(variantchrom, variantstart, variantend): if vread.is_duplicate: @@ -735,12 +735,12 @@ def get_variant_reads(self, contextid, variantchrom, for r1 in list_r1: if r1.query_name not in list_r2_ids: - list_r2.append(self.fetch_mate_read(r1.next_reference_name, r1.next_reference_start, - r1.query_name, bamfile)) + list_r2.append(self.fetch_mate_read(r1.query_name, r1.next_reference_name, r1.next_reference_start, + self.get_read_pair_num(r1), bamfile)) for r2 in list_r2: if r2.query_name not in list_r1_ids: - list_r1.append(self.fetch_mate_read(r2.next_reference_name, r2.next_reference_start, - r2.query_name, bamfile)) + list_r1.append(self.fetch_mate_read(r2.query_name, r2.next_reference_name, r2.next_reference_start, + self.get_read_pair_num(r2), bamfile)) print(len(list_r1), len(list_r2)) for vread in list_r1 + list_r2: @@ -751,10 +751,10 @@ def get_variant_reads(self, contextid, variantchrom, vread.template_length, vread.get_forward_sequence(), "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), vread.mapping_quality)) - # rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] + # rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] - #variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) - #variantreads = self.uniqify_variant_reads(variantreads) + # variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) + # variantreads = self.uniqify_variant_reads(variantreads) return variantreads def fetch_primary_from_secondary(self, secondary_read, bamfile): @@ -845,7 +845,7 @@ def uniqify_variant_reads(self, variantreads): print(len(unique_variantreads)) return unique_variantreads - def fetch_mate_read(self, rnext, pnext, readid, bamfile): + def fetch_mate_read(self, readid, rnext, pnext, pair_num, bamfile): """Fetches and returns the mate read of a specified read. The mate read is fetched from an opened alignment file using the RNEXT and PNEXT value of the read. Of the @@ -868,7 +868,9 @@ def fetch_mate_read(self, rnext, pnext, readid, bamfile): The mate read if found, None if not """ for bamread in bamfile.fetch(rnext, pnext, pnext + 1): - if bamread.query_name == readid and bamread.reference_start == pnext: + if (bamread.query_name == readid + and bamread.reference_start == pnext + and self.get_read_pair_num(bamread) != pair_num): return bamread return None @@ -1611,7 +1613,7 @@ def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out): add_list = variantcontextfile.get_all_variant_context_donor_reads() self.vaselogger.info("Writing FastQ files.") skip_list = set(variantcontextfile.get_all_variant_context_acceptor_read_ids()) - #skip_list = list(set(variantcontextfile.get_all_variant_context_acceptor_read_ids())) + # skip_list = list(set(variantcontextfile.get_all_variant_context_acceptor_read_ids())) for i, fq_i in zip(["1", "2"], [fq1_in, fq2_in]): # Write the fastq files. From 23466e7fa36d5b76328af893d9a94984654d4297 Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 16:20:39 +0200 Subject: [PATCH 69/90] Reinstated uniquifying. --- VaSeBuilder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index b29f1fa..9b8ede2 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -754,7 +754,7 @@ def get_variant_reads(self, contextid, variantchrom, # rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] # variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) - # variantreads = self.uniqify_variant_reads(variantreads) + variantreads = self.uniqify_variant_reads(variantreads) return variantreads def fetch_primary_from_secondary(self, secondary_read, bamfile): From 5d956dbc66a983549c7229213bfc097fbaee03dc Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 16:54:45 +0200 Subject: [PATCH 70/90] Remove debugging, cleaned up debug log messages. --- VaSeBuilder.py | 87 ++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 48 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 9b8ede2..abe2aa5 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -707,7 +707,6 @@ def get_variant_reads(self, contextid, variantchrom, clipped_reads = [] list_r1 = [] list_r2 = [] - # rpnext = {} for vread in bamfile.fetch(variantchrom, variantstart, variantend): if vread.is_duplicate: @@ -741,7 +740,6 @@ def get_variant_reads(self, contextid, variantchrom, if r2.query_name not in list_r1_ids: list_r1.append(self.fetch_mate_read(r2.query_name, r2.next_reference_name, r2.next_reference_start, self.get_read_pair_num(r2), bamfile)) - print(len(list_r1), len(list_r2)) for vread in list_r1 + list_r2: variantreads.append(DonorBamRead(vread.query_name, vread.flag, self.get_read_pair_num(vread), @@ -751,9 +749,7 @@ def get_variant_reads(self, contextid, variantchrom, vread.template_length, vread.get_forward_sequence(), "".join([chr((x + 33)) for x in vread.get_forward_qualities()]), vread.mapping_quality)) - # rpnext[vread.query_name] = [vread.next_reference_name, vread.next_reference_start, vread.query_name] - # variantreads = self.fetch_mates(rpnext, bamfile, variantreads, write_unm, umatelist) variantreads = self.uniqify_variant_reads(variantreads) return variantreads @@ -784,42 +780,44 @@ def fetch_primary_from_secondary(self, secondary_read, bamfile): break return primary_read - def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umatelist=None): - """Fetches and returns read mates for a set of reads. - - Read mates are fetched from an already opened pysam alignment file using the RNEXT and PNEXT values as well as - the read identifier. - - Parameters - ---------- - rpnextmap : dict - bamfile : pysam.AlignmentFile - Already opened pysam alingment file to fetch read mates from - variantreadlist : list of DonorBamRead - Reads to fetch read mates for - write_unm : bool - Write identifiers of reads with unmapped mates - umatelist : list of str - Identifiers with reads that have an unmapped mate - - Returns - ------- - variantreadlist : list of DonorBamRead - Updated list of reads and the added read mates - """ - hardclipped_read_num = 0 - for read1 in rpnextmap.values(): - materead = self.fetch_mate_read(*read1, bamfile) - if materead is not None: - if materead.read_has_hardclipped_bases(): - hardclipped_read_num += 1 - variantreadlist.append(materead) - else: - if write_unm: - self.vaselogger.debug(f"Could not find mate for read {read1[2]} ; mate is likely unmapped.") - umatelist.append(read1[2]) - self.vaselogger.debug(f"Fetched {hardclipped_read_num} read mates with hardclipped bases") - return variantreadlist +# ============================================================================= +# def fetch_mates(self, rpnextmap, bamfile, variantreadlist, write_unm=False, umatelist=None): +# """Fetches and returns read mates for a set of reads. +# +# Read mates are fetched from an already opened pysam alignment file using the RNEXT and PNEXT values as well as +# the read identifier. +# +# Parameters +# ---------- +# rpnextmap : dict +# bamfile : pysam.AlignmentFile +# Already opened pysam alingment file to fetch read mates from +# variantreadlist : list of DonorBamRead +# Reads to fetch read mates for +# write_unm : bool +# Write identifiers of reads with unmapped mates +# umatelist : list of str +# Identifiers with reads that have an unmapped mate +# +# Returns +# ------- +# variantreadlist : list of DonorBamRead +# Updated list of reads and the added read mates +# """ +# hardclipped_read_num = 0 +# for read1 in rpnextmap.values(): +# materead = self.fetch_mate_read(*read1, bamfile) +# if materead is not None: +# if materead.read_has_hardclipped_bases(): +# hardclipped_read_num += 1 +# variantreadlist.append(materead) +# else: +# if write_unm: +# self.vaselogger.debug(f"Could not find mate for read {read1[2]} ; mate is likely unmapped.") +# umatelist.append(read1[2]) +# self.vaselogger.debug(f"Fetched {hardclipped_read_num} read mates with hardclipped bases") +# return variantreadlist +# ============================================================================= def uniqify_variant_reads(self, variantreads): """Ensures each read only occurs once and returns the updates set. @@ -834,7 +832,6 @@ def uniqify_variant_reads(self, variantreads): unique_variantreads : list of DonorBamRead Set of reads with each read occuring only once """ - print(len(variantreads)) unique_variantreads = [] checklist = [] for fetched in variantreads: @@ -842,7 +839,6 @@ def uniqify_variant_reads(self, variantreads): if id_pair not in checklist: unique_variantreads.append(fetched) checklist.append(id_pair) - print(len(unique_variantreads)) return unique_variantreads def fetch_mate_read(self, readid, rnext, pnext, pair_num, bamfile): @@ -951,10 +947,6 @@ def determine_context(self, contextreads, contextorigin, contextchr): # Set variant context as chr, min start, max end. contextstart = min(filtered_starts) contextend = max(filtered_stops) - self.vaselogger.debug("Context is " - + str(contextchr) + ", " - + str(contextstart) + ", " - + str(contextend)) return [contextchr, contextorigin, contextstart, contextend] def filter_outliers(self, pos_list, k=3): @@ -1094,7 +1086,6 @@ def build_donor_fq(self, donorbamreaddata, fr, fastq_outpath): fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb")) donorbamreaddata.sort(key=lambda dbr: dbr[0], reverse=False) - print(len(donorbamreaddata)) for bamread in donorbamreaddata: # Check if the BAM read is R1 or R2. if bamread[1] == fr: @@ -1929,7 +1920,7 @@ def bvcs_establish_variant_context(self, sampleid, variantcontextfile, variantid t0 = time.time() vcontext_areads = self.get_variant_reads(variantid, vcontext_window[0], vcontext_window[2], vcontext_window[3], abamfile, write_unm, unmapped_alist) - self.debug_msg("cdr", variantid, t0) + self.debug_msg("car", variantid, t0) variant_context = VariantContext(variantid, sampleid, *vcontext_window, vcontext_areads, vcontext_dreads, acontext, dcontext) From 630249afd2f717732e0df6a20d118134e3e55b50 Mon Sep 17 00:00:00 2001 From: medinatd Date: Mon, 7 Oct 2019 16:58:01 +0200 Subject: [PATCH 71/90] Changed ':' to '-' in debug msg txt. --- VaSeBuilder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index abe2aa5..92d4dad 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1860,7 +1860,7 @@ def bvcs_process_variant(self, sampleid, variantcontextfile, samplevariant, abam variantpos, acontext, dcontext, abamfile, dbamfile, write_unm) self.debug_msg("cc", variantid, t0) self.vaselogger.debug(f"Combined context determined to be {vcontext.get_variant_context_chrom()}:" - f"{vcontext.get_variant_context_start()}:{vcontext.get_variant_context_end()}") + f"{vcontext.get_variant_context_start()}-{vcontext.get_variant_context_end()}") return vcontext # Establishes a variant context from an acceptor and donor context and fetches acceptor and donor reads again. From 9120c5ac90031f6ae38093fea8e636ac093b9c9e Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 8 Oct 2019 11:00:26 +0200 Subject: [PATCH 72/90] Add some debug logging messages for semi-random read distr in AC-mode. Fix read donor fastq method --- VaSeBuilder.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index dae5f4b..c4805fa 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2325,7 +2325,7 @@ def read_donor_fastq(self, donor_fastq, forward_reverse, donor_read_data): dfq_readseq = next(dfqfile).strip() dfq_optline = next(dfqfile).strip() dfq_readqual = next(dfqfile).strip() - if dfqfileline not in donor_read_data: + if dfq_readid not in donor_read_data: donor_read_data[dfq_readid] = [] donor_read_data[dfq_readid].append((dfq_readid, forward_reverse, dfq_readseq, dfq_readqual)) except IOError: @@ -2335,7 +2335,7 @@ def read_donor_fastq(self, donor_fastq, forward_reverse, donor_read_data): # Temporary new AC-mode method to test semi random read distribution def run_ac_mode_v2(self, afq1_in, afq2_in, dfqs, varconfile, outpath): - self.vaselogger.info("Running VaSeBuilder A-mode") + self.vaselogger.info("Running VaSeBuilder AC-mode") # Split the donor fastqs into an R1 and R2 group r1_dfqs = [dfq[0] for dfq in dfqs] r2_dfqs = [dfq[1] for dfq in dfqs] @@ -2349,13 +2349,16 @@ def run_ac_mode_v2(self, afq1_in, afq2_in, dfqs, varconfile, outpath): for fr_i, dfq_i in zip(["1", "2"], [r1_dfqs, r2_dfqs]): for x in range(len(dfq_i)): donor_reads = self.read_donor_fastq(dfq_i[x], fr_i, donor_reads) + self.vaselogger.debug(f"ACmodeV2 Donor reads: {donor_reads}") # Divide the donor reads equally over the template fastq files donor_read_ids = list(donor_reads.keys()) donor_read_ids.sort() + self.vaselogger.debug(f"ACmodeV2 dread ids: {donor_read_ids}") # Distribute the read identifiers over the fastq files distributed_donor_read_ids = self.divide_donorfastqs_over_acceptors(donor_read_ids, len(afq1_in)) + self.vaselogger.debug(f"ACmodeV2 distr dread ids: {distributed_donor_read_ids}") # Iterate over the acceptor fastq files for fr_i, afq_i in zip(["1", "2"], [afq1_in, afq2_in]): @@ -2365,6 +2368,9 @@ def run_ac_mode_v2(self, afq1_in, afq2_in, dfqs, varconfile, outpath): def build_fastqs_from_donors_v2(self, acceptor_fqsin, acceptor_reads_to_exclude, distributed_donor_reads, donor_reads, forward_reverse, outpath): for x in range(len(acceptor_fqsin)): + fqoutname = self.set_fastq_out_path(outpath, forward_reverse, x + 1) + self.vaselogger.debug(f"bfmdV2 xdistr dread ids: {distributed_donor_reads[x]}") donor_reads_to_add = [donor_reads[y] for y in distributed_donor_reads[x]] - self.write_vase_fastq_v2(acceptor_fqsin[x], outpath, acceptor_reads_to_exclude, donor_reads_to_add, + self.vaselogger.debug(f"bfmdV2 dread ids to add: {donor_reads_to_add}") + self.write_vase_fastq_v2(acceptor_fqsin[x], fqoutname, acceptor_reads_to_exclude, donor_reads_to_add, distributed_donor_reads[x], forward_reverse) From 38fc27e775374009d30b87179426e8fa4216bb79 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 8 Oct 2019 13:03:58 +0200 Subject: [PATCH 73/90] Implement semi random read distribution in AC-mode --- VaSeBuilder.py | 7 ++++++- vase.py | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index c4805fa..9f20231 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2370,7 +2370,12 @@ def build_fastqs_from_donors_v2(self, acceptor_fqsin, acceptor_reads_to_exclude, for x in range(len(acceptor_fqsin)): fqoutname = self.set_fastq_out_path(outpath, forward_reverse, x + 1) self.vaselogger.debug(f"bfmdV2 xdistr dread ids: {distributed_donor_reads[x]}") - donor_reads_to_add = [donor_reads[y] for y in distributed_donor_reads[x]] + + # Filter the donor reads specific to the template file + selected_donor_reads = [donor_reads[y] for y in distributed_donor_reads[x]] + donor_reads_to_add = [] + [donor_reads_to_add.extend(x) for x in selected_donor_reads] + self.vaselogger.debug(f"bfmdV2 dread ids to add: {donor_reads_to_add}") self.write_vase_fastq_v2(acceptor_fqsin[x], fqoutname, acceptor_reads_to_exclude, donor_reads_to_add, distributed_donor_reads[x], forward_reverse) diff --git a/vase.py b/vase.py index bc6abb9..b83ad21 100644 --- a/vase.py +++ b/vase.py @@ -296,9 +296,9 @@ def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): varconfile = VariantContextFile(paramcheck.get_variantcontext_infile()) if "A" in runmode: donor_fastq_files = self.read_donor_fastq_list_file(paramcheck.get_donorfqlist()) - vaseb.run_ac_mode(paramcheck.get_first_fastq_in_location(), - paramcheck.get_second_fastq_in_location(), - donor_fastq_files, varconfile, paramcheck.get_fastq_out_location()) + vaseb.run_ac_mode_v2(paramcheck.get_first_fastq_in_location(), + paramcheck.get_second_fastq_in_location(), + donor_fastq_files, varconfile, paramcheck.get_fastq_out_location()) return # Refetch the donor reads required when runmode (D,F,P) contains a 'C' bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist()) From a924592aa124ad27045b8fa77b135eb17f8f9ca1 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 8 Oct 2019 13:31:47 +0200 Subject: [PATCH 74/90] Make variant filter empty list, not None, if sample not in variant filter file --- VaSeBuilder.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 9f20231..32ff2b0 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1959,6 +1959,8 @@ def bvcs_set_variant_filter(self, sampleid, variant_list): if variant_list is not None: if sampleid in variant_list: sample_variant_filter = variant_list[sampleid] + else: + sample_variant_filter = [] return sample_variant_filter def bvcs_write_output_files(self, outpath, varcon_outpath, variantcontextfile, vcfsamplemap, bamsamplemap, From e6a60e1b47b1bf696821f7f761196dc37e890d3e Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 8 Oct 2019 13:56:55 +0200 Subject: [PATCH 75/90] Fix missed VBUUID addition to output files --- VaSeBuilder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 32ff2b0..36c0458 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1168,7 +1168,7 @@ def write_used_donor_files(self, outfileloc, filesamplemap, """ try: with open(outfileloc, "w") as outfile: - outfile.write(f"VBUUID: {vbuuid}") + outfile.write(f"#VBUUID: {vbuuid}\n") outfile.write("#SampleId\tDonorFile\n") for sampleid, samplefile in filesamplemap.items(): if samplefile in used_donor_files: @@ -1265,7 +1265,7 @@ def write_bed_file(self, variantcontextdata, bedoutloc, vbuuid): """ try: with open(bedoutloc, "w") as bedoutfile: - bedoutfile.write(f"VBUUID: {vbuuid}") + bedoutfile.write(f"#VBUUID: {vbuuid}\n") for varcon in variantcontextdata: bedoutfile.write(f"{varcon.get_variant_context_chrom()}\t{varcon.get_variant_context_start()}\t" f"{varcon.get_variant_context_end()}\t{varcon.get_variant_context_id()}\n") From 478f7e035cfdd1a566e9a79d9efc151886d6a243 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 8 Oct 2019 16:56:00 +0200 Subject: [PATCH 76/90] Implement seed parameter, change call to shuffle_donor_add_positions to use number of read ids --- VaSeBuilder.py | 22 ++++++++++++---------- vase.py | 10 ++++++---- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 36c0458..1a08d48 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1532,7 +1532,7 @@ def run_d_mode(self, variantcontextfile, fq_out): self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.") self.vaselogger.info("Finished writing donor FastQ files.") - def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out): + def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out, random_seed): """Runs VaSeBuilder F-mode. This run mode creates a full set of validation fastq files from established variant contexts and a set of @@ -1561,7 +1561,7 @@ def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out): # Write the fastq files. self.vaselogger.info(f"Start writing the R{i} FastQ files.") fq_starttime = time.time() - self.build_fastq_v2(fq_i, skip_list, add_list, i, fq_out) + self.build_fastq_v2(fq_i, skip_list, add_list, i, fq_out, random_seed) self.vaselogger.info(f"Wrote all R{i} FastQ files.") self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.") self.vaselogger.info("Finished writing FastQ files.") @@ -2148,7 +2148,7 @@ def check_template_size(self, templatefqloc): return int(line_count/4) def build_fastq_v2(self, acceptorfq_filepaths, acceptorreads_toskip, donor_context_reads, forward_or_reverse, - vasefq_outpath): + vasefq_outpath, random_seed): """Builds and writes a set of validation fastq files. A set of validation fastq files is build using a set of template/acceptor fastq files. Acceptor reads will be @@ -2190,12 +2190,12 @@ def build_fastq_v2(self, acceptorfq_filepaths, acceptorreads_toskip, donor_conte self.vaselogger.debug(f"Set FastQ output path to: {vasefq_outname}") self.write_vase_fastq_v2(acceptorfq_filepaths[x], vasefq_outname, acceptorreads_toskip, add_donor_reads, add_donor_ids, - forward_or_reverse) + forward_or_reverse, random_seed) # Builds a new FastQ file to be used for validation. def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, acceptorreads_toskip, donorbamreaddata, - donor_readids, fr): + donor_readids, fr, random_seed): """Creates and writes a single VaSeBuilder validation fastq file. Parameters @@ -2221,7 +2221,8 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, # Determine where to semi randomly add the donor reads in the fastq num_of_template_reads = self.check_template_size(acceptor_infq) self.vaselogger.debug(f"Template has {num_of_template_reads} reads") - donor_add_positions = self.shuffle_donor_add_positions(num_of_template_reads, len(donorbamreaddata)) + donor_add_positions = self.shuffle_donor_add_positions(num_of_template_reads, len(donor_readids), + random_seed) self.vaselogger.debug(f"Add positions for {fastq_outpath} = {donor_add_positions}") donor_reads_to_addpos = self.link_donor_addpos_reads_v2(donor_add_positions, donor_readids, donorbamreaddata) @@ -2336,7 +2337,7 @@ def read_donor_fastq(self, donor_fastq, forward_reverse, donor_read_data): return donor_read_data # Temporary new AC-mode method to test semi random read distribution - def run_ac_mode_v2(self, afq1_in, afq2_in, dfqs, varconfile, outpath): + def run_ac_mode_v2(self, afq1_in, afq2_in, dfqs, varconfile, random_seed, outpath): self.vaselogger.info("Running VaSeBuilder AC-mode") # Split the donor fastqs into an R1 and R2 group r1_dfqs = [dfq[0] for dfq in dfqs] @@ -2364,11 +2365,12 @@ def run_ac_mode_v2(self, afq1_in, afq2_in, dfqs, varconfile, outpath): # Iterate over the acceptor fastq files for fr_i, afq_i in zip(["1", "2"], [afq1_in, afq2_in]): - self.build_fastqs_from_donors_v2(afq_i, skip_list, distributed_donor_read_ids, donor_reads, fr_i, outpath) + self.build_fastqs_from_donors_v2(afq_i, skip_list, distributed_donor_read_ids, donor_reads, fr_i, + random_seed, outpath) # Temporary new build_fastqs_from_donors() to test semi random read distribution in AC-mode def build_fastqs_from_donors_v2(self, acceptor_fqsin, acceptor_reads_to_exclude, distributed_donor_reads, - donor_reads, forward_reverse, outpath): + donor_reads, forward_reverse, random_seed, outpath): for x in range(len(acceptor_fqsin)): fqoutname = self.set_fastq_out_path(outpath, forward_reverse, x + 1) self.vaselogger.debug(f"bfmdV2 xdistr dread ids: {distributed_donor_reads[x]}") @@ -2380,4 +2382,4 @@ def build_fastqs_from_donors_v2(self, acceptor_fqsin, acceptor_reads_to_exclude, self.vaselogger.debug(f"bfmdV2 dread ids to add: {donor_reads_to_add}") self.write_vase_fastq_v2(acceptor_fqsin[x], fqoutname, acceptor_reads_to_exclude, donor_reads_to_add, - distributed_donor_reads[x], forward_reverse) + distributed_donor_reads[x], forward_reverse, random_seed) diff --git a/vase.py b/vase.py index b83ad21..95d70e0 100644 --- a/vase.py +++ b/vase.py @@ -75,7 +75,7 @@ def main(self): variantfilter = self.read_variant_list(pmc.get_variant_list_location()) # Run the selected mode. - self.run_selected_mode(pmc.get_runmode(), vase_b, pmc, variantfilter) + self.run_selected_mode(pmc.get_runmode(), vase_b, pmc, variantfilter, pmc.get_random_seed_value()) self.vaselogger.info("VaSeBuilder run completed succesfully.") elapsed = time.strftime( @@ -165,7 +165,7 @@ def get_vase_parameters(self): vase_argpars.add_argument("-dq", "--donorfastqs", dest="donorfastqs", help="Location to donor fastq list file") vase_argpars.add_argument("-c", "--config", dest="configfile", help="Supply a config file") - vase_argpars.add_argument("-s", "--seed", dest="seed", default=2, + vase_argpars.add_argument("-s", "--seed", dest="seed", default=2, type=int, help="Set seed for semi randomly distributing donor reads") vase_args = vars(vase_argpars.parse_args()) return vase_args @@ -232,6 +232,8 @@ def read_config_file(self, configfileloc): # Check if the parameter is the debug parameter elif parameter_name == "debug": configdata[parameter_name] = parameter_value.title() in debug_param_vals + elif parameter_name == "seed": + configdata[parameter_name] = int(parameter_value) else: configdata[parameter_name] = parameter_value.strip() except IOError: @@ -263,7 +265,7 @@ def read_donor_fastq_list_file(self, donorfq_listfileloc): finally: return donor_fastqs - def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): + def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter, randomseed): """Selects and runs the selected run mode. Depending on the specified run mode either a full validation set of fastq files or fastq files containing only @@ -298,7 +300,7 @@ def run_selected_mode(self, runmode, vaseb, paramcheck, variantfilter): donor_fastq_files = self.read_donor_fastq_list_file(paramcheck.get_donorfqlist()) vaseb.run_ac_mode_v2(paramcheck.get_first_fastq_in_location(), paramcheck.get_second_fastq_in_location(), - donor_fastq_files, varconfile, paramcheck.get_fastq_out_location()) + donor_fastq_files, varconfile, randomseed, paramcheck.get_fastq_out_location()) return # Refetch the donor reads required when runmode (D,F,P) contains a 'C' bam_file_map = vbscan.scan_bamcram_files(paramcheck.get_valid_bam_filelist()) From 7f880f05a8a9b4f278f6ba3d8b3f70c066f6c39b Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 9 Oct 2019 09:48:00 +0200 Subject: [PATCH 77/90] Small fix for donor read identifiers in AC-mode --- VaSeBuilder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 1a08d48..d207847 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2324,7 +2324,7 @@ def read_donor_fastq(self, donor_fastq, forward_reverse, donor_read_data): with gzip.open(donor_fastq, 'rt') as dfqfile: for dfqfileline in dfqfile: if dfqfileline.startswith("@"): - dfq_readid = dfqfileline.strip() + dfq_readid = dfqfileline[1:].strip() dfq_readseq = next(dfqfile).strip() dfq_optline = next(dfqfile).strip() dfq_readqual = next(dfqfile).strip() From d4d5600ed5e9754bd48daa614474a1535ce99fe9 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Wed, 9 Oct 2019 11:19:18 +0200 Subject: [PATCH 78/90] Change log message to properly display the position for each added donor read --- VaSeBuilder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index d207847..f97738a 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2253,10 +2253,10 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, + str(donorread[2]) + "\n" + "+\n" + str(donorread[3]) + "\n") - self.vaselogger.debug(f"Added donor read {donorread[0]}/{donorread[1]} at " - f"{cur_add_index}") fqgz_outfile.write(fqlines.encode("utf-8")) cur_add_index += 1 + self.vaselogger.debug(f"Added donor read {donorread[0]}/{donorread[1]} at " + f"{cur_add_index}") cur_line_index += 1 fqgz_infile.close() From cee2ac4ff1224690fa8117ebee5d8b2fc7561b96 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 10 Oct 2019 15:27:29 +0200 Subject: [PATCH 79/90] Use mor efficient range in shuffle_donor_add_positions, add method to write insert positions file --- VaSeBuilder.py | 64 +++++++++++++++++++++++++++++++++------- tests/TestVaSeBuilder.py | 20 +++++++++++++ 2 files changed, 74 insertions(+), 10 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index f97738a..7dcb0ea 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2074,9 +2074,6 @@ def shuffle_donor_add_positions(self, num_of_template_reads, num_of_donor_reads, add_positions : list of int Shuffled positions in fastq file to add donor reads to """ - # Establish the number of possible entries - shuffled_add_positions = list(range(0, num_of_template_reads, 1)) - random.seed(s) self.vaselogger.debug(f"Semi random donor add positions seed set to {s}") add_positions = [] @@ -2088,13 +2085,13 @@ def shuffle_donor_add_positions(self, num_of_template_reads, num_of_donor_reads, while random_sample_size > 0: pos_to_add = [] if random_sample_size >= num_of_template_reads: - pos_to_add = random.sample(shuffled_add_positions, num_of_template_reads) + pos_to_add = random.sample(range(0, num_of_template_reads), num_of_template_reads) else: - pos_to_add = random.sample(shuffled_add_positions, random_sample_size) + pos_to_add = random.sample(range(0, ), random_sample_size) add_positions.extend(pos_to_add) random_sample_size = random_sample_size - num_of_template_reads return add_positions - add_positions = random.sample(shuffled_add_positions, num_of_donor_reads) + add_positions = random.sample(range(0, num_of_template_reads), num_of_donor_reads) return add_positions def write_donor_output_bam(self, bamoutpath, donorreads): @@ -2223,10 +2220,10 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, self.vaselogger.debug(f"Template has {num_of_template_reads} reads") donor_add_positions = self.shuffle_donor_add_positions(num_of_template_reads, len(donor_readids), random_seed) - self.vaselogger.debug(f"Add positions for {fastq_outpath} = {donor_add_positions}") + # self.vaselogger.debug(f"Add positions for {fastq_outpath} = {donor_add_positions}") donor_reads_to_addpos = self.link_donor_addpos_reads_v2(donor_add_positions, donor_readids, donorbamreaddata) - self.vaselogger.debug(f"Read to add pos for {fastq_outpath} = {donor_reads_to_addpos}") + # self.vaselogger.debug(f"Read to add pos for {fastq_outpath} = {donor_reads_to_addpos}") # Open the template fastq and write filtered data to a new fastq.gz file. fqgz_infile = io.BufferedReader(gzip.open(acceptor_infq, "rb")) @@ -2246,7 +2243,7 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, # Check if we need to add a donor read at the current position if cur_line_index in donor_reads_to_addpos: - self.vaselogger.debug(f"{cur_line_index} is in list of positions to add donor") + # self.vaselogger.debug(f"{cur_line_index} is in list of positions to add donor") for donorread in donor_reads_to_addpos[cur_line_index]: if donorread[1] == fr: fqlines = ("@" + str(donorread[0]) + "\n" @@ -2292,7 +2289,7 @@ def link_donor_addpos_reads(self, donor_addpos, donor_reads): if addpos not in add_posread_link: add_posread_link[addpos] = [] add_posread_link[addpos].append(dread) - self.vaselogger.debug(f"Added {dread[0]}/{dread[1]} at insert pos {addpos}") + # self.vaselogger.debug(f"Added {dread[0]}/{dread[1]} at insert pos {addpos}") return add_posread_link def link_donor_addpos_reads_v2(self, donor_addpos, donor_read_ids, donor_reads): @@ -2383,3 +2380,50 @@ def build_fastqs_from_donors_v2(self, acceptor_fqsin, acceptor_reads_to_exclude, self.vaselogger.debug(f"bfmdV2 dread ids to add: {donor_reads_to_add}") self.write_vase_fastq_v2(acceptor_fqsin[x], fqoutname, acceptor_reads_to_exclude, donor_reads_to_add, distributed_donor_reads[x], forward_reverse, random_seed) + + def write_donor_insert_positions_v2(self, inserted_position_data, outpath): + """Writes the insert positions for each set of reads. + + Insert positions are written per read identifier. + + Parameters + ---------- + inserted_position_data : dict + Position data + outpath : str + Path and name to write the donor read insert position data to + """ + try: + with open(outpath, "w") as ipd_outfile: + ipd_outfile.write(f"#VBUUID: {self.creation_id}\n") + ipd_outfile.write("ReadId\tR1_InsertPos\tFastqR1Out\tR2_InsertPos\tFastqR2Out\n") + + # Iterate over the donor read insert position data and write to file + for fastqout in inserted_position_data: + readid_list = [x for x in inserted_position_data[fastqout]] + readid_list.sort() + + # Iterate over each read identifier for the specified output R1 and R2 fastq set. + for readid in readid_list: + r1_insertpos = self.get_saved_insert_position("1", inserted_position_data[fastqout][readid]) + r2_insertpos = self.get_saved_insert_position("2", inserted_position_data[fastqout][readid]) + + ipd_outfile.write(f"{readid}\t{r1_insertpos}\t{fastqout}_R1.fastq\t{r2_insertpos}\t" + f"{fastqout}_R2.fastq\n") + except IOError: + self.vaselogger.warning(f"Could not write donor insert position data to {outpath}") + + def get_saved_insert_position(self, readpn, read_insert_data): + """ + + Parameters + ---------- + readpn : str + :param read_insert_data: + :return: + """ + if readpn in read_insert_data: + readpn_index = read_insert_data.index(readpn) + if readpn_index < len(read_insert_data)-1: + return read_insert_data[read_insert_data.index(readpn)+1] + return "NA" diff --git a/tests/TestVaSeBuilder.py b/tests/TestVaSeBuilder.py index e2c977f..6a68ece 100644 --- a/tests/TestVaSeBuilder.py +++ b/tests/TestVaSeBuilder.py @@ -6,6 +6,7 @@ from unittest.mock import patch, mock_open, call import os import sys +import random import pysam # Import required class @@ -230,3 +231,22 @@ def test_divide_remaining_donors(self): cordivdonors = [['aR1.fq', 'bR1.fq', 'eR1.fq'], ['cR1.fq', 'dR1.fq']] self.assertListEqual(self.vs_builder.divide_remaining_donors(divdonors), cordivdonors, f"The correct divided donor fastq list should have been {cordivdonors}") + + # Tests that the shuffling of donor read works as expected + def test_shuffle_donor_read_identifiers(self): + read_ids = ["Read4", "Read3", "Read5", "Read1", "Read2"] + + shuffled_answer = ["Read3", "Read2", "Read4", "Read5", "Read1"] + shuffled_answer = read_ids.copy() + shuffled_answer.sort() + random.seed(2) + random.shuffle(shuffled_answer) + + self.assertListEqual(self.vs_builder.shuffle_donor_read_identifiers(read_ids, 2), shuffled_answer, + f"The correct shuffled donor read ids should have been {shuffled_answer}") + + def test_shuffle_donor_add_positions(self): + add_positions = [6, 8, 2, 0, 1] + + shuffled_answer = add_positions.copy() + From c52adfc30868584d72942433030cedc2e628a04d Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 10 Oct 2019 16:06:49 +0200 Subject: [PATCH 80/90] Add some tests for get_saved_insert_positions --- VaSeBuilder.py | 12 +++++++++--- tests/TestVaSeBuilder.py | 23 +++++++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 7dcb0ea..534e956 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2414,13 +2414,19 @@ def write_donor_insert_positions_v2(self, inserted_position_data, outpath): self.vaselogger.warning(f"Could not write donor insert position data to {outpath}") def get_saved_insert_position(self, readpn, read_insert_data): - """ + """Returns the insert position of a read, or 'NA' if not available. Parameters ---------- readpn : str - :param read_insert_data: - :return: + The readpair number ('1' or '2') + read_insert_data : tuple of str and int + The read insert data + + Returns + ------- + int or str + The insert position, 'NA' if not available """ if readpn in read_insert_data: readpn_index = read_insert_data.index(readpn) diff --git a/tests/TestVaSeBuilder.py b/tests/TestVaSeBuilder.py index 6a68ece..675d327 100644 --- a/tests/TestVaSeBuilder.py +++ b/tests/TestVaSeBuilder.py @@ -250,3 +250,26 @@ def test_shuffle_donor_add_positions(self): shuffled_answer = add_positions.copy() + def test_get_saved_insert_position_r1(self): + read_data = ('1', 100, '2', 100) + r1pos_answer = 100 + self.assertEqual(self.vs_builder.get_saved_insert_position('1', read_data), r1pos_answer, + f"The R1 read insert position should have been {r1pos_answer}") + + def test_get_saved_insert_position_r2(self): + read_data = ('1', 100, '2', 150) + r2pos_answer = 150 + self.assertEqual(self.vs_builder.get_saved_insert_position('2', read_data), r2pos_answer, + f"The R2 read insert position should have been {r2pos_answer}") + + def test_get_saved_insert_position_nor1(self): + read_data = ('2', 150) + r1pos_answer = 'NA' + self.assertEqual(self.vs_builder.get_saved_insert_position('1', read_data), r1pos_answer, + "The R1 read insert position should have been NA") + + def test_get_saved_insert_position_nor2(self): + read_data = ('1', 100) + r2pos_answer = 'NA' + self.assertEqual(self.vs_builder.get_saved_insert_position('2', read_data), r2pos_answer, + "The R2 read insert position should have been NA") From e5706f234f0f5a5eee201bbce715131b557ad326 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 11 Oct 2019 14:12:10 +0200 Subject: [PATCH 81/90] Implement writing donor insert positions to output file --- VaSeBuilder.py | 81 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 12 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 534e956..731b41c 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -1555,16 +1555,18 @@ def run_f_mode(self, variantcontextfile, fq1_in, fq2_in, fq_out, random_seed): add_list = variantcontextfile.get_all_variant_context_donor_reads() self.vaselogger.info("Writing FastQ files.") skip_list = set(variantcontextfile.get_all_variant_context_acceptor_read_ids()) - #skip_list = list(set(variantcontextfile.get_all_variant_context_acceptor_read_ids())) + # skip_list = list(set(variantcontextfile.get_all_variant_context_acceptor_read_ids())) + donor_read_add_data = {} for i, fq_i in zip(["1", "2"], [fq1_in, fq2_in]): # Write the fastq files. self.vaselogger.info(f"Start writing the R{i} FastQ files.") fq_starttime = time.time() - self.build_fastq_v2(fq_i, skip_list, add_list, i, fq_out, random_seed) + self.build_fastq_v2(fq_i, skip_list, add_list, i, fq_out, random_seed, donor_read_add_data) self.vaselogger.info(f"Wrote all R{i} FastQ files.") self.vaselogger.debug(f"Writing R{i} FastQ file(s) took {time.time() - fq_starttime} seconds.") self.vaselogger.info("Finished writing FastQ files.") + self.write_donor_insert_positions_v2(donor_read_add_data, f"{fq_out}_donor_read_insert_positions.txt") def run_p_mode(self, variantcontextfile, outpath, fq_out): """Runs VaSeBuilder P-mode. @@ -2145,7 +2147,7 @@ def check_template_size(self, templatefqloc): return int(line_count/4) def build_fastq_v2(self, acceptorfq_filepaths, acceptorreads_toskip, donor_context_reads, forward_or_reverse, - vasefq_outpath, random_seed): + vasefq_outpath, random_seed, donor_read_insert_data): """Builds and writes a set of validation fastq files. A set of validation fastq files is build using a set of template/acceptor fastq files. Acceptor reads will be @@ -2185,14 +2187,19 @@ def build_fastq_v2(self, acceptorfq_filepaths, acceptorreads_toskip, donor_conte # Write the new VaSe FastQ file. vasefq_outname = self.set_fastq_out_path(vasefq_outpath, forward_or_reverse, x + 1) self.vaselogger.debug(f"Set FastQ output path to: {vasefq_outname}") + + # Check whether to add the fastq file name into the donor read insert position map + if vasefq_outname.split(".")[0][:-3] not in donor_read_insert_data: + donor_read_insert_data[vasefq_outname.split(".")[0][:-3]] = {} + self.write_vase_fastq_v2(acceptorfq_filepaths[x], vasefq_outname, acceptorreads_toskip, add_donor_reads, add_donor_ids, - forward_or_reverse, random_seed) + forward_or_reverse, random_seed, donor_read_insert_data) # Builds a new FastQ file to be used for validation. def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, acceptorreads_toskip, donorbamreaddata, - donor_readids, fr, random_seed): + donor_readids, fr, random_seed, donor_read_insert_data): """Creates and writes a single VaSeBuilder validation fastq file. Parameters @@ -2208,6 +2215,7 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, fr : str Forward('1') or reverse ('2') fastq file """ + fastq_prefix = fastq_outpath.split(".")[0][:-3] try: fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb")) self.vaselogger.debug(f"Writing data to validation fastq {fastq_outpath}") @@ -2254,6 +2262,8 @@ def write_vase_fastq_v2(self, acceptor_infq, fastq_outpath, cur_add_index += 1 self.vaselogger.debug(f"Added donor read {donorread[0]}/{donorread[1]} at " f"{cur_add_index}") + self.add_donor_insert_data(fastq_prefix, donorread[0], fr, cur_add_index, + donor_read_insert_data) cur_line_index += 1 fqgz_infile.close() @@ -2349,37 +2359,60 @@ def run_ac_mode_v2(self, afq1_in, afq2_in, dfqs, varconfile, random_seed, outpat for fr_i, dfq_i in zip(["1", "2"], [r1_dfqs, r2_dfqs]): for x in range(len(dfq_i)): donor_reads = self.read_donor_fastq(dfq_i[x], fr_i, donor_reads) - self.vaselogger.debug(f"ACmodeV2 Donor reads: {donor_reads}") + # self.vaselogger.debug(f"ACmodeV2 Donor reads: {donor_reads}") # Divide the donor reads equally over the template fastq files donor_read_ids = list(donor_reads.keys()) donor_read_ids.sort() - self.vaselogger.debug(f"ACmodeV2 dread ids: {donor_read_ids}") + # self.vaselogger.debug(f"ACmodeV2 dread ids: {donor_read_ids}") # Distribute the read identifiers over the fastq files distributed_donor_read_ids = self.divide_donorfastqs_over_acceptors(donor_read_ids, len(afq1_in)) - self.vaselogger.debug(f"ACmodeV2 distr dread ids: {distributed_donor_read_ids}") + # self.vaselogger.debug(f"ACmodeV2 distr dread ids: {distributed_donor_read_ids}") # Iterate over the acceptor fastq files + donor_read_add_data = {} for fr_i, afq_i in zip(["1", "2"], [afq1_in, afq2_in]): self.build_fastqs_from_donors_v2(afq_i, skip_list, distributed_donor_read_ids, donor_reads, fr_i, - random_seed, outpath) + random_seed, outpath, donor_read_add_data) + self.write_donor_insert_positions_v2(donor_read_add_data, f"{outpath}_donor_read_insert_positions.txt") # Temporary new build_fastqs_from_donors() to test semi random read distribution in AC-mode def build_fastqs_from_donors_v2(self, acceptor_fqsin, acceptor_reads_to_exclude, distributed_donor_reads, - donor_reads, forward_reverse, random_seed, outpath): + donor_reads, forward_reverse, random_seed, outpath, donor_read_insert_data): + """Builds a set of R1/R2 validation fastq files using existing donor fastq files. + + Parameters + ---------- + acceptor_fqsin: list of str + Template/Acceptor fastq files to use + acceptor_reads_to_exclude: list of str + distributed_donor_reads : list of list of str + donor_reads: dict + forward_reverse: str + Whether to construct an R1 or R2 fastq file + random_seed: int + Random seed to use for shuffling donor add positions + outpath: str + Path top write produced fastq file to + donor_read_insert_data: + """ for x in range(len(acceptor_fqsin)): fqoutname = self.set_fastq_out_path(outpath, forward_reverse, x + 1) self.vaselogger.debug(f"bfmdV2 xdistr dread ids: {distributed_donor_reads[x]}") + # Add the fastq file entry to the insert positions map + if fqoutname.split(".")[0][:-3] not in donor_read_insert_data: + donor_read_insert_data[fqoutname.split(".")[0][:-3]] = {} + # Filter the donor reads specific to the template file selected_donor_reads = [donor_reads[y] for y in distributed_donor_reads[x]] donor_reads_to_add = [] [donor_reads_to_add.extend(x) for x in selected_donor_reads] - self.vaselogger.debug(f"bfmdV2 dread ids to add: {donor_reads_to_add}") + # self.vaselogger.debug(f"bfmdV2 dread ids to add: {donor_reads_to_add}") self.write_vase_fastq_v2(acceptor_fqsin[x], fqoutname, acceptor_reads_to_exclude, donor_reads_to_add, - distributed_donor_reads[x], forward_reverse, random_seed) + distributed_donor_reads[x], forward_reverse, random_seed, donor_read_insert_data) def write_donor_insert_positions_v2(self, inserted_position_data, outpath): """Writes the insert positions for each set of reads. @@ -2433,3 +2466,27 @@ def get_saved_insert_position(self, readpn, read_insert_data): if readpn_index < len(read_insert_data)-1: return read_insert_data[read_insert_data.index(readpn)+1] return "NA" + + def add_donor_insert_data(self, fqoutname, readid, forward_reverse, insertpos, donor_insert_data): + """ + + Parameters + ---------- + fqoutname : str + Name of fastq file + readid : str + Read identifier for which to save the insert position + forward_reverse : str + Indicator whether the read is R1 or R2 + insertpos : int + Position the donor read was inserted at in the validation fastq + donor_insert_data: dict + Saved donor read insertions into validation fastq + """ + if fqoutname in donor_insert_data: + if readid in donor_insert_data[fqoutname]: + if forward_reverse not in donor_insert_data[fqoutname][readid]: + donor_insert_data[fqoutname][readid] = donor_insert_data[fqoutname][readid] + \ + (forward_reverse, insertpos) + else: + donor_insert_data[fqoutname][readid] = (forward_reverse, insertpos) From 65b49901716ce86422be35c517a7bf303c3115c4 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 11 Oct 2019 15:40:18 +0200 Subject: [PATCH 82/90] Update unittests for DonorBamRead --- tests/TestDonorBamRead.py | 57 ++++++++++++++++++++++++++++++++++----- tests/TestVaSeBuilder.py | 1 - 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/tests/TestDonorBamRead.py b/tests/TestDonorBamRead.py index 0c8a672..3a1f3f5 100644 --- a/tests/TestDonorBamRead.py +++ b/tests/TestDonorBamRead.py @@ -6,17 +6,25 @@ class TestDonorBamRead(unittest.TestCase): # Creates the answer variables and the DonorBamRead used for testing def setUp(self): self.read_id_answer = "HHKY2CCXX160108:1:2122:24160:2522" + self.read_flag_answer = "143" self.read_pn_answer = "1" self.read_chrom_answer = "21" self.read_pos_answer = 9411193 self.read_len_answer = 151 + self.read_end_answer = 9411331 + self.read_cigarstring_answer = "13S138M" + self.read_rnext_answer = "21" + self.read_pnext_answer = 9411300 + self.read_tlen_answer = 4398 self.read_seq_answer = "AGAAAAAGTCTTTAGATGGGATCTTCCTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGTCCAATCAGA" \ "CTGAAATGCCTTGAGGCTAGATTTCAGTCTTTGTGGCAGCTGGTGAATTTCTAGTTTGCCTTTTCA" self.read_quals_answer = "><=???>==<=====<====<=<==<==<=====<============<==<========<=====<=<==<==>>==>>>>=>" \ ">==>>=>>>>>>>>>=>>>>>>=>>>=>>>=>>>>>?????????>=>>???>??????@@@?><:8>" self.read_map_q_answer = 40 - self.dbam_read = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_len_answer, self.read_seq_answer, + self.dbam_read = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_len_answer, + self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_map_q_answer) # ====================PERFORM THE TESTS FOR THE GETTER METHODS==================== @@ -24,12 +32,16 @@ def test_get_bam_read_id(self): self.assertEqual(self.dbam_read.get_bam_read_id(), self.read_id_answer, "Both BAM read identifiers should have " f"been {self.read_id_answer}") + def tet_get_bam_read_flag(self): + self.assertEqual(self.dbam_read.get_bam_read_flag(), self.read_flag_answer, + f"Both read flag should have been {self.read_flag_answer}") + def test_get_bam_read_pair_number(self): - self.assertEquals(self.dbam_read.get_bam_read_pair_number(), self.read_pn_answer, "Both read pair numbers " + self.assertEqual(self.dbam_read.get_bam_read_pair_number(), self.read_pn_answer, "Both read pair numbers " f"should have been {self.read_pn_answer}") def test_get_bam_read_chrom(self): - self.assertEquals(self.dbam_read.get_bam_read_chrom(), self.read_chrom_answer, "Both read chromosomes should " + self.assertEqual(self.dbam_read.get_bam_read_chrom(), self.read_chrom_answer, "Both read chromosomes should " f"have been {self.read_chrom_answer}") def test_get_bam_read_ref_pos(self): @@ -41,9 +53,28 @@ def test_get_bam_read_length(self): f"been {self.read_len_answer}") def test_get_bam_read_ref_end(self): - read_end_answer = 9411344 - self.assertEqual(self.dbam_read.get_bam_read_ref_end(), read_end_answer, "Both read end positions should have " - f"been {read_end_answer}") + self.assertEqual(self.dbam_read.get_bam_read_ref_end(), self.read_end_answer, "Both read end positions should have " + f"been {self.read_end_answer}") + + def test_get_bam_read_cigar(self): + self.assertEqual(self.dbam_read.get_bam_read_cigar(), self.read_cigarstring_answer, + f"Both read CIGAR strings should have been {self.read_cigarstring_answer}") + + def test_get_bam_read_rnext(self): + self.assertEqual(self.dbam_read.get_bam_read_rnext(), self.read_rnext_answer, + f"Both read RNEXT values should have been {self.read_rnext_answer}") + + def test_get_bam_read_pnext(self): + self.assertEqual(self.dbam_read.get_bam_read_pnext(), self.read_pnext_answer, + f"Both read PNEXT values should have been {self.read_pnext_answer}") + + def test_get_bam_read_tlen(self): + self.assertEqual(self.dbam_read.get_bam_read_tlen(), self.read_tlen_answer, + f"Both read TLEN values should have been {self.read_tlen_answer}") + + def test_get_bam_read_sequence(self): + self.assertEqual(self.dbam_read.get_bam_read_sequence(), self.read_seq_answer, + f"Both read sequences should have been {self.read_seq_answer}") def test_get_bam_read_qual(self): self.assertEqual(self.dbam_read.get_bam_read_qual(), self.read_quals_answer, "Both read qualities should have " @@ -64,6 +95,10 @@ def test_get_bam_read_mapq(self): self.assertEqual(self.dbam_read.get_mapping_qual(), self.read_map_q_answer, "Both MapQ values should have been " f"{self.read_map_q_answer}") + def test_read_has_hardclipped_bases(self): + self.assertFalse(self.dbam_read.read_has_hardclipped_bases(), + f"Read with {self.read_cigarstring_answer} should not have hard clipped bases.") + # ====================PERFORM THE TESTS FOR THE STATISTICS METHODS==================== def test_get_average_q_score(self): avg_qscore_answer = 28.490066225165563 @@ -100,3 +135,11 @@ def test_get_as_fast_q_seq_nopairnum(self): fastq_answer = f"@{self.read_id_answer}\n{self.read_seq_answer}\n+\n{self.read_quals_answer}\n" self.assertEqual(self.dbam_read.get_as_fastq_seq(), fastq_answer, "Both answers should have been " f"{fastq_answer}") + + def test_get_as_bam_read(self): + bamcram_entry = f"{self.read_id_answer}\t{self.read_flag_answer}\t{self.read_chrom_answer}\t" \ + f"{self.read_pos_answer}\t{self.read_map_q_answer}\t{self.read_cigarstring_answer}\t" \ + f"{self.read_rnext_answer}\t{self.read_pnext_answer}\t{self.read_tlen_answer}\t{self.read_seq_answer}" \ + f"{self.read_quals_answer}" + self.assertEqual(self.dbam_read.get_as_bam_read(), bamcram_entry, + f"The returned entry shoud have been {bamcram_entry}") diff --git a/tests/TestVaSeBuilder.py b/tests/TestVaSeBuilder.py index 675d327..51dc3cf 100644 --- a/tests/TestVaSeBuilder.py +++ b/tests/TestVaSeBuilder.py @@ -247,7 +247,6 @@ def test_shuffle_donor_read_identifiers(self): def test_shuffle_donor_add_positions(self): add_positions = [6, 8, 2, 0, 1] - shuffled_answer = add_positions.copy() def test_get_saved_insert_position_r1(self): From 6c6646b06bc58b37a07186ab69a6740599cccea7 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 11 Oct 2019 15:49:27 +0200 Subject: [PATCH 83/90] Update unittests for VcfVariant --- tests/TestVcfVariant.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/TestVcfVariant.py b/tests/TestVcfVariant.py index 911ea80..768f5e2 100644 --- a/tests/TestVcfVariant.py +++ b/tests/TestVcfVariant.py @@ -8,7 +8,7 @@ def setUp(self): self.variant_pos_answer = 100 self.variant_ref_answer = "C" self.variant_alts_answer = ("A", "G") - self.variant_filter_answer = ["PASS"] + self.variant_filter_answer = "PASS" self.variant_type_answer = "snp" self.variant_id_answer = f"{self.variant_chrom_answer}_{self.variant_pos_answer}" self.vcfvariant_obj_answer = VcfVariant(self.variant_chrom_answer, self.variant_pos_answer, @@ -58,14 +58,14 @@ def test_set_variant_alt_alleles(self): f"alleles should have been: {varalts_toset}") def test_get_variant_filter(self): - self.assertListEqual(self.vcfvariant_obj_answer.get_variant_filter(), self.variant_filter_answer, "Both variant" - f" filters should have been: {self.variant_filter_answer}") + self.assertEqual(self.vcfvariant_obj_answer.get_variant_filter(), self.variant_filter_answer, + f"Both variant filters should have been: {self.variant_filter_answer}") def test_set_variant_filter(self): - varfilter_touse = ['q10'] + varfilter_touse = "q10" self.vcfvariant_obj_answer.set_variant_filter(varfilter_touse) - self.assertListEqual(self.vcfvariant_obj_answer.get_variant_filter(), varfilter_touse, "The set variant filter " - f"should have been: {varfilter_touse}") + self.assertEqual(self.vcfvariant_obj_answer.get_variant_filter(), varfilter_touse, "The set variant filter " + f"should have been: {varfilter_touse}") def test_get_variant_type(self): self.assertEqual(self.vcfvariant_obj_answer.get_variant_type(), self.variant_type_answer, "Both variant types " From d02374891c63fe7f91524b3a09d47120867a744a Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 11 Oct 2019 16:00:17 +0200 Subject: [PATCH 84/90] Small updates to docstrings --- DonorBamRead.py | 18 +++++++++--------- OverlapContext.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/DonorBamRead.py b/DonorBamRead.py index 00a118f..b501ca1 100644 --- a/DonorBamRead.py +++ b/DonorBamRead.py @@ -92,7 +92,7 @@ def get_bam_read_id(self): Returns ------- - bam_read_id : str + self.bam_read_id : str The read identifier """ return self.bam_read_id @@ -112,7 +112,7 @@ def get_bam_read_pair_number(self): Returns ------- - bam_read_pairnum : str + self.bam_read_pairnum : str The read pair number """ return self.bam_read_pairnum @@ -122,7 +122,7 @@ def get_bam_read_chrom(self): Returns ------- - bam_read_chrom : str + self.bam_read_chrom : str The chromosome name where the read is mapped """ return self.bam_read_chrom @@ -132,7 +132,7 @@ def get_bam_read_ref_pos(self): Returns ------- - bam_read_ref_pos : int + self.bam_read_ref_pos : int The read leftmost genomic position """ return self.bam_read_ref_pos @@ -143,7 +143,7 @@ def get_bam_read_length(self): Returns ------- - bam_read_length : int + self.bam_read_length : int The length of the read """ return self.bam_read_length @@ -202,7 +202,7 @@ def get_bam_read_sequence(self): Returns ------- - bam_read_seq : str + self.bam_read_seq : str The sequence of the read """ return self.bam_read_seq @@ -212,7 +212,7 @@ def get_bam_read_qual(self): Returns ------- - bam_read_qual : str + self.bam_read_qual : str Read quality """ return self.bam_read_qual @@ -223,7 +223,7 @@ def get_bam_read_q_scores(self): Returns ------- - qscores : list + qscores : list of int Q-Score of the read """ qscores = [] @@ -236,7 +236,7 @@ def get_mapping_qual(self): Returns ------- - bam_read_map_qual : int + self.bam_read_map_qual : int The mapping quality (MAPQ) """ return self.bam_read_map_qual diff --git a/OverlapContext.py b/OverlapContext.py index fe3f102..e0863ca 100644 --- a/OverlapContext.py +++ b/OverlapContext.py @@ -21,7 +21,7 @@ class OverlapContext: The leftmost genomic position of the context context_end : int The rightmost genomic position of the context - context_bam_reads : list of str + context_bam_reads : list of DonorBamRead List of aligned reads unmapped_read_mate_ids : list of str List of read identifiers that have unmapped mates From ae9f51a2c7b554ec963f83050637896f11d8926a Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Fri, 11 Oct 2019 16:25:32 +0200 Subject: [PATCH 85/90] Add small test --- tests/TestOverlapContext.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/TestOverlapContext.py b/tests/TestOverlapContext.py index 6f12d6f..06898e1 100644 --- a/tests/TestOverlapContext.py +++ b/tests/TestOverlapContext.py @@ -55,6 +55,12 @@ def setUp(self): self.avg_med_read_mapq_answer = [40, 40] # ====================PERFORM THE TESTS FOR THE GETTER METHODS==================== + def test_get_context(self): + context_answer = [self.context_chrom_answer, self.context_origin_answer, self.context_start_answer, + self.context_end_answer] + self.assertListEqual(self.overlap_context.get_context(), context_answer, + f"The returned context array should have been {context_answer}") + def test_get_context_id(self): self.assertEqual(self.overlap_context.get_context_id(), self.context_id_answer, "The context id should have " f"been {self.context_id_answer}") From 9adbe9e4f70a6c41bf6ccd7da510d9f96a4246b4 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 15 Oct 2019 11:02:20 +0200 Subject: [PATCH 86/90] Updates to unittests for OverlapContext --- tests/TestOverlapContext.py | 57 ++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/tests/TestOverlapContext.py b/tests/TestOverlapContext.py index 06898e1..371ca54 100644 --- a/tests/TestOverlapContext.py +++ b/tests/TestOverlapContext.py @@ -8,23 +8,35 @@ class TestOverlapContext(unittest.TestCase): def setUp(self): # Create three DonorBamRead objects to add to the overlap context self.read_id_answer = "HHKY2CCXX160108:1:2122:24160:2522" + self.read_flag_answer = "143" self.read_pn_answer = "1" self.read_chrom_answer = "21" self.read_pos_answer = 9411193 self.read_len_answer = 151 + self.read_end_answer = 9411331 + self.read_cigarstring_answer = "13S138M" + self.read_rnext_answer = "21" + self.read_pnext_answer = 9411300 + self.read_tlen_answer = 4398 self.read_seq_answer = "AGAAAAAGTCTTTAGATGGGATCTTCCTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGTCCAATCAGA" \ "CTGAAATGCCTTGAGGCTAGATTTCAGTCTTTGTGGCAGCTGGTGAATTTCTAGTTTGCCTTTTCA" self.read_quals_answer = "><=???>==<=====<====<=<==<==<=====<============<==<========<=====<=<==<==>>==>>>>=>" \ ">==>>=>>>>>>>>>=>>>>>>=>>>=>>>=>>>>>?????????>=>>???>??????@@@?><:8>" self.read_mapq_answer = 40 - self.con_read1 = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_len_answer, self.read_seq_answer, + self.con_read1 = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_len_answer, + self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_mapq_answer) - self.con_read2 = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_len_answer, self.read_seq_answer, + self.con_read2 = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_len_answer, + self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_mapq_answer) - self.con_read3 = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_len_answer, self.read_seq_answer, + self.con_read3 = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_len_answer, + self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_mapq_answer) # Create the overlap context to test @@ -36,7 +48,7 @@ def setUp(self): self.context_end_answer = 9411344 self.context_len_answer = 151 self.context_bam_reads_answer = [self.con_read1, self.con_read2, self.con_read3] - self.unmapped_answer = [] + self.unmapped_answer = ["uread_1", "uread_2", "uread_3"] self.overlap_context = OverlapContext(self.context_id_answer, self.context_sample_answer, self.context_chrom_answer, self.context_origin_answer, self.context_start_answer, self.context_end_answer, @@ -154,10 +166,10 @@ def test_get_context_bam_read_qualities(self): self.assertEqual(self.overlap_context.get_context_bam_read_qualities(), context_read_quals_answer, "The read " f"qualites should have been {context_read_quals_answer}") - #def test_getContextBamReadQScores(self): - #contextReadQScoresAnswer = self.qscoreAnswer + self.qscoreAnswer + self.qscoreAnswer - #self.assertListEqual(self.overlapContext.getContextBamReadQScores(), contextReadQScoresAnswer, - # f"The Q-scores should have been {contextReadQScoresAnswer}") + def test_get_context_bam_read_q_scores(self): + context_read_q_scores_answer = [self.qscore_answer] + [self.qscore_answer] + [self.qscore_answer] + self.assertListEqual(self.overlap_context.get_context_bam_read_q_scores(), context_read_q_scores_answer, + f"The Q-scores should have been {context_read_q_scores_answer}") def test_get_context_bam_read_map_qs(self): context_read_map_qs_answer = [40, 40, 40] @@ -173,6 +185,29 @@ def test_read_is_in_context_neg(self): self.assertFalse(self.overlap_context.read_is_in_context(read_in_context_answer_neg), "Read id " f"{read_in_context_answer_neg} should not have been in the context") + # ====================PERFORM THE TESTS FOR UNMAPPED MATE IDS==================== + def test_add_unmapped_mate_id(self): + read_id_to_add = "uReadId1" + self.overlap_context.add_unmapped_mate_id(read_id_to_add) + self.assertTrue(read_id_to_add in self.overlap_context.unmapped_read_mate_ids, + f"Read id {read_id_to_add} should have been in the list of reads with unmapped mates") + + def test_set_unmapped_mate_ids(self): + unmapped_read_ids = ["uReadId1", "uReadId2", "uReadId3"] + self.overlap_context.set_unmapped_mate_ids(unmapped_read_ids) + self.assertListEqual(self.overlap_context.unmapped_read_mate_ids, unmapped_read_ids, + f"The list of unmapped read mate ids should have been {unmapped_read_ids}") + + def test_read_has_unmapped_mate_true(self): + read_id_to_search = "uread_1" + self.assertTrue(self.overlap_context.read_has_unmapped_mate(read_id_to_search), + f"Read {read_id_to_search} should have an unmapped mate") + + def test_read_has_unmapped_mate_false(self): + read_id_to_search = "fReadId1" + self.assertFalse(self.overlap_context.read_has_unmapped_mate(read_id_to_search), + f"Read {read_id_to_search} should not have an unmapped mate") + # ====================PERFORM THE TESTS FOR THE OVERLAP CONTEXT STATISTICS==================== def test_get_average_and_median_read_length(self): self.assertListEqual(self.overlap_context.get_average_and_median_read_length(), self.avg_med_read_len_answer, From 7529957642e9257310ad5c4b670ad2e108b33f7f Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 15 Oct 2019 16:21:05 +0200 Subject: [PATCH 87/90] Add some tests for VaSeBuilder and correct DonorBamRead creation for VariantContext tests --- VaSeBuilder.py | 14 ++++++ tests/TestVaSeBuilder.py | 92 +++++++++++++++++++++++++++---------- tests/TestVariantContext.py | 19 ++++++-- 3 files changed, 97 insertions(+), 28 deletions(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index 731b41c..b52b0c4 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -2303,6 +2303,20 @@ def link_donor_addpos_reads(self, donor_addpos, donor_reads): return add_posread_link def link_donor_addpos_reads_v2(self, donor_addpos, donor_read_ids, donor_reads): + """Links and returns donor add positions and donor reads. + + Parameters + ---------- + donor_addpos : list of int + Positions in validation fastq to add donor reads at + donor_reads : list of tuple + Donor reads to add to validation fastq + + Returns + ------- + add_posread_link : dict of list + Donor reads to add per add position + """ add_posread_link = {} for addpos, dread_id in zip(donor_addpos, donor_read_ids): if addpos not in add_posread_link: diff --git a/tests/TestVaSeBuilder.py b/tests/TestVaSeBuilder.py index 51dc3cf..4157740 100644 --- a/tests/TestVaSeBuilder.py +++ b/tests/TestVaSeBuilder.py @@ -27,6 +27,7 @@ def setUp(self): self.filter_to_use = ["aap", "noot", "mies"] self.vcfvarlist = [] + # ====================PERFORM THE TESTS FOR THE FILTER METHOD==================== # Tests that a value is indeed in the inclusion filter def test_passes_filter_posincl(self): val_to_use = "noot" @@ -59,7 +60,7 @@ def test_get_sample_vcf_variants_pos(self): variant_list_answer.sort() obtained_variant_list = self.vs_builder.get_sample_vcf_variants(vcffile) - obtained_list = [x.to_strng() for x in obtained_variant_list] + obtained_list = [x.to_string() for x in obtained_variant_list] obtained_list.sort() self.assertListEqual(obtained_list, variant_list_answer, "The obtained list of reads should have been " @@ -84,7 +85,7 @@ def test_determine_variant_type_indel(self): # Tests that an indel is determined correctly def test_determine_variant_type_indel2(self): variant_ref = "C,CGATC" - variant_alts = ("C") + variant_alts = ("C",) variant_type_answer = "indel" self.assertEqual(self.vs_builder.determine_variant_type(variant_ref, variant_alts), variant_type_answer, "The determined variant type should have been an indel") @@ -178,24 +179,27 @@ def test_is_required_read_neg(self): "Should have been false for forward read") # Tests setting the fastq output path for the F (R1) file - def test_setFastqOutPath(self): - output_path = "/aap/noot/mies" - fastq_out_name = "" + def test_set_fastq_out_path_r1(self): + outprefix = "/test/outdata/VaSe" + lanenum = 3 + outpath_answer = f"{outprefix}_{datetime.now().date()}_L{lanenum}_R1.fastq" + self.assertEqual(self.vs_builder.set_fastq_out_path(outprefix, '1', lanenum), outpath_answer, + f"The returned output path for R1 should have been {outpath_answer}") + + def test_set_fastq_out_path_r2(self): + outprefix = "/test/outdata/VaSe" + lanenum = 3 + outpath_answer = f"{outprefix}_{datetime.now().date()}_L{lanenum}_R2.fastq" + self.assertEqual(self.vs_builder.set_fastq_out_path(outprefix, '2', lanenum), outpath_answer, + f"The returned output path for R2 should have been {outpath_answer}") # Tests that the creation id of the VaSeBuilder object is set correctly - def test_getCreationId(self): + def test_get_creation_id(self): creationid_answer = "piet" vasebuilder_obj = VaSeBuilder(creationid_answer) self.assertEqual(vasebuilder_obj.get_creation_id(), creationid_answer, f"The returned VaSeBuilder identifier should have been: {creationid_answer}") - # Tests that the creation date of the VaSeBuilder is set correctly. - def test_get_creation_date(self): - creation_data_answer = datetime.now().date() - vasebuilder_obj = VaSeBuilder("aap") - self.assertEqual(vasebuilder_obj.get_creation_date(), creation_data_answer, - f"The returned VaSeBuilder creation data should have been: {creation_data_answer}") - # Tests that a saved context for a specified variant is indeed returned. def test_get_variant_context_pos(self): self.vs_builder.variantContextMap = self.var_con_map @@ -212,26 +216,20 @@ def test_get_variant_context_neg(self): # = # Tests that a list of donor is divided without a remainder list - def test_divide_donorfastqs_over_acceptors_noremainder(self): + def test_divide_donorfastqs_over_acceptors_equal(self): donorfq_list = ['aR1.fq', 'bR1.fq', 'cR1.fq', 'dR1.fq', 'eR1.fq', 'fR1.fq'] - divlist_answer = [['aR1.fq', 'bR1.fq'], ['cR1.fq', 'dR1.fq'], ['eR1.fq', 'fR1.fq']] + divlist_answer = [['fR1.fq', 'cR1.fq'], ['eR1.fq', 'bR1.fq'], ['dR1.fq', 'aR1.fq']] self.assertListEqual(self.vs_builder.divide_donorfastqs_over_acceptors(donorfq_list, 3), divlist_answer, f"The returned divided donor fastq list should have been {divlist_answer}") # Tests that a list of donors is divided with a remainder list - def test_divide_donorfastqs_over_acceptors_withremainder(self): + def test_divide_donorfastqs_over_acceptors_unequal(self): donorfq_list = ['aR1.fq', 'bR1.fq', 'cR1.fq', 'dR1.fq', 'eR1.fq'] - divlist_answer = [['aR1.fq', 'bR1.fq'], ['cR1.fq', 'dR1.fq'], ['eR1.fq']] + divlist_answer = [['eR1.fq', 'cR1.fq', 'aR1.fq'], ['dR1.fq', 'bR1.fq']] self.assertListEqual(self.vs_builder.divide_donorfastqs_over_acceptors(donorfq_list, 2), divlist_answer, f"The returned divided donor fastq list should have been {divlist_answer}") - # Tests that the remainders are added to the proper lists. - def test_divide_remaining_donors(self): - divdonors = [['aR1.fq', 'bR1.fq'], ['cR1.fq', 'dR1.fq'], ['eR1.fq']] - cordivdonors = [['aR1.fq', 'bR1.fq', 'eR1.fq'], ['cR1.fq', 'dR1.fq']] - self.assertListEqual(self.vs_builder.divide_remaining_donors(divdonors), cordivdonors, - f"The correct divided donor fastq list should have been {cordivdonors}") - + # ====================PERFORM TESTS FOR THE SHUFFLING METHODS==================== # Tests that the shuffling of donor read works as expected def test_shuffle_donor_read_identifiers(self): read_ids = ["Read4", "Read3", "Read5", "Read1", "Read2"] @@ -272,3 +270,49 @@ def test_get_saved_insert_position_nor2(self): r2pos_answer = 'NA' self.assertEqual(self.vs_builder.get_saved_insert_position('2', read_data), r2pos_answer, "The R2 read insert position should have been NA") + + # Adds a donor insert data position for a new fastq + def test_add_donor_insert_data_addnewfq(self): + donor_add_data = {} + fq_name = "ifq.fastq" + read_id = "iReadId1" + read_pairnum = "1" + insert_pos = 100 + donor_add_data_answer = {"ifq.fastq": [("1", 100)]} + self.vs_builder.add_donor_insert_data(fq_name, read_id, read_pairnum, insert_pos, donor_add_data) + self.assertDictEqual(donor_add_data, donor_add_data_answer, + f"Both donor read add data should have been {donor_add_data_answer}") + + # Tests adding a new donor insert position data for an existing fastq + def test_add_donor_insert_data_addnewread(self): + donor_add_data = {"ifq.fastq": []} + fq_name = "ifq.fastq" + read_id = "iReadId1" + read_pairnum = "1" + insert_pos = 100 + donor_add_data_answer = {"ifq.fastq": [("1", 100)]} + self.vs_builder.add_donor_insert_data(fq_name, read_id, read_pairnum, insert_pos, donor_add_data) + self.assertDictEqual(donor_add_data, donor_add_data_answer, + f"Both donor read add data should have been {donor_add_data_answer}") + + def test_add_donor_insert_data_addadditional(self): + donor_add_data = {"ifq.fastq": [("1", 100)]} + fq_name = "ifq.fastq" + read_id = "iReadId1" + read_pairnum = "2" + insert_pos = 100 + donor_add_data_answer = {"ifq.fastq": [("1", 100, "2", 100)]} + self.vs_builder.add_donor_insert_data(fq_name, read_id, read_pairnum, insert_pos, donor_add_data) + self.assertDictEqual(donor_add_data, donor_add_data_answer, + f"Both donor read add data should have been {donor_add_data_answer}") + + def test_link_donor_addpos_reads_v2(self): + addposlist = [6, 8] + readidlist = ['dRead1', 'dRead2'] + donor_reads = [('dRead1', '1', 'ACAT', '<<>>'), ('dRead1', '2', 'TACA', '>><<'), + ('dRead2', '1', 'CAAT', '<><>'), ('dRead2', '2', 'TAAC', '><><')] + donor_addpos_link_answer = {6: [('dRead1', '1', 'ACAT', '<<>>'), ('dRead1', '2', 'TACA', '>><<')], + 8: [('dRead2', '1', 'CAAT', '<><>'), ('dRead2', '2', 'TAAC', '><><')]} + self.assertDictEqual(self.vs_builder.link_donor_addpos_reads_v2(addposlist, readidlist, donor_reads), + donor_addpos_link_answer, + f"Both donor read/add position link maps should have been {donor_addpos_link_answer}") diff --git a/tests/TestVariantContext.py b/tests/TestVariantContext.py index c24d87f..265e332 100644 --- a/tests/TestVariantContext.py +++ b/tests/TestVariantContext.py @@ -9,22 +9,33 @@ def setUp(self): # Create the variables saving the context BAM reads self.read_id_answer = "HHKY2CCXX160108:1:2122:24160:2522" self.read_donor_id_answer = "HHKY2CCXX160108:1:2122:24160:2555" + self.read_flag_answer = "143" self.read_pn_answer = "1" self.read_donor_pn_answer = "2" self.read_chrom_answer = "21" self.read_pos_answer = 9411193 self.read_len_answer = 151 self.read_donor_len_answer = 108 + self.read_end_answer = 9411334 + self.read_cigarstring_answer = "13S138M" + self.read_donor_cigarstring_answer = "43H108M" + self.read_rnext_answer = "21" + self.read_pnext_answer = 9511300 + self.read_tlen_answer = 400 self.read_seq_answer = "AGAAAAAGTCTTTAGATGGGATCTTCCTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGTCCAATCAGA" \ "CTGAAATGCCTTGAGGCTAGATTTCAGTCTTTGTGGCAGCTGGTGAATTTCTAGTTTGCCTTTTCA" self.read_quals_answer = "><=???>==<=====<====<=<==<==<=====<============<==<========<=====<=<==<==>>==>>>>=>" \ ">==>>=>>>>>>>>>=>>>>>>=>>>=>>>=>>>>>?????????>=>>???>??????@@@?><:8>" self.read_map_q_answer = 40 - self.acceptor_read = DonorBamRead(self.read_id_answer, self.read_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_len_answer, self.read_seq_answer, + self.acceptor_read = DonorBamRead(self.read_id_answer, self.read_flag_answer, self.read_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_len_answer, + self.read_end_answer, self.read_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_map_q_answer) - self.donor_read = DonorBamRead(self.read_donor_id_answer, self.read_donor_pn_answer, self.read_chrom_answer, - self.read_pos_answer, self.read_donor_len_answer, self.read_seq_answer, + self.donor_read = DonorBamRead(self.read_donor_id_answer, self.read_flag_answer, self.read_donor_pn_answer, + self.read_chrom_answer, self.read_pos_answer, self.read_donor_len_answer, + self.read_end_answer, self.read_donor_cigarstring_answer, self.read_rnext_answer, + self.read_pnext_answer, self.read_tlen_answer, self.read_seq_answer, self.read_quals_answer, self.read_map_q_answer) self.acceptor_read_ids_answer = ["HHKY2CCXX160108:1:2122:24160:2522"] From 9ebb1408e6636e06b0e9af0447dbe9b487674560 Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Tue, 15 Oct 2019 17:02:23 +0200 Subject: [PATCH 88/90] Add some tests for the ParamChecker --- tests/TestParamChecker.py | 52 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/TestParamChecker.py b/tests/TestParamChecker.py index e4c42a8..bb520d9 100644 --- a/tests/TestParamChecker.py +++ b/tests/TestParamChecker.py @@ -355,3 +355,55 @@ def test_required_parameters_set_invalidxcmodeparam(self): set_parameters = {} self.assertFalse(self.param_check.required_parameters_set("XC", set_parameters), "The required parameters for XC mode should not have been ok and therefore return False") + + # =====TEST SOME OTHER METHODS===== + def test_get_required_runmode_parameters_acmode(self): + ac_params_answer = ["runmode", "templatefq1", "templatefq2", "donorfastqs", "varconin", "out"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("AC"), ac_params_answer, + f"The list of required AC-mode parameters should have been {ac_params_answer}") + + def test_get_required_runmode_parameters_dmode(self): + d_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("D"), d_params_answer, + f"The list of required D-mode parameters should have been {d_params_answer}") + + def test_get_required_runmode_parameters_dcmode(self): + dc_params_answer = ["runmode", "donorvcf", "donorbam", "out", "reference", "varconin"] + self.assertListEqual(self.param_check.get_required_runmode_parameters(""), dc_params_answer, + f"The list of required DC-mode parameters should have been {dc_params_answer}") + + def test_get_required_runmode_parameters_fmode(self): + f_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "templatefq1", "templatefq2", "out", + "reference"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("F"), f_params_answer, + f"The list of required F-mode parameters should have been {f_params_answer}") + + def test_get_required_runmode_parameters_fcmode(self): + fc_params_answer = ["runmode", "donorvcf", "donorbam", "templatefq1", "templatefq2", "out", "reference", + "varconin"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("FC"), fc_params_answer, + f"the list of required FC-mode parameters should have been {fc_params_answer}") + + def test_get_required_runmode_parameters_pmode(self): + p_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("P"), p_params_answer, + f"The list of required parameters should have been {p_params_answer}") + + def test_get_required_runmode_parameters_pcmode(self): + pc_params_answer = ["runmode", "donorvcf", "donorbam", "out", "reference", "varconin"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("PC"), pc_params_answer, + f"The list of required parameters should have been {pc_params_answer}") + + def test_get_required_runmode_parameters_xmode(self): + x_params_answer = ["runmode", "donorvcf", "donorbam", "acceptorbam", "out", "reference"] + self.assertListEqual(self.param_check.get_required_runmode_parameters("X"), x_params_answer, + f"The list of required parameters should have been {x_params_answer}") + + def test_get_required_runmode_parameters_nonexistent(self): + self.assertListEqual(self.param_check.get_required_runmode_parameters("T"), [], + f"The list of required parameters for a non existent mode should have been empty.") + + def test_get_optional_parameters(self): + opt_params_answer = ["fastqout", "varcon", "variantlist", "seed"] + self.assertListEqual(self.param_check.get_optional_parameters(), opt_params_answer, + f"The list of optional parameters should have been {opt_params_answer}") From 12d4a5264cd7d4f5679f0e2df4ec89b264d7f74c Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 17 Oct 2019 13:34:57 +0200 Subject: [PATCH 89/90] Add some tests and add small description about donor insert position output file --- VaSeBuilder.py | 14 +++++++++++++- docs/output_formats.md | 11 +++++++++++ tests/TestVaSeBuilder.py | 6 ++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/VaSeBuilder.py b/VaSeBuilder.py index b52b0c4..6196d03 100644 --- a/VaSeBuilder.py +++ b/VaSeBuilder.py @@ -55,6 +55,17 @@ def __init__(self, vaseid): "car": "Gathering combined context acceptor reads", "done": "Variant complete. Processing"} + self.cigar_tuple_table = {"M": 0, + "I": 1, + "D": 2, + "N": 3, + "S": 4, + "H": 5, + "P": 6, + "-": 7, + "X": 8, + "B": 9} + # Method to print debug messages. def debug_msg(self, step, variant_id, t0=False): process = self.debug_dict[step] @@ -2096,7 +2107,7 @@ def shuffle_donor_add_positions(self, num_of_template_reads, num_of_donor_reads, add_positions = random.sample(range(0, num_of_template_reads), num_of_donor_reads) return add_positions - def write_donor_output_bam(self, bamoutpath, donorreads): + def write_donor_output_bam(self, bamoutpath, donorreads, acceptorbam): """Writes a set of donor reads as a bam file. Parameters @@ -2105,6 +2116,7 @@ def write_donor_output_bam(self, bamoutpath, donorreads): Path and name to write the output to donorreads : list of DonorBamRead Donor reads to place in the output BAM file + acceptorbam """ print("Constructing BAM file") diff --git a/docs/output_formats.md b/docs/output_formats.md index 1b825ac..8903cdf 100644 --- a/docs/output_formats.md +++ b/docs/output_formats.md @@ -15,6 +15,17 @@ _/path/to/donor1.bam


+### Donor insert positions +VaSeBuilder add donor reads at semi random positions in the resulting validation set. The recorded position is the +position of a full read (in a fastq a donor read with insert position 1 would be written to lines 5,6,7 and 8). +
+ +_\#VBUUID {vbuuid}
+ReadId  R1_InsertPos  FastqR1Out  R2_InsertPos  FastqR2Out
+dReadId1  15  /testdata/vase_R1.fastq   15  /testdata/vase_R2.fastq_ +

+ + ### Donor variant files A list of paths to donor variant (VCF/BCF) files used in building the variant contexts. A list could be:

_/path/to/donor1.vcf
diff --git a/tests/TestVaSeBuilder.py b/tests/TestVaSeBuilder.py index 4157740..b30edcd 100644 --- a/tests/TestVaSeBuilder.py +++ b/tests/TestVaSeBuilder.py @@ -316,3 +316,9 @@ def test_link_donor_addpos_reads_v2(self): self.assertDictEqual(self.vs_builder.link_donor_addpos_reads_v2(addposlist, readidlist, donor_reads), donor_addpos_link_answer, f"Both donor read/add position link maps should have been {donor_addpos_link_answer}") + + def test_check_template_size(self): + num_of_template_reads = 56732 + obtained_template_size = self.vs_builder.check_template_size("testdata/fqDir/SRR1039513_1.fastq.gz") + self.assertEqual(obtained_template_size, num_of_template_reads, + f"The number of template reads was expected to be {num_of_template_reads}") From bb390a84f09fc42b985b7407a7822406208b67fc Mon Sep 17 00:00:00 2001 From: MatthieuBeukers Date: Thu, 17 Oct 2019 17:11:20 +0200 Subject: [PATCH 90/90] Start simplyfing VaSeUtils structure and add some tools --- VaSeUtils/VaSeUtilHelper.py | 75 ++++++++++++++++++++++++ VaSeUtils/VaSeUtilRunner.py | 113 ++++++++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+) create mode 100644 VaSeUtils/VaSeUtilRunner.py diff --git a/VaSeUtils/VaSeUtilHelper.py b/VaSeUtils/VaSeUtilHelper.py index f0f33c2..be02d67 100644 --- a/VaSeUtils/VaSeUtilHelper.py +++ b/VaSeUtils/VaSeUtilHelper.py @@ -6,6 +6,39 @@ class VaSeUtilHelper: def __init__(self): self.vaseutillogger = logging.getLogger("VaSeUtil_Logger") + self.parameter_map = {"-m": "RUNMODE", + "--runmode": "RUNMODE", + "-v": "DONORVCF", + "--donorvcf": "DONORVCF", + "-b": "DONORBAM", + "--donorbam": "DONORBAM", + "-a": "ACCEPTORBAM", + "--acceptorbam": "ACCEPTORBAM", + "-1": "TEMPLATEFQ1", + "--templatefq1": "TEMPLATEFQ1", + "-2": "TEMPLATEFQ2", + "--templatefq2": "TEMPLATEFQ2", + "-o": "OUT", + "--out": "OUT", + "-r": "REFERENCE", + "--reference": "REFERENCE", + "-of": "", + "--fastqout": "", + "-ov": "", + "--varcon": "", + "-l": "LOG", + "--log": "LOG", + "-!": "DEBUG", + "-vl": "VARIANTLIST", + "--variantlist": "VARIANTLIST", + "-iv": "VARCONIN", + "--varconin": "VARCONIN", + "-dq": "DONORFASTQS", + "--donorfastqs": "DONORFASTQS", + "-c": "CONFIG", + "--config": "CONFIG", + "-s": "SEED", + "--seed": "SEED"} # Returns whether something is in the filter or not def passes_filter(self, valtocheck, filterlist): @@ -116,3 +149,45 @@ def get_read_pair_num(self, pysam_bamread): if pysam_bamread.is_read1: return "1" return "2" + + def determine_variant_type(self, vcfvariantref, vcfvariantalts): + """Determines and returns the + + Parameters + ---------- + vcfvariantref : str + Variant reference allele(s) + vcfvariantalts : tuple of str + Variant alternative allele(s) + + Returns + ------- + str + Type of variant (snp/indel) + """ + maxreflength = max([len(x) for x in vcfvariantref.split(",")]) + maxaltlength = max([len(x) for x in vcfvariantalts]) + + # Check based on the reference and alternative lengths whether the variant is a SNP or indel. + if maxreflength == 1 and maxaltlength == 1: + return "snp" + elif maxreflength > 1 or maxaltlength > 1: + return "indel" + return "?" + + def get_config_param_name(self, paramflag): + """Returns the config parameter name for a parameter flag. + + Parmeters + --------- + paramflag : str + Parameter flag to obtain config parameter + + Returns + ------- + str + Config parameter name if parameter flag is in map, empty string otherwise + """ + if paramflag in self.parameter_map: + return self.parameter_map[paramflag] + return "" diff --git a/VaSeUtils/VaSeUtilRunner.py b/VaSeUtils/VaSeUtilRunner.py new file mode 100644 index 0000000..73bc9c8 --- /dev/null +++ b/VaSeUtils/VaSeUtilRunner.py @@ -0,0 +1,113 @@ +import os +from VariantContextFile import VariantContextFile +from VaSeUtilHelper import VaSeUtilHelper + + +class VaSeUtilRunner: + def __init__(self): + self.vuh = VaSeUtilHelper() + + def check_donor_files_exist(self, donorlistfile, samplefilter=None): + """Checks whether the donor files used in a VaSeBuilder run still exist. + + Parameters + ---------- + donorlistfile : str + File with list of used donor files n VaSeBuilder run + samplefilter : list of str + Filter with sample names + """ + missing_count = 0 + total_count = 0 + try: + with open(donorlistfile, "r") as dlfile: + next(dlfile) # Skip the header line of the file + for fileline in dlfile: + filelinedata = fileline.strip().split("\t") + donorfiles = filelinedata[1].split(",") + + # Check whether the used donor files exist + if self.vuh.passes_filter(filelinedata[0], samplefilter): + for dfile in donorfiles: + total_count += 1 + if not os.path.isfile(dfile): + missing_count += 1 + print(f"Donor file {dfile} for sample {filelinedata[0]} could not be found. Maybe it " + "has been moved, renamed or deleted?") + print(f"Found {total_count - missing_count}/{total_count} donor files.") + except IOError: + print(f"Could not open {donorlistfile}") + finally: + print("Finished running VaSe util CheckDonorFilesExist") + + def combine_variant_context_file(self): + print("aap") + + def generate_config_file_from_command(self, vasebuilder_command): + print("aap") + + def log_info(self, vaselogloc, logfilter): + """Displays log entries satisfying the set log level filter. + + Parameters + ---------- + vaselogloc: + Path to VaSeBuilder log file + logfilter: list of str + Filter with log level(s) + """ + try: + with open(vaselogloc, 'r') as vaselogfile: + for fileline in vaselogfile: + fileline_elements = fileline.split("\t") + if len(fileline_elements) >= 3: + if self.vuh.passes_filter(fileline_elements[2], logfilter): + print(fileline.strip()) + except IOError as ioe: + print(f"Could not open log file") + + def subset_vcf(self, varconfile, vcffileloc): + """Subsets a single VCF file. The filtered entries are written to a new file. + + varconfile : VariantContextFile + Variant contexts to use as filter + vcffileloc : str + Path to the VCF file to filter + """ + tmp_data = vcffileloc.split(".") + out_name = f"{tmp_data[0]}.varconfiltered." + ".".join(tmp_data[1:]) + try: + vcf_file = pysam.VariantFile(vcffileloc, "r") + filtered_vcf_file = pysam.VariantFile(out_name, "w", header=vcf_file.header) + + # Iterate over the VCF file that to be filtered nd write variants overlapping with a context + for vcfvariant in vcf_file.fetch(): + varianttype = self.vuh.determine_variant_type(vcfvariant.ref, vcfvariant.alts) + max_ref = max(len(x) for x in vcfvariant.ref.split(",")) + max_alt = max(len(x) for x in vcfvariant.alts) + var_endpos = max([max_ref, max_alt]) + if varconfile.variant_is_in_context(varianttype, vcfvariant.chrom, vcfvariant.start-1, + vcfvariant.stop+1): + filtered_vcf_file.write(vcfvariant) + + filtered_vcf_file.close() + vcf_file.close() + except IOError: + print(f"Could not process variant file {vcffileloc}") + + def subset_acceptor_vcf(self): + print("aap") + + def subset_vcf_by_variant_contexts(self, variantcontextfile, vcffile_list): + """Subsets a filter of VCF files using a variant context files. + + Parameters + ---------- + variantcontextfile : VariantContextFile + Variant context file to use as filter + vcffile_list : list of str + List of VCF files to filter + """ + varconfile = VariantContextFile(variantcontextfile) + for vcffile in vcffile_list: + self.subset_vcf(varconfile, vcffile)