From cc40aa4b066dee47c2a561ba70424f24442980a9 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Fri, 26 Apr 2024 14:58:46 +0100 Subject: [PATCH 1/3] add handling of index error when finding variant position for spiking into VCF --- src/pheval/prepare/create_spiked_vcf.py | 20 +++++++++++++++----- tests/test_create_spiked_vcf.py | 19 +++++++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/pheval/prepare/create_spiked_vcf.py b/src/pheval/prepare/create_spiked_vcf.py index e76ef8ed6..e2a835ce3 100644 --- a/src/pheval/prepare/create_spiked_vcf.py +++ b/src/pheval/prepare/create_spiked_vcf.py @@ -337,13 +337,23 @@ def construct_vcf_records(self) -> List[str]: """ updated_vcf_records = copy(self.vcf_contents) for variant in self.proband_causative_variants: - variant = self.construct_variant_entry(variant) - variant_entry_position = [ + variant_entry = self.construct_variant_entry(variant) + matching_indices = [ i for i, val in enumerate(updated_vcf_records) - if val.split("\t")[0] == variant[0] and int(val.split("\t")[1]) < int(variant[1]) - ][-1] + 1 - updated_vcf_records.insert(variant_entry_position, "\t".join(variant)) + if val.split("\t")[0] == variant_entry[0] + and int(val.split("\t")[1]) < int(variant_entry[1]) + ] + if matching_indices: + variant_entry_position = matching_indices[-1] + 1 + else: + info_log.warning( + f"Could not find entry position for {variant.variant.chrom}-{variant.variant.pos}-" + f"{variant.variant.ref}-{variant.variant.alt} in VCF records, " + "inserting at end of VCF contents." + ) + variant_entry_position = len(updated_vcf_records) + updated_vcf_records.insert(variant_entry_position, "\t".join(variant_entry)) return updated_vcf_records def construct_header(self, updated_vcf_records: List[str]) -> List[str]: diff --git a/tests/test_create_spiked_vcf.py b/tests/test_create_spiked_vcf.py index 1667ec0c2..92d0d7fff 100644 --- a/tests/test_create_spiked_vcf.py +++ b/tests/test_create_spiked_vcf.py @@ -243,6 +243,18 @@ def setUpClass(cls) -> None: ], VcfHeader("TEMPLATE", "GRCh37", True), ) + cls.vcf_spiker_new_variant_chrom = VcfSpiker( + hg19_vcf, + [ + ProbandCausativeVariant( + "TEST1", + "GRCh37", + GenomicVariant("X", 123450, "G", "A"), + "heterozygous", + ) + ], + VcfHeader("TEMPLATE", "GRCh37", True), + ) cls.vcf_spiker_multiple_variants = VcfSpiker( hg19_vcf, [ @@ -339,6 +351,13 @@ def test_construct_vcf_records_multiple_variants(self): "chr3\t61580860\t.\tG\tA\t100\tPASS\t.\t" "GT\t1/1\n", ) + def test_construct_vcf_records_new_variant_pos(self): + updated_records = self.vcf_spiker_new_variant_chrom.construct_vcf_records() + self.assertEqual( + updated_records[48], + "chrX\t123450\t.\tG\tA\t100\tPASS\t.\tGT\t0/1\n", + ) + def test_construct_header(self): self.assertEqual( [ From 59505022acf8450b34826957eff9f22f92f77c19 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Tue, 30 Apr 2024 13:44:19 +0100 Subject: [PATCH 2/3] print VCF file name when unable to find variant entry position --- src/pheval/prepare/create_spiked_vcf.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/pheval/prepare/create_spiked_vcf.py b/src/pheval/prepare/create_spiked_vcf.py index e2a835ce3..ec25806ee 100644 --- a/src/pheval/prepare/create_spiked_vcf.py +++ b/src/pheval/prepare/create_spiked_vcf.py @@ -328,10 +328,13 @@ def construct_variant_entry(self, proband_variant_data: ProbandCausativeVariant) genotype_codes[proband_variant_data.genotype.lower()] + "\n", ] - def construct_vcf_records(self) -> List[str]: + def construct_vcf_records(self, template_vcf_name: str) -> List[str]: """ Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. + Args: + template_vcf_name (str): Name of the template VCF file. + Returns: List[str]: Updated VCF records containing the spiked variants. """ @@ -349,7 +352,7 @@ def construct_vcf_records(self) -> List[str]: else: info_log.warning( f"Could not find entry position for {variant.variant.chrom}-{variant.variant.pos}-" - f"{variant.variant.ref}-{variant.variant.alt} in VCF records, " + f"{variant.variant.ref}-{variant.variant.alt} in {template_vcf_name}, " "inserting at end of VCF contents." ) variant_entry_position = len(updated_vcf_records) @@ -375,14 +378,17 @@ def construct_header(self, updated_vcf_records: List[str]) -> List[str]: updated_vcf_file.append(text) return updated_vcf_file - def construct_vcf(self) -> List[str]: + def construct_vcf(self, template_vcf_name: str) -> List[str]: """ Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. + Args: + template_vcf_name (str): Name of the template VCF file. + Returns: List[str]: The complete spiked VCF file content as a list of strings. """ - return self.construct_header(self.construct_vcf_records()) + return self.construct_header(self.construct_vcf_records(template_vcf_name)) class VcfWriter: @@ -464,7 +470,7 @@ def spike_vcf_contents( chosen_template_vcf.vcf_contents, phenopacket_causative_variants, chosen_template_vcf.vcf_header, - ).construct_vcf(), + ).construct_vcf(chosen_template_vcf.vcf_file_name), ) From 79fb20c1fa41c26f114370fdd8a443050a86717f Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Tue, 30 Apr 2024 13:51:13 +0100 Subject: [PATCH 3/3] add missing arguments for template vcf name --- tests/test_create_spiked_vcf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_create_spiked_vcf.py b/tests/test_create_spiked_vcf.py index 92d0d7fff..f2b0b38d0 100644 --- a/tests/test_create_spiked_vcf.py +++ b/tests/test_create_spiked_vcf.py @@ -336,12 +336,12 @@ def test_construct_variant_structural_variant(self): def test_construct_vcf_records_single_variant(self): self.assertEqual( - self.vcf_spiker.construct_vcf_records()[40], + self.vcf_spiker.construct_vcf_records("template.vcf")[40], "chr1\t886190\t.\tG\tA\t100\tPASS\t.\t" "GT\t0/1\n", ) def test_construct_vcf_records_multiple_variants(self): - updated_records = self.vcf_spiker_multiple_variants.construct_vcf_records() + updated_records = self.vcf_spiker_multiple_variants.construct_vcf_records("template.vcf") self.assertEqual( updated_records[40], "chr1\t886190\t.\tG\tA\t100\tPASS\t.\t" "GT\t0/1\n", @@ -352,7 +352,7 @@ def test_construct_vcf_records_multiple_variants(self): ) def test_construct_vcf_records_new_variant_pos(self): - updated_records = self.vcf_spiker_new_variant_chrom.construct_vcf_records() + updated_records = self.vcf_spiker_new_variant_chrom.construct_vcf_records("template.vcf") self.assertEqual( updated_records[48], "chrX\t123450\t.\tG\tA\t100\tPASS\t.\tGT\t0/1\n",