From edfb3564ccb0514f22cdcf640821bbfbb580aadd Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Mon, 30 Oct 2023 15:51:37 +0100 Subject: [PATCH] Add large format vcf option --- fake_vcf/__main__.py | 2 ++ fake_vcf/vcf_faker.py | 46 +++++++++++++++++++++++++++++---------- fake_vcf/vcf_generator.py | 10 ++++++++- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/fake_vcf/__main__.py b/fake_vcf/__main__.py index 6da49b6..90a3b0f 100644 --- a/fake_vcf/__main__.py +++ b/fake_vcf/__main__.py @@ -46,6 +46,7 @@ def main( help="Sample prefix ex: SAM => SAM0000001 SAM0000002", ), phased: bool = typer.Option(default=True, help="Simulate phased"), + large_format: bool = typer.Option(default=True, help="Write large format vcf"), print_version: bool = typer.Option( None, "-v", @@ -63,6 +64,7 @@ def main( seed=seed, sample_prefix=sample_prefix, phased=phased, + large_format=large_format, ) diff --git a/fake_vcf/vcf_faker.py b/fake_vcf/vcf_faker.py index 8b9f25e..d05513b 100644 --- a/fake_vcf/vcf_faker.py +++ b/fake_vcf/vcf_faker.py @@ -15,6 +15,7 @@ def __init__( sample_prefix: str = "SAMPLES", random_seed: Optional[int] = None, phased: bool = True, + large_format: bool = True, ): self.num_rows = num_rows self.rows_remaining = num_rows + 1 # One for the header @@ -24,23 +25,32 @@ def __init__( self.phased = phased # Use a per instance seed for reproducibility self.random = random.Random(random_seed) + self.large_format = large_format + self.header = "\n".join( + [ + "##fileformat=VCFv4.2", + f"##source=VCFake {version}", + '##FILTER=', + '##INFO=', + f"##contig=", + "##reference=ftp://ftp.example.com/sample.fa", + '##INFO=', + '##INFO=', + '##FORMAT=', + ] + ) # VCF file format header - self.header = ( + if self.large_format: "\n".join( [ - "##fileformat=VCFv4.2", - f"##source=VCFake {version}", - '##FILTER=', - '##INFO=', - f"##contig=", - "##reference=ftp://ftp.example.com/sample.fa", - '##INFO=', - '##INFO=', - '##FORMAT=', + '##FORMAT=', + '##FORMAT=', + '##FORMAT=', + '##FORMAT=', ] ) - + "\n" - ) # VCF file format header + + self.header += "\n" if num_samples < 1 or num_rows < 1: raise ValueError(f"Nr of samples and rows must be greater or equal to 1") @@ -70,6 +80,18 @@ def __init__( int(num_samples / 300), ] + if self.large_format: + extra_data = [ + "0,30:30:89:913,89,0", + "0,10:10:49:413,33,0", + "0,20:20:55:489,89,0", + "0,40:00:66:726,85,0", + ] + + self.sample_values = [ + f"{sv}:{random.choice(extra_data)}" for sv in self.sample_values + ] + self.avail_samples = deque( self.random.choices( self.sample_values, diff --git a/fake_vcf/vcf_generator.py b/fake_vcf/vcf_generator.py index 89c6fee..98f82f8 100644 --- a/fake_vcf/vcf_generator.py +++ b/fake_vcf/vcf_generator.py @@ -33,7 +33,14 @@ def to_vcf_file(virtual_vcf: VirtualVCF, fake_vcf_path: Path, num_rows: int) -> def fake_vcf_data( - fake_vcf_path, num_rows, num_samples, chromosome, seed, sample_prefix, phased + fake_vcf_path, + num_rows, + num_samples, + chromosome, + seed, + sample_prefix, + phased, + large_format, ): virtual_vcf = VirtualVCF( num_rows=num_rows, @@ -42,6 +49,7 @@ def fake_vcf_data( sample_prefix=sample_prefix, random_seed=seed, phased=phased, + large_format=large_format, ) if fake_vcf_path is None: