Skip to content

Commit

Permalink
Add large format vcf option
Browse files Browse the repository at this point in the history
  • Loading branch information
endast committed Oct 30, 2023
1 parent 76f10e0 commit edfb356
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 13 deletions.
2 changes: 2 additions & 0 deletions fake_vcf/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def main(
help="Sample prefix ex: SAM => SAM0000001 SAM0000002",
),
phased: bool = typer.Option(default=True, help="Simulate phased"),
large_format: bool = typer.Option(default=True, help="Write large format vcf"),
print_version: bool = typer.Option(
None,
"-v",
Expand All @@ -63,6 +64,7 @@ def main(
seed=seed,
sample_prefix=sample_prefix,
phased=phased,
large_format=large_format,
)


Expand Down
46 changes: 34 additions & 12 deletions fake_vcf/vcf_faker.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def __init__(
sample_prefix: str = "SAMPLES",
random_seed: Optional[int] = None,
phased: bool = True,
large_format: bool = True,
):
self.num_rows = num_rows
self.rows_remaining = num_rows + 1 # One for the header
Expand All @@ -24,23 +25,32 @@ def __init__(
self.phased = phased
# Use a per instance seed for reproducibility
self.random = random.Random(random_seed)
self.large_format = large_format
self.header = "\n".join(
[
"##fileformat=VCFv4.2",
f"##source=VCFake {version}",
'##FILTER=<ID=PASS,Description="All filters passed">',
'##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
f"##contig=<ID={chromosome}>",
"##reference=ftp://ftp.example.com/sample.fa",
'##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1)">',
'##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
]
) # VCF file format header

self.header = (
if self.large_format:
"\n".join(
[
"##fileformat=VCFv4.2",
f"##source=VCFake {version}",
'##FILTER=<ID=PASS,Description="All filters passed">',
'##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
f"##contig=<ID={chromosome}>",
"##reference=ftp://ftp.example.com/sample.fa",
'##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1)">',
'##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Phased Genotype">',
'##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">',
'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">',
'##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
'##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype Likelihoods">',
]
)
+ "\n"
) # VCF file format header

self.header += "\n"

if num_samples < 1 or num_rows < 1:
raise ValueError(f"Nr of samples and rows must be greater or equal to 1")
Expand Down Expand Up @@ -70,6 +80,18 @@ def __init__(
int(num_samples / 300),
]

if self.large_format:
    # Extra per-sample payloads appended (colon-separated) after each
    # existing sample value when the large format is requested.
    # NOTE(review): the fields presumably line up with the AD:DP:GQ:PL
    # FORMAT header entries -- confirm against the row writer.
    extra_data = [
        "0,30:30:89:913,89,0",
        "0,10:10:49:413,33,0",
        "0,20:20:55:489,89,0",
        "0,40:00:66:726,85,0",
    ]

    # Draw from the per-instance generator (self.random), not the global
    # `random` module: the global generator ignores the seed given to this
    # instance, which made large-format output differ between runs even
    # when the same random_seed was supplied.
    self.sample_values = [
        f"{sv}:{self.random.choice(extra_data)}" for sv in self.sample_values
    ]

self.avail_samples = deque(
self.random.choices(
self.sample_values,
Expand Down
10 changes: 9 additions & 1 deletion fake_vcf/vcf_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,14 @@ def to_vcf_file(virtual_vcf: VirtualVCF, fake_vcf_path: Path, num_rows: int) ->


def fake_vcf_data(
fake_vcf_path, num_rows, num_samples, chromosome, seed, sample_prefix, phased
fake_vcf_path,
num_rows,
num_samples,
chromosome,
seed,
sample_prefix,
phased,
large_format,
):
virtual_vcf = VirtualVCF(
num_rows=num_rows,
Expand All @@ -42,6 +49,7 @@ def fake_vcf_data(
sample_prefix=sample_prefix,
random_seed=seed,
phased=phased,
large_format=large_format,
)

if fake_vcf_path is None:
Expand Down

0 comments on commit edfb356

Please sign in to comment.