From 590eb5acaf1a33a4dac8ce97b97699fcd77b58c3 Mon Sep 17 00:00:00 2001 From: Edgardo Ortiz Date: Thu, 18 Mar 2021 14:48:46 +0100 Subject: [PATCH] Updating to v2.6 Added support for GVCF from GATK Added option to redirect output to a specific folder Added option to rename the prefix of the output matrices Improved error catching for malformed lines in VCF --- .gitignore | 2 +- README.md | 27 +++++++++---- vcf2phylip.py | 107 ++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 98 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index 253ed50..4b6b0a0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ .DS_Store -vcf2phylip_v2.4.py +vcf2phylip_v*.py diff --git a/README.md b/README.md index 6de2264..50b2a32 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # vcf2phylip -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2540861.svg)](https://doi.org/10.5281/zenodo.2540861) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2540861.svg)](https://doi.org/10.5281/zenodo.2540861) Convert SNPs in VCF format to PHYLIP, NEXUS, binary NEXUS, or FASTA alignments for phylogenetic analysis ## _Brief description_ @@ -17,8 +17,9 @@ Please don't hesitate to open an [`Issue`](https://github.com/edgardomortiz/vcf2 Just type `python vcf2phylip.py -h` to show the help of the program: ``` -usage: vcf2phylip.py [-h] -i FILENAME [-m MIN_SAMPLES_LOCUS] [-o OUTGROUP] - [-p] [-f] [-n] [-b] [-r] [-v] +usage: vcf2phylip.py [-h] -i FILENAME [--output-folder FOLDER] + [--output-prefix PREFIX] [-m MIN_SAMPLES_LOCUS] + [-o OUTGROUP] [-p] [-f] [-n] [-b] [-r] [-v] The script converts a collection of SNPs in VCF format into a PHYLIP, FASTA, NEXUS, or binary NEXUS file for phylogenetic analysis. The code is optimized @@ -31,6 +32,12 @@ optional arguments: -h, --help show this help message and exit -i FILENAME, --input FILENAME Name of the input VCF file, can be gzipped + --output-folder FOLDER + Output folder name, it will be created if it does not + exist (same folder as input by default) + --output-prefix PREFIX + Prefix for output filenames (same as the input VCF + filename without the extension by default) -m MIN_SAMPLES_LOCUS, --min-samples-locus MIN_SAMPLES_LOCUS Minimum of samples required to be present at a locus (default=4) @@ -39,13 +46,13 @@ optional arguments: written as first taxon in the alignment. -p, --phylip-disable A PHYLIP matrix is written by default unless you enable this flag - -f, --fasta Write a FASTA matrix, disabled by default - -n, --nexus Write a NEXUS matrix, disabled by default + -f, --fasta Write a FASTA matrix (disabled by default) + -n, --nexus Write a NEXUS matrix (disabled by default) -b, --nexus-binary Write a binary NEXUS matrix for analysis of biallelic SNPs in SNAPP, only diploid genotypes will be - processed, disabled by default. + processed (disabled by default) -r, --resolve-IUPAC Randomly resolve heterozygous genotypes to avoid IUPAC - ambiguities in the matrices + ambiguities in the matrices (disabled by default) -v, --version show program's version number and exit ``` @@ -92,12 +99,16 @@ python vcf2phylip.py -i myfile.vcf -r # This command will create only a PHYLIP matrix called myfile_min4.phy where IUPAC ambiguites have been randomly resolved ``` +_Example 6:_ Specify output folder and output prefix: +```bash +python vcf2phylip.py -i myfile.vcf.gz --output-folder /data/results --output-prefix mymatrix +# This command will create the file `matrix.min4.phy` in the folder `/data/results` ## _Credits_ - Code: [Edgardo M. Ortiz](mailto:e.ortiz.v@gmail.com) - Data and testing: [Juan D. Palacio-Mejía](mailto:jdpalacio@gmail.com) ## _Citation_ -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2540861.svg)](https://doi.org/10.5281/zenodo.2540861) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2540861.svg)](https://doi.org/10.5281/zenodo.2540861) **Ortiz, E.M. 2019.** vcf2phylip v2.0: convert a VCF matrix into several matrix formats for phylogenetic analysis. DOI:10.5281/zenodo.2540861 diff --git a/vcf2phylip.py b/vcf2phylip.py index 187ce88..1231f19 100755 --- a/vcf2phylip.py +++ b/vcf2phylip.py @@ -3,7 +3,7 @@ """ -The script converts a collection of SNPs in VCF format into a PHYLIP, FASTA, +The script converts a collection of SNPs in VCF format into a PHYLIP, FASTA, NEXUS, or binary NEXUS file for phylogenetic analysis. The code is optimized to process VCF files with sizes >1GB. For small VCF files the algorithm slows down as the number of taxa increases (but is still fast). @@ -14,16 +14,16 @@ __author__ = "Edgardo M. Ortiz" __credits__ = "Juan D. Palacio-Mejía" -__version__ = "2.5" +__version__ = "2.6" __email__ = "e.ortiz.v@gmail.com" -__date__ = "2021-03-16" +__date__ = "2021-03-18" import argparse import gzip -import os import random import sys +from pathlib import Path # Dictionary of IUPAC ambiguities for nucleotides @@ -64,7 +64,7 @@ def extract_sample_names(vcf_file): """ Extract sample names from VCF file """ - if vcf_file.endswith(".gz"): + if vcf_file.lower().endswith(".gz"): opener = gzip.open else: opener = open @@ -89,11 +89,13 @@ def is_anomalous(record, num_samples): def is_snp(record): """ - Determine if current VCF record is a SNP (single nucleotide polymorphism) as opposed to MNP + Determine if current VCF record is a SNP (single nucleotide polymorphism) as opposed to MNP (multinucleotide polymorphism) """ - return bool(len(record[3]) == 1 - and len(record[4]) - record[4].count(",") == record[4].count(",") + 1) + # must be replaced by the REF in the ALT field for GVCFs from GATK + alt = record[4].replace("", record[3]) + return bool(len(record[3]) == 1 + and len(alt) - alt.count(",") == alt.count(",") + 1) def num_genotypes(record, num_samples): @@ -112,14 +114,18 @@ def get_matrix_column(record, num_samples, resolve_IUPAC): Transform a VCF record into a phylogenetic matrix column with nucleotides instead of numbers """ nt_dict = {str(0): record[3].replace("-","*").upper(), ".": "N"} - alt = record[4].replace("-", "*") + # must be replaced by the REF in the ALT field for GVCFs from GATK + alt = record[4].replace("-", "*").replace("", nt_dict["0"]) alt = alt.split(",") for n in range(len(alt)): nt_dict[str(n+1)] = alt[n] column = "" for i in range(9, num_samples + 9): geno_num = record[i].split(":")[0].replace("/", "").replace("|", "") - geno_nuc = "".join(sorted(set([nt_dict[j] for j in geno_num]))) + try: + geno_nuc = "".join(sorted(set([nt_dict[j] for j in geno_num]))) + except KeyError: + return "malformed" if len(geno_nuc) == 1: column += geno_nuc elif resolve_IUPAC is False: @@ -131,13 +137,13 @@ def get_matrix_column(record, num_samples, resolve_IUPAC): def get_matrix_column_bin(record, num_samples): """ - Return an alignment column in NEXUS binary from a VCF record, if genotype is not diploid with at + Return an alignment column in NEXUS binary from a VCF record, if genotype is not diploid with at most two alleles it will return '?' as state """ column = "" for i in range(9, num_samples + 9): genotype = record[i].split(":")[0] - if len(genotype) == 3: + if genotype in gen_bin: column += gen_bin[genotype] else: column += "?" @@ -145,13 +151,24 @@ def get_matrix_column_bin(record, num_samples): def main(): - parser = argparse.ArgumentParser(description=__doc__, + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("-i", "--input", action = "store", dest = "filename", required = True, help = "Name of the input VCF file, can be gzipped") + parser.add_argument("--output-folder", + action = "store", + dest = "folder", + default = "./", + help = "Output folder name, it will be created if it does not exist (same folder as input by " + "default)") + parser.add_argument("--output-prefix", + action = "store", + dest = "prefix", + help = "Prefix for output filenames (same as the input VCF filename without the extension by " + "default)") parser.add_argument("-m", "--min-samples-locus", action = "store", dest = "min_samples_locus", @@ -171,20 +188,21 @@ def main(): parser.add_argument("-f", "--fasta", action = "store_true", dest = "fasta", - help = "Write a FASTA matrix, disabled by default") + help = "Write a FASTA matrix (disabled by default)") parser.add_argument("-n", "--nexus", action = "store_true", dest = "nexus", - help = "Write a NEXUS matrix, disabled by default") + help = "Write a NEXUS matrix (disabled by default)") parser.add_argument("-b", "--nexus-binary", action = "store_true", dest = "nexusbin", help = "Write a binary NEXUS matrix for analysis of biallelic SNPs in SNAPP, only diploid " - "genotypes will be processed, disabled by default.") + "genotypes will be processed (disabled by default)") parser.add_argument("-r", "--resolve-IUPAC", action = "store_true", dest = "resolve_IUPAC", - help = "Randomly resolve heterozygous genotypes to avoid IUPAC ambiguities in the matrices") + help = "Randomly resolve heterozygous genotypes to avoid IUPAC ambiguities in the matrices " + "(disabled by default)") parser.add_argument("-v", "--version", action = "version", version = "%(prog)s {version}".format(version=__version__)) @@ -192,6 +210,8 @@ def main(): filename = args.filename + folder = args.folder + prefix = args.prefix min_samples_locus = args.min_samples_locus outgroup = args.outgroup.split(",")[0].split(";")[0] phylipdisable = args.phylipdisable @@ -202,7 +222,11 @@ def main(): # Get samples names and number of samples in VCF - sample_names = extract_sample_names(filename) + if Path(filename).exists(): + sample_names = extract_sample_names(filename) + else: + print("\nInput VCF file not found, please verify the provided path") + sys.exit() num_samples = len(sample_names) if num_samples == 0: print("\nSample names not found in VCF, your file may be corrupt or missing the header.\n") @@ -214,14 +238,28 @@ def main(): min_samples_locus = min(num_samples, min_samples_locus) # Output filename will be the same as input file, indicating the minimum of samples specified - if filename.endswith(".gz"): - outfile = filename.replace(".vcf.gz",".min"+str(min_samples_locus)) - else: - outfile = filename.replace(".vcf",".min"+str(min_samples_locus)) - # We need to create an intermediate file to hold the sequence data vertically and then transpose + if not prefix: + parts = Path(filename).name.split(".") + prefix = [] + for p in parts: + if p.lower() == "vcf": + break + else: + prefix.append(p) + prefix = ".".join(prefix) + prefix += ".min" + str(min_samples_locus) + + # Check if outfolder exists, create it if it doesn't + if not Path(folder).exists(): + Path(folder).mkdir(parents=True) + + outfile = str(Path(folder, prefix)) + + # We need to create an intermediate file to hold the sequence data vertically and then transpose # it to create the matrices if fasta or nexus or not phylipdisable: temporal = open(outfile+".tmp", "w") + # If binary NEXUS is selected also create a separate temporal if nexusbin: temporalbin = open(outfile+".bin.tmp", "w") @@ -230,7 +268,7 @@ def main(): ########################## # PROCESS GENOTYPES IN VCF - if filename.endswith(".gz"): + if filename.lower().endswith(".gz"): opener = gzip.open else: opener = open @@ -261,7 +299,7 @@ def main(): if snp_num % 500000 == 0: print("{:d} genotypes processed.".format(snp_num)) if is_anomalous(record, num_samples): - print("Skipped potentially malformed line: {}".format(line)) + print("Skipping malformed line:\n{}".format(line)) continue else: # Check if the SNP has the minimum number of samples required @@ -276,19 +314,25 @@ def main(): snp_accepted += 1 # If nucleotide matrices are requested if fasta or nexus or not phylipdisable: + # Uncomment for debugging + # print(record) # Transform VCF record into an alignment column site_tmp = get_matrix_column(record, num_samples, resolve_IUPAC) # Uncomment for debugging # print(site_tmp) # Write entire row of single nucleotide genotypes to temp file - temporal.write(site_tmp+"\n") + if site_tmp == "malformed": + print("Skipping malformed line:\n{}".format(line)) + continue + else: + temporal.write(site_tmp+"\n") # Write binary NEXUS for SNAPP if requested if nexusbin: # Check that the SNP only has two alleles if len(record[4]) == 1: # Add to running sum of biallelic SNPs snp_biallelic += 1 - # Translate genotype into 0 for homozygous REF, 1 for + # Translate genotype into 0 for homozygous REF, 1 for # heterozygous, and 2 for homozygous ALT binsite_tmp = get_matrix_column_bin(record, num_samples) # Write entire row to temporary file @@ -426,21 +470,26 @@ def main(): print("Sample {:d} of {:d}, '{}', added to the binary matrix.".format( s+1, len(sample_names), sample_names[s])) + print() if not phylipdisable: + print("PHYLIP matrix saved to: " + outfile+".phy") output_phy.close() if fasta: + print("FASTA matrix saved to: " + outfile+".fasta") output_fas.close() if nexus: output_nex.write(";\nEND;\n") + print("NEXUS matrix saved to: " + outfile+".nex") output_nex.close() if nexusbin: output_nexbin.write(";\nEND;\n") + print("BINARY NEXUS matrix saved to: " + outfile+".bin.nex") output_nexbin.close() if fasta or nexus or not phylipdisable: - os.remove(outfile+".tmp") + Path(outfile+".tmp").unlink() if nexusbin: - os.remove(outfile+".bin.tmp") + Path(outfile+".bin.tmp").unlink() print( "\nDone!\n")