diff --git a/chew/data/GRCh37_sites.bed.gz b/chew/data/GRCh37_sites.bed.gz index 0d006b8..e4fd174 100644 Binary files a/chew/data/GRCh37_sites.bed.gz and b/chew/data/GRCh37_sites.bed.gz differ diff --git a/chew/data/GRCh37_sitesX.bed.gz b/chew/data/GRCh37_sitesX.bed.gz index 5320024..8705564 100644 Binary files a/chew/data/GRCh37_sitesX.bed.gz and b/chew/data/GRCh37_sitesX.bed.gz differ diff --git a/chew/data/GRCh38_sites.bed.gz b/chew/data/GRCh38_sites.bed.gz index 323f38a..c5dddbc 100644 Binary files a/chew/data/GRCh38_sites.bed.gz and b/chew/data/GRCh38_sites.bed.gz differ diff --git a/chew/data/GRCh38_sitesX.bed.gz b/chew/data/GRCh38_sitesX.bed.gz index 555372b..b879b60 100644 Binary files a/chew/data/GRCh38_sitesX.bed.gz and b/chew/data/GRCh38_sitesX.bed.gz differ diff --git a/misc/sites/README.md b/misc/sites/README.md new file mode 100644 index 0000000..eeacd03 --- /dev/null +++ b/misc/sites/README.md @@ -0,0 +1,32 @@ +# NGS Chew Sites Related Computation + +This directory contains the documentation on the `ngs-chew` sites are computed and verified as well as the generation of the PCA projection. + +## Site Selection + +The details can be found in the `select-auto` and `select-xy` directories. +To rerun it, obtain dbSNP GRCh37/GRCh38 files and `pip -r install /requirements/misc.txt`. +Then run the following: + +``` +PATH_DBSNP37=path/to/dbsnp-b151-grch37.vcf.gz \ +PATH_DBSNP38=path/to/dbsnp-b151-grch38.vcf.gz \ + snakemake -c -p work/label/grch37.common.bed +``` + +## Autosomal Site Selection + +We base the selection of our autosomal sites on the GRCh37 sites from [peddy](https://github.com/brentp/peddy). +However, we limit the sites to those that have an RSID and are also in GRCh38. +The sites are in the order of the rsid. +This allows to re-use the fingerprints for both releases with only small limitations. + +## Gonosomal Site Selection + +See [this issue](https://github.com/bihealth/ngs-chew/issues/20) for the original selection of GRCh37 sites. +We use a similar approach as for autosomal sites. +However, these sites are only used for prediction of gonomosomal karyotypes and not fingerprints. + +## Population Group Analysis + +The folder `pca` contains a Snakemake workflow that infers population groups / clusters from the thousand genotypes data at our selected sites. diff --git a/misc/sites/pca/.gitignore b/misc/sites/pca/.gitignore new file mode 100644 index 0000000..38e3a4a --- /dev/null +++ b/misc/sites/pca/.gitignore @@ -0,0 +1,6 @@ +/*.tbi +.snakemake/* +output/* +raw_data/* +work/* +by_id/* diff --git a/misc/sites/pca/PCA.ipynb b/misc/sites/pca/PCA.ipynb new file mode 100644 index 0000000..63b3f42 --- /dev/null +++ b/misc/sites/pca/PCA.ipynb @@ -0,0 +1,132 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "import matplotlib.pyplot as plt\n", + "import plotly.express as px\n", + "import polars as pl\n", + "import pickle\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "labels = pl.read_csv(\n", + " \"raw_data/g1k/founders.tsv\",\n", + " separator=\" \",\n", + " has_header=False,\n", + ")\n", + "labels.columns = (\"FAM\", \"NAME\", \"FATHER\", \"MOTHER\", \"SEX\", \"GROUP\", \"SUPERGROUP\")\n", + "groups = labels.get_column(\"GROUP\")\n", + "y = labels.get_column(\"SUPERGROUP\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_pl = pl.read_csv(\n", + " \"by_id/genotypes.tsv\",\n", + " separator=\"\\t\",\n", + " has_header=True,\n", + ")\n", + "\n", + "X = data_pl.drop(\"CHROM\", \"POS\", \"ID\").fill_null(0).to_numpy()\n", + "#data_np.transpose()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca = PCA(n_components=2)\n", + "pca.fit(X.transpose())\n", + "X_reduced = pca.transform(X.transpose())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('pca.pkl', 'wb') as pickle_file:\n", + " pickle.dump(pca, pickle_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.scatter(X_reduced, x=0, y=1, color=y)\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rf = RandomForestClassifier()\n", + "X_train, X_test, y_train, y_test = train_test_split(X_reduced, y)\n", + "rf.fit(X_train, y_train)\n", + "rf.predict_proba(X_test)\n", + "pca.transform(X_test.transpose())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rf.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ngs-chew", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/misc/sites/pca/Snakefile b/misc/sites/pca/Snakefile new file mode 100644 index 0000000..dde37b4 --- /dev/null +++ b/misc/sites/pca/Snakefile @@ -0,0 +1,106 @@ +rule obtain_calls: + output: + vcf="raw_data/g1k/genotypes.vcf", + params: + bed="../../../chew/data/GRCh38_sites.bed.gz", + threads: 100 + shell: + r""" + set -x + + url() + {{ + echo http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr$1.recalibrated_variants.vcf.gz + }} + export -f url + + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + tabix --only-header $(url 1) \ + > {output.vcf} + + for i in {{1..22}}; do + zgrep "^$i\s" {params.bed} \ + | sed -e 's/^/chr/' \ + | sort -k2,2n \ + > $TMPDIR/chr.$i.bed + split -l 10 $TMPDIR/chr.$i.bed $TMPDIR/chr.$i.bed- + done + + parallel -k -j {threads} 'i=$(basename {{}} | cut -d . -f 2); tabix -R {{}} $(url $i)' ::: $TMPDIR/chr.*.bed-* \ + >> {output.vcf} + """ + + +rule obtain_pedigrees: + output: + txt="raw_data/g1k/20130606_g1k_3202_samples_ped_population.tsv", + txt_founders="raw_data/g1k/founders.tsv", + shell: + r""" + wget -O {output.txt} \ + http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/20130606_g1k_3202_samples_ped_population.txt + + awk -F ' ' ' + ((NR > 1) && ($3 == 0) && ($4 == 0))' \ + {output.txt} \ + > {output.txt_founders} + """ + + +rule vcf_to_tsv: + input: + # vcf="raw_data/g1k/genotypes.vcf", + txt_founders="raw_data/g1k/founders.tsv", + output: + tsv="raw_data/g1k/genotypes.tsv", + shell: + r""" + ( + echo -en "CHROM\tPOS\tREF\tALT\t"; \ + cut -d ' ' -f 2 raw_data/g1k/founders.tsv | tr '\n' '\t' | sed -e 's/\t$//g'; \ + echo; \ + ) > {output.tsv} + + set +o pipefail + bcftools +missing2ref raw_data/g1k/genotypes.vcf \ + | bcftools view -S <(cut -d ' ' -f 2 raw_data/g1k/founders.tsv) -i '(type="snp")' \ + | bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' - \ + | sed -e 's|[2-9]/|1/|g' \ + | sed -e 's|/[2-9]|/1|g' \ + | sed -e 's|,.||g' \ + | sed -e 's|,.||g' \ + | sed -e 's|,.||g' \ + | sed -e 's|0/0|0|g' \ + | sed -e 's|0/1|1|g' \ + | sed -e 's|1/0|1|g' \ + | sed -e 's|1/1|2|g' \ + >> {output.tsv} + """ + +rule join_tsv: + input: + tsv="raw_data/g1k/genotypes.tsv", + output: + by_id_tsv="by_id/ids.tsv", + tsv="by_id/genotypes.tsv", + params: + bed="../../../chew/data/GRCh38_sites.bed.gz", + shell: + r""" + echo -e "CHROM_\tPOS0_\tPOS_\tID" \ + > {output.by_id_tsv} + zcat {params.bed} \ + | sed -e 's/^/chr/' \ + >> {output.by_id_tsv} + cp {output.by_id_tsv} {output.by_id_tsv}.bak + + qsv join --delimiter $'\t' --left \ + CHROM_,POS_ {output.by_id_tsv} \ + CHROM,POS {input.tsv} \ + | qsv select '!CHROM,POS0_,POS,REF,ALT' - \ + | sed -e 's/CHROM_/CHROM/g' -e 's/POS_/POS/g' \ + | tr , '\t' \ + > {output.tsv} + """ diff --git a/misc/sites/select-auto/.gitignore b/misc/sites/select-auto/.gitignore new file mode 100644 index 0000000..f422783 --- /dev/null +++ b/misc/sites/select-auto/.gitignore @@ -0,0 +1,4 @@ +.snakemake/* +output/* +raw_data/* +work/* diff --git a/misc/sites/select-auto/Snakefile b/misc/sites/select-auto/Snakefile new file mode 100644 index 0000000..d9655c9 --- /dev/null +++ b/misc/sites/select-auto/Snakefile @@ -0,0 +1,218 @@ +envvars: + "PATH_DBSNP37", # GRCh37 dbSNP VCF + "PATH_DBSNP38", # GRCh38 dbSNP VCF + + +rule default: + input: + "output/common_ids.txt", + "output/grch37.common.bed", + "output/grch38.common.bed", + + +rule raw_data_peddy: + output: + "raw_data/peddy/{filename}.sites", + shell: + """ + wget -O {output} https://github.com/brentp/peddy/raw/master/peddy/{wildcards.filename}.sites + """ + + +rule sites_to_bed: + input: + "raw_data/peddy/{filename}.sites", + output: + bed_header="raw_data/peddy/{filename}.header.bed", + bed_noheader="raw_data/peddy/{filename}.bed", + shell: + r""" + echo -e "CHROM\tBEGIN\tEND\tREF\tALT" \ + > {output.bed_header} + + awk -F : ' + BEGIN {{ + OFS="\t"; + }} + {{ + print $1, $2 - 1, $2, $3, $4 + }} + ' {input} \ + | LC_ALL=C sort -k1,1V -k2,2n \ + | tee -a {output.bed_header} \ + > {output.bed_noheader} + """ + + +rule extract_dbsnp: + input: + bed="raw_data/peddy/{filename}.bed", + output: + vcf="work/dbsnp/{filename}.vcf", + params: + path_dbsnp=os.environ["PATH_DBSNP37"], + shell: + r""" + set -x + + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + tabix --only-header {params.path_dbsnp} \ + > $TMPDIR/dbsnp.vcf + (tabix -R {input.bed} {params.path_dbsnp} || true) \ + | sort -k1,1V -k2,2n \ + >> $TMPDIR/dbsnp.vcf + bgzip $TMPDIR/dbsnp.vcf + tabix -f $TMPDIR/dbsnp.vcf.gz + + bcftools sort -T $TMPDIR/bcftools.XXXXXX $TMPDIR/dbsnp.vcf.gz \ + | bcftools norm -d both \ + | bcftools norm -m - \ + > {output.vcf} + """ + + +rule dbsnp_to_bed: + input: + vcf="work/dbsnp/{filename}.vcf", + output: + bed="work/dbsnp/{filename}.bed", + shell: + r""" + echo -e "CHROM_\tBEGIN_\tEND_\tREF_\tALT_\tID" \ + > {output.bed} + bcftools query \ + -f '%CHROM\t%POS0\t%POS\t%REF\t%ALT\t%ID\n' \ + {input.vcf} \ + >> {output.bed} + """ + + +rule label_sites: + input: + bed_sites="raw_data/peddy/GRCH37.header.bed", + bed_dbsnp="work/dbsnp/GRCH37.bed", + output: + bed="work/label/grch37.bed", + shell: + r""" + set -x + + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + qsv join --delimiter $'\t' --left \ + CHROM,BEGIN,REF,ALT {input.bed_sites} \ + CHROM_,BEGIN_,REF_,ALT_ {input.bed_dbsnp} \ + | qsv select CHROM,BEGIN,END,REF,ALT,ID \ + | tr ',' '\t' \ + > {output.bed} + """ + + +rule extract_dbsnp38_vcf: + input: + bed="work/label/grch37.bed", + output: + vcf="work/dbsnp/GRCH38.vcf", + vcf_gz="work/dbsnp/GRCH38.vcf.gz", + tbi="work/dbsnp/GRCH38.vcf.gz.tbi", + params: + path_dbsnp=os.environ["PATH_DBSNP38"], + threads: 8 + shell: + r""" + set -x + + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + tail -n +2 {input.bed} \ + | cut -f 6 \ + | sed -e 's/^/\t/' -e 's/$/\t/' \ + > $TMPDIR/rsids.txt + + tabix --only-header {params.path_dbsnp} \ + > {output.vcf} + parallel -k -j {threads} \ + 'tabix {params.path_dbsnp} {{}} | (fgrep -f $TMPDIR/rsids.txt || true)' ::: $(tabix -l {params.path_dbsnp}) \ + >> {output.vcf} + + bgzip -c {output.vcf} \ + > {output.vcf_gz} + tabix -f {output.vcf_gz} + """ + + +rule convert_dbsnp38_vcf_to_bed: + input: + vcf="work/dbsnp/GRCH38.vcf.gz", + output: + bed="work/label/grch38.bed", + shell: + r""" + set -x + + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + set +o pipefail + + echo -e "CHROM\tBEGIN\tEND\tREF\tALT\tID" \ + > {output.bed} + bcftools sort -T $TMPDIR/sort.1.XXXXXX {input.vcf} \ + | bcftools norm -m -any \ + | bcftools query \ + -f '%CHROM\t%POS0\t%POS\t%REF\t%ALT\t%ID\n' \ + | sort -k1,1V -k2,2n \ + | uniq \ + >> {output.bed} + """ + + +rule extract_common_dbsnp_entries: + input: + grch37="work/label/grch37.bed", + grch38="work/label/grch38.bed", + output: + common_ids="output/common_ids.txt", + grch37="output/grch37.common.bed", + grch38="output/grch38.common.bed", + shell: + r""" + set -x + + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + echo -e "REF_\tID_" \ + >{output.common_ids} + qsv join --delimiter $'\t' \ + REF,ID {input.grch37} \ + REF,ID {input.grch38} \ + | qsv select REF,ID \ + | tr ',' '\t' \ + | uniq \ + | tail -n +2 \ + >> {output.common_ids} + + qsv join --delimiter $'\t' \ + REF,ID {input.grch37} \ + REF_,ID_ {output.common_ids} \ + | qsv select CHROM,BEGIN,END,ID \ + | tr ',' '\t' \ + | tail -n +2 \ + | LC_ALL=C sort -k4,4 \ + > {output.grch37} + + qsv join --delimiter $'\t' \ + REF,ID {input.grch38} \ + REF_,ID_ {output.common_ids} \ + | qsv select CHROM,BEGIN,END,ID \ + | tr ',' '\t' \ + | tail -n +2 \ + | uniq \ + | LC_ALL=C sort -k4,4 \ + > {output.grch38} + """ diff --git a/misc/sites/select-xy/.gitattributes b/misc/sites/select-xy/.gitattributes new file mode 100644 index 0000000..cbf6593 --- /dev/null +++ b/misc/sites/select-xy/.gitattributes @@ -0,0 +1 @@ +data/Agilent_V6.bed filter=lfs diff=lfs merge=lfs -text diff --git a/misc/sites/select-xy/.gitignore b/misc/sites/select-xy/.gitignore new file mode 100644 index 0000000..f422783 --- /dev/null +++ b/misc/sites/select-xy/.gitignore @@ -0,0 +1,4 @@ +.snakemake/* +output/* +raw_data/* +work/* diff --git a/misc/sites/select-xy/Snakefile b/misc/sites/select-xy/Snakefile new file mode 100644 index 0000000..45f3e9b --- /dev/null +++ b/misc/sites/select-xy/Snakefile @@ -0,0 +1,145 @@ +envvars: + "PATH_DBSNP37", # GRCh37 dbSNP VCF + "PATH_DBSNP38", # GRCh38 dbSNP VCF + + +rule default: + input: + "output/common_ids.txt", + "output/grch37.common.bed", + "output/grch38.common.bed", + + +rule select_sites: + output: + "raw_data/selected/GRCh37_sitesX.bed", + params: + path_dbsnp=os.environ["PATH_DBSNP37"], + shell: + r""" + set -x + + (tabix \ + --print-header \ + --regions data/Agilent_V6.bed \ + {params.path_dbsnp} \ + || true) \ + | bcftools view -M 2 -i '(CAF != ".")' \ + | bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%ID\t%CAF\n' \ + | tr ',' '\t' \ + | awk -F $'\t' 'BEGIN {{ + OFS=FS; + print "CHROM_", "BEGIN_", "END_", "REF_", "ALT_", "ID" + }} + ($6 <= 0.95) {{ + print $1, $2 - 1, $2, $3, $4, $5 + }}' \ + > {output} + """ + + +rule extract_dbsnp38_vcf: + input: + bed="raw_data/selected/GRCh37_sitesX.bed", + output: + vcf="work/dbsnp/GRCH38.vcf", + vcf_gz="work/dbsnp/GRCH38.vcf.gz", + tbi="work/dbsnp/GRCH38.vcf.gz.tbi", + params: + path_dbsnp=os.environ["PATH_DBSNP38"], + threads: 8 + shell: + r""" + set -x + + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + tail -n +2 {input.bed} \ + | cut -f 6 \ + | sed -e 's/^/\t/' -e 's/$/\t/' \ + > $TMPDIR/rsids.txt + + tabix --only-header {params.path_dbsnp} \ + > {output.vcf} + parallel -k -j {threads} \ + 'tabix {params.path_dbsnp} {{}} | (fgrep -f $TMPDIR/rsids.txt || true)' ::: X Y \ + >> {output.vcf} + + bgzip -c {output.vcf} \ + > {output.vcf_gz} + tabix -f {output.vcf_gz} + """ + + +rule convert_dbsnp38_vcf_to_bed: + input: + vcf="work/dbsnp/GRCH38.vcf.gz", + output: + bed="work/label/grch38.bed", + shell: + r""" + set -x + + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + set +o pipefail + + echo -e "CHROM\tBEGIN\tEND\tREF\tALT\tID" \ + > {output.bed} + bcftools sort -T $TMPDIR/sort.1.XXXXXX {input.vcf} \ + | bcftools norm -m -any \ + | bcftools query \ + -f '%CHROM\t%POS0\t%POS\t%REF\t%ALT\t%ID\n' \ + | sort -k1,1V -k2,2n \ + | uniq \ + >> {output.bed} + """ + + +rule extract_common_dbsnp_entries: + input: + grch37="raw_data/selected/GRCh37_sitesX.bed", + grch38="work/label/grch38.bed", + output: + common_ids="output/common_ids.txt", + grch37="output/grch37.common.bed", + grch38="output/grch38.common.bed", + shell: + r""" + set -x + + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + echo -e "REF_\tID_" \ + >{output.common_ids} + qsv join --delimiter $'\t' \ + REF_,ID {input.grch37} \ + REF,ID {input.grch38} \ + | qsv select REF,ID \ + | tr ',' '\t' \ + | uniq \ + | tail -n +2 \ + >> {output.common_ids} + + qsv join --delimiter $'\t' \ + REF_,ID {input.grch37} \ + REF_,ID_ {output.common_ids} \ + | qsv select CHROM_,BEGIN_,END_,ID \ + | tr ',' '\t' \ + | tail -n +2 \ + | LC_ALL=C sort -k4,4 \ + > {output.grch37} + + qsv join --delimiter $'\t' \ + REF,ID {input.grch38} \ + REF_,ID_ {output.common_ids} \ + | qsv select CHROM,BEGIN,END,ID \ + | tr ',' '\t' \ + | tail -n +2 \ + | uniq \ + | LC_ALL=C sort -k4,4 \ + > {output.grch38} + """ diff --git a/misc/sites/select-xy/data/Agilent_V6.bed b/misc/sites/select-xy/data/Agilent_V6.bed new file mode 100644 index 0000000..9e0f9f9 --- /dev/null +++ b/misc/sites/select-xy/data/Agilent_V6.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e476a794fba6ef5f5ce0212b87628877d3006a34552acbb0a33e4386910baa3 +size 176404 diff --git a/requirements/misc.txt b/requirements/misc.txt new file mode 100644 index 0000000..f3fe7c3 --- /dev/null +++ b/requirements/misc.txt @@ -0,0 +1,9 @@ +-r dev.txt + +snakemake +snakefmt +jupyterlab +scikit-learn +polars +matplotlib +plotly-express