Skip to content

Commit

Permalink
feat: updating sites files for stable RSIDs
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Jan 5, 2024
1 parent c5497aa commit d5d668d
Show file tree
Hide file tree
Showing 15 changed files with 660 additions and 0 deletions.
Binary file modified chew/data/GRCh37_sites.bed.gz
Binary file not shown.
Binary file modified chew/data/GRCh37_sitesX.bed.gz
Binary file not shown.
Binary file modified chew/data/GRCh38_sites.bed.gz
Binary file not shown.
Binary file modified chew/data/GRCh38_sitesX.bed.gz
Binary file not shown.
32 changes: 32 additions & 0 deletions misc/sites/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# NGS Chew Sites Related Computation

This directory contains the documentation on how the `ngs-chew` sites are computed and verified, as well as the generation of the PCA projection.

## Site Selection

The details can be found in the `select-auto` and `select-xy` directories.
To rerun it, obtain dbSNP GRCh37/GRCh38 files and run `pip install -r requirements/misc.txt`.
Then run the following:

```
PATH_DBSNP37=path/to/dbsnp-b151-grch37.vcf.gz \
PATH_DBSNP38=path/to/dbsnp-b151-grch38.vcf.gz \
snakemake -c -p work/label/grch37.common.bed
```

## Autosomal Site Selection

We base the selection of our autosomal sites on the GRCh37 sites from [peddy](https://github.com/brentp/peddy).
However, we limit the sites to those that have an RSID and are also in GRCh38.
The sites are ordered by their rsID.
This allows to re-use the fingerprints for both releases with only small limitations.

## Gonosomal Site Selection

See [this issue](https://github.com/bihealth/ngs-chew/issues/20) for the original selection of GRCh37 sites.
We use a similar approach as for autosomal sites.
However, these sites are only used for the prediction of gonosomal karyotypes and not for fingerprints.

## Population Group Analysis

The folder `pca` contains a Snakemake workflow that infers population groups / clusters from the 1000 Genomes genotype data at our selected sites.
6 changes: 6 additions & 0 deletions misc/sites/pca/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/*.tbi
.snakemake/*
output/*
raw_data/*
work/*
by_id/*
132 changes: 132 additions & 0 deletions misc/sites/pca/PCA.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"import matplotlib.pyplot as plt\n",
"import plotly.express as px\n",
"import polars as pl\n",
"import pickle\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"labels = pl.read_csv(\n",
" \"raw_data/g1k/founders.tsv\",\n",
" separator=\" \",\n",
" has_header=False,\n",
")\n",
"labels.columns = (\"FAM\", \"NAME\", \"FATHER\", \"MOTHER\", \"SEX\", \"GROUP\", \"SUPERGROUP\")\n",
"groups = labels.get_column(\"GROUP\")\n",
"y = labels.get_column(\"SUPERGROUP\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_pl = pl.read_csv(\n",
" \"by_id/genotypes.tsv\",\n",
" separator=\"\\t\",\n",
" has_header=True,\n",
")\n",
"\n",
"X = data_pl.drop(\"CHROM\", \"POS\", \"ID\").fill_null(0).to_numpy()\n",
"#data_np.transpose()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pca = PCA(n_components=2)\n",
"pca.fit(X.transpose())\n",
"X_reduced = pca.transform(X.transpose())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open('pca.pkl', 'wb') as pickle_file:\n",
" pickle.dump(pca, pickle_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = px.scatter(X_reduced, x=0, y=1, color=y)\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rf = RandomForestClassifier()\n",
"X_train, X_test, y_train, y_test = train_test_split(X_reduced, y)\n",
"rf.fit(X_train, y_train)\n",
"rf.predict_proba(X_test)\n",
"pca.transform(X_test.transpose())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rf.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "ngs-chew",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
106 changes: 106 additions & 0 deletions misc/sites/pca/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Fetch genotype calls at the GRCh38 fingerprint sites from the 1000 Genomes
# high-coverage (3202-sample NYGC) per-chromosome VCFs, autosomes only.
# The sites BED is split into 10-line chunks so many small remote `tabix -R`
# queries can run concurrently via GNU parallel.
rule obtain_calls:
    output:
        vcf="raw_data/g1k/genotypes.vcf",
    params:
        # Fingerprint sites; contigs lack the "chr" prefix and are prefixed
        # below (`sed 's/^/chr/'`) to match the remote VCF naming.
        bed="../../../chew/data/GRCh38_sites.bed.gz",
    # Drives `parallel -j {threads}` below; mostly network-bound remote
    # tabix queries, hence the high count.
    threads: 100
    shell:
        r"""
        set -x
        url()
        {{
            echo http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr$1.recalibrated_variants.vcf.gz
        }}
        export -f url
        export TMPDIR=$(mktemp -d)
        trap "rm -rf $TMPDIR" EXIT
        tabix --only-header $(url 1) \
        > {output.vcf}
        for i in {{1..22}}; do
            zgrep "^$i\s" {params.bed} \
            | sed -e 's/^/chr/' \
            | sort -k2,2n \
            > $TMPDIR/chr.$i.bed
            split -l 10 $TMPDIR/chr.$i.bed $TMPDIR/chr.$i.bed-
        done
        parallel -k -j {threads} 'i=$(basename {{}} | cut -d . -f 2); tabix -R {{}} $(url $i)' ::: $TMPDIR/chr.*.bed-* \
        >> {output.vcf}
        """


# Download the 1000 Genomes 3202-sample pedigree file and reduce it to the
# founders, i.e. rows where both the father ($3) and mother ($4) columns
# are 0 (header row NR==1 is skipped).
rule obtain_pedigrees:
    output:
        txt="raw_data/g1k/20130606_g1k_3202_samples_ped_population.tsv",
        txt_founders="raw_data/g1k/founders.tsv",
    shell:
        r"""
        wget -O {output.txt} \
            http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/20130606_g1k_3202_samples_ped_population.txt
        awk -F ' ' '
        ((NR > 1) && ($3 == 0) && ($4 == 0))' \
            {output.txt} \
            > {output.txt_founders}
        """


rule vcf_to_tsv:
input:
# vcf="raw_data/g1k/genotypes.vcf",
txt_founders="raw_data/g1k/founders.tsv",
output:
tsv="raw_data/g1k/genotypes.tsv",
shell:
r"""
(
echo -en "CHROM\tPOS\tREF\tALT\t"; \
cut -d ' ' -f 2 raw_data/g1k/founders.tsv | tr '\n' '\t' | sed -e 's/\t$//g'; \
echo; \
) > {output.tsv}
set +o pipefail
bcftools +missing2ref raw_data/g1k/genotypes.vcf \
| bcftools view -S <(cut -d ' ' -f 2 raw_data/g1k/founders.tsv) -i '(type="snp")' \
| bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' - \
| sed -e 's|[2-9]/|1/|g' \
| sed -e 's|/[2-9]|/1|g' \
| sed -e 's|,.||g' \
| sed -e 's|,.||g' \
| sed -e 's|,.||g' \
| sed -e 's|0/0|0|g' \
| sed -e 's|0/1|1|g' \
| sed -e 's|1/0|1|g' \
| sed -e 's|1/1|2|g' \
>> {output.tsv}
"""

# Attach rsIDs from the sites BED to the genotype matrix by left-joining on
# (CHROM, POS).  The BED columns are first written with trailing-underscore
# names (CHROM_, POS_) so they survive the join next to the matrix's own
# CHROM/POS columns; after the join the matrix-side key columns plus
# POS0_/REF/ALT are dropped and the underscored names are renamed back.
rule join_tsv:
    input:
        tsv="raw_data/g1k/genotypes.tsv",
    output:
        # Sites with rsIDs ("chr"-prefixed), used as the left join side.
        by_id_tsv="by_id/ids.tsv",
        # Final genotype matrix keyed by rsID.
        tsv="by_id/genotypes.tsv",
    params:
        bed="../../../chew/data/GRCh38_sites.bed.gz",
    shell:
        r"""
        echo -e "CHROM_\tPOS0_\tPOS_\tID" \
        > {output.by_id_tsv}
        zcat {params.bed} \
        | sed -e 's/^/chr/' \
        >> {output.by_id_tsv}
        cp {output.by_id_tsv} {output.by_id_tsv}.bak
        qsv join --delimiter $'\t' --left \
            CHROM_,POS_ {output.by_id_tsv} \
            CHROM,POS {input.tsv} \
        | qsv select '!CHROM,POS0_,POS,REF,ALT' - \
        | sed -e 's/CHROM_/CHROM/g' -e 's/POS_/POS/g' \
        | tr , '\t' \
        > {output.tsv}
        """
4 changes: 4 additions & 0 deletions misc/sites/select-auto/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.snakemake/*
output/*
raw_data/*
work/*
Loading

0 comments on commit d5d668d

Please sign in to comment.