Skip to content

Commit

Permalink
feat: updating sites files for stable RSIDs
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Jan 5, 2024
1 parent c5497aa commit d5d668d
Show file tree
Hide file tree
Showing 15 changed files with 660 additions and 0 deletions.
Binary file modified chew/data/GRCh37_sites.bed.gz
Binary file not shown.
Binary file modified chew/data/GRCh37_sitesX.bed.gz
Binary file not shown.
Binary file modified chew/data/GRCh38_sites.bed.gz
Binary file not shown.
Binary file modified chew/data/GRCh38_sitesX.bed.gz
Binary file not shown.
32 changes: 32 additions & 0 deletions misc/sites/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# NGS Chew Sites Related Computation

This directory contains the documentation on how the `ngs-chew` sites are computed and verified, as well as the generation of the PCA projection.

## Site Selection

The details can be found in the `select-auto` and `select-xy` directories.
To rerun it, obtain dbSNP GRCh37/GRCh38 files and run `pip install -r requirements/misc.txt`.
Then run the following:

```
PATH_DBSNP37=path/to/dbsnp-b151-grch37.vcf.gz \
PATH_DBSNP38=path/to/dbsnp-b151-grch38.vcf.gz \
snakemake -c -p work/label/grch37.common.bed
```

## Autosomal Site Selection

We base the selection of our autosomal sites on the GRCh37 sites from [peddy](https://github.com/brentp/peddy).
However, we limit the sites to those that have an RSID and are also in GRCh38.
The sites are ordered by their rsID.
This allows to re-use the fingerprints for both releases with only small limitations.

## Gonosomal Site Selection

See [this issue](https://github.com/bihealth/ngs-chew/issues/20) for the original selection of GRCh37 sites.
We use a similar approach as for autosomal sites.
However, these sites are only used for the prediction of gonosomal karyotypes and not for fingerprints.

## Population Group Analysis

The folder `pca` contains a Snakemake workflow that infers population groups / clusters from the 1000 Genomes genotype data at our selected sites.
6 changes: 6 additions & 0 deletions misc/sites/pca/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/*.tbi
.snakemake/*
output/*
raw_data/*
work/*
by_id/*
132 changes: 132 additions & 0 deletions misc/sites/pca/PCA.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"import matplotlib.pyplot as plt\n",
"import plotly.express as px\n",
"import polars as pl\n",
"import pickle\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"labels = pl.read_csv(\n",
" \"raw_data/g1k/founders.tsv\",\n",
" separator=\" \",\n",
" has_header=False,\n",
")\n",
"labels.columns = (\"FAM\", \"NAME\", \"FATHER\", \"MOTHER\", \"SEX\", \"GROUP\", \"SUPERGROUP\")\n",
"groups = labels.get_column(\"GROUP\")\n",
"y = labels.get_column(\"SUPERGROUP\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_pl = pl.read_csv(\n",
" \"by_id/genotypes.tsv\",\n",
" separator=\"\\t\",\n",
" has_header=True,\n",
")\n",
"\n",
"X = data_pl.drop(\"CHROM\", \"POS\", \"ID\").fill_null(0).to_numpy()\n",
"#data_np.transpose()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pca = PCA(n_components=2)\n",
"pca.fit(X.transpose())\n",
"X_reduced = pca.transform(X.transpose())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open('pca.pkl', 'wb') as pickle_file:\n",
" pickle.dump(pca, pickle_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = px.scatter(X_reduced, x=0, y=1, color=y)\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rf = RandomForestClassifier()\n",
"X_train, X_test, y_train, y_test = train_test_split(X_reduced, y)\n",
"rf.fit(X_train, y_train)\n",
"rf.predict_proba(X_test)\n",
"pca.transform(X_test.transpose())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rf.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "ngs-chew",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
106 changes: 106 additions & 0 deletions misc/sites/pca/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Fetch genotype calls at the GRCh38 fingerprint sites from the 1000 Genomes
# high-coverage (3202-sample NYGC) per-chromosome VCFs, autosomes only.
# The sites BED is split into 10-line chunks so many small remote `tabix -R`
# queries can run concurrently via GNU parallel.
rule obtain_calls:
    output:
        vcf="raw_data/g1k/genotypes.vcf",
    params:
        # Fingerprint sites; contigs lack the "chr" prefix and are prefixed
        # below (`sed 's/^/chr/'`) to match the remote VCF naming.
        bed="../../../chew/data/GRCh38_sites.bed.gz",
    # Drives `parallel -j {threads}` below; mostly network-bound remote
    # tabix queries, hence the high count.
    threads: 100
    shell:
        r"""
        set -x
        url()
        {{
            echo http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr$1.recalibrated_variants.vcf.gz
        }}
        export -f url
        export TMPDIR=$(mktemp -d)
        trap "rm -rf $TMPDIR" EXIT
        tabix --only-header $(url 1) \
        > {output.vcf}
        for i in {{1..22}}; do
            zgrep "^$i\s" {params.bed} \
            | sed -e 's/^/chr/' \
            | sort -k2,2n \
            > $TMPDIR/chr.$i.bed
            split -l 10 $TMPDIR/chr.$i.bed $TMPDIR/chr.$i.bed-
        done
        parallel -k -j {threads} 'i=$(basename {{}} | cut -d . -f 2); tabix -R {{}} $(url $i)' ::: $TMPDIR/chr.*.bed-* \
        >> {output.vcf}
        """


# Download the 1000 Genomes 3202-sample pedigree file and reduce it to the
# founders, i.e. rows where both the father ($3) and mother ($4) columns
# are 0 (header row NR==1 is skipped).
rule obtain_pedigrees:
    output:
        txt="raw_data/g1k/20130606_g1k_3202_samples_ped_population.tsv",
        txt_founders="raw_data/g1k/founders.tsv",
    shell:
        r"""
        wget -O {output.txt} \
            http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/20130606_g1k_3202_samples_ped_population.txt
        awk -F ' ' '
        ((NR > 1) && ($3 == 0) && ($4 == 0))' \
            {output.txt} \
            > {output.txt_founders}
        """


rule vcf_to_tsv:
input:
# vcf="raw_data/g1k/genotypes.vcf",
txt_founders="raw_data/g1k/founders.tsv",
output:
tsv="raw_data/g1k/genotypes.tsv",
shell:
r"""
(
echo -en "CHROM\tPOS\tREF\tALT\t"; \
cut -d ' ' -f 2 raw_data/g1k/founders.tsv | tr '\n' '\t' | sed -e 's/\t$//g'; \
echo; \
) > {output.tsv}
set +o pipefail
bcftools +missing2ref raw_data/g1k/genotypes.vcf \
| bcftools view -S <(cut -d ' ' -f 2 raw_data/g1k/founders.tsv) -i '(type="snp")' \
| bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' - \
| sed -e 's|[2-9]/|1/|g' \
| sed -e 's|/[2-9]|/1|g' \
| sed -e 's|,.||g' \
| sed -e 's|,.||g' \
| sed -e 's|,.||g' \
| sed -e 's|0/0|0|g' \
| sed -e 's|0/1|1|g' \
| sed -e 's|1/0|1|g' \
| sed -e 's|1/1|2|g' \
>> {output.tsv}
"""

# Attach rsIDs from the sites BED to the genotype matrix by left-joining on
# (CHROM, POS).  The BED columns are first written with trailing-underscore
# names (CHROM_, POS_) so they survive the join next to the matrix's own
# CHROM/POS columns; after the join the matrix-side key columns plus
# POS0_/REF/ALT are dropped and the underscored names are renamed back.
rule join_tsv:
    input:
        tsv="raw_data/g1k/genotypes.tsv",
    output:
        # Sites with rsIDs ("chr"-prefixed), used as the left join side.
        by_id_tsv="by_id/ids.tsv",
        # Final genotype matrix keyed by rsID.
        tsv="by_id/genotypes.tsv",
    params:
        bed="../../../chew/data/GRCh38_sites.bed.gz",
    shell:
        r"""
        echo -e "CHROM_\tPOS0_\tPOS_\tID" \
        > {output.by_id_tsv}
        zcat {params.bed} \
        | sed -e 's/^/chr/' \
        >> {output.by_id_tsv}
        cp {output.by_id_tsv} {output.by_id_tsv}.bak
        qsv join --delimiter $'\t' --left \
            CHROM_,POS_ {output.by_id_tsv} \
            CHROM,POS {input.tsv} \
        | qsv select '!CHROM,POS0_,POS,REF,ALT' - \
        | sed -e 's/CHROM_/CHROM/g' -e 's/POS_/POS/g' \
        | tr , '\t' \
        > {output.tsv}
        """
4 changes: 4 additions & 0 deletions misc/sites/select-auto/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.snakemake/*
output/*
raw_data/*
work/*
Loading

0 comments on commit d5d668d

Please sign in to comment.