-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprep_input_coloc.R
103 lines (79 loc) · 3.55 KB
/
prep_input_coloc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
library(tidyverse)
library(readxl)
# functions
download_tbi <- function(ftp) {
ftptbi <- paste0(ftp, ".tbi")
idx_file <- file.path(getwd(), basename(ftptbi))
if (! file.exists(idx_file) ) download.file(ftptbi, dest = idx_file)
}
# eQTL catalogue tabix data
tabix_paths <-
"https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv" |>
read_tsv() |>
select(study = study_label, tissue = tissue_label, condition = condition_label,
qtl_group = sample_group, phenotype = quant_method, ftp_path)
imported_tabix_paths <-
"https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths_imported.tsv" |>
read_tsv() |>
select(study, tissue = tissue_label, condition = condition_label,
qtl_group, phenotype = quant_method, ftp_path)
paths_df <- bind_rows(tabix_paths, imported_tabix_paths) |>
filter(phenotype == "ge", study != "GTEx")
write_tsv(paths_df, "./data/coloc_inputs/eqtl_catalogue_paths.tsv")
# download index files
walk(paths_df$ftp_path, download_tbi)
# GWAS summ stats
gwas_stats <- read_excel("./data/gwas/TRAF1_region_summary_stats.xlsx")
# write bed for liftover
gwas_stats |>
select(start = pos_build36, snp = SNP) |>
mutate(chr = "chr9", end = start, start = start - 1) |>
select(chr, start, end, snp) |>
write_tsv("./data/gwas/TRAF1_region_hg18.bed", col_names = FALSE)
# read file after running liftover
gwas_stats_38 <- "./data/gwas/TRAF1_region_hg38.bed" |>
read_tsv(col_names = c("chr", "start", "end", "rsid")) |>
select(chr, pos = end, rsid) |>
left_join(gwas_stats, by = c("rsid" = "SNP"))
topsnp <- gwas_stats_38 |>
arrange(additive_pvalue) |>
slice(1) |>
pull(pos)
coords <- gwas_stats_38 |>
filter(between(pos, topsnp - 5e5, topsnp + 5e5)) |>
summarise(region = range(pos)) |>
pull(region)
region <- sprintf("%s:%d-%d", sub("chr", "", gwas_stats_38$chr[1]), coords[1], coords[2])
# Save region for eQTL catalogue query
write_lines(region, "./data/coloc_inputs/region.txt")
gwas_df <- gwas_stats_38 |>
filter(between(pos, coords[1], coords[2])) |>
select(rsid, pos, n_case, n_ctrl, or = additive_odds_ratio, pvalue = additive_pvalue) |>
mutate(log_or = log(or),
varbeta = (log_or^2) / qchisq(pvalue, df = 1, lower = FALSE),
type = "cc",
s = n_case/n_ctrl) |>
select(rsid, pos, log_or, varbeta, type, s)
gwas_min_df <- gwas_df |>
select(rsid, position = pos, gwas_beta = log_or, gwas_varbeta = varbeta)
# Write minimum GWAS dataset for coloc
write_tsv(gwas_min_df, "./data/coloc_inputs/gwas_data.tsv")
# Select genes
annotations <-
file.path("/lab-share/IM-Gutierrez-e2/Public/References/Annotations/hsapiens",
"gencode.v30.primary_assembly.annotation.gtf.gz") |>
read_tsv(comment = "#", col_types = "c-cii-c-c",
col_names = c("chr", "feature", "start", "end", "strand", "info"))
bed <- annotations |>
filter(chr == "chr9", feature == "gene") |>
mutate(tss = case_when(strand == "+" ~ start,
strand == "-" ~ end,
TRUE ~ NA_integer_)) |>
filter(between(tss, topsnp - 2.5e5, topsnp + 2.5e5)) |>
mutate(gene_id = str_extract(info, "(?<=gene_id\\s\")[^\"]+"),
gene_name = str_extract(info, "(?<=gene_name\\s\")[^\"]+"),
gene_id = sub("^(ENSG\\d+)\\.\\d+((_PAR_Y)?)$", "\\1\\2", gene_id)) |>
select(chr, start, end, gene_id, gene_name)
gene_ids <- select(bed, gene_id, gene_name)
# Save gene IDs for eQTL catalogue filtering
write_tsv(gene_ids, "./data/coloc_inputs/gene_annots.tsv")