-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_lm.R
136 lines (123 loc) · 4.73 KB
/
run_lm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#' Runs linear regression to estimate effect of polygenic risk SCORE (PRS) on gene expression
#' @description formula: Expr ~ PRS + Cov, where Cov = covariates
#'
#' @param expr Expression vector (numeric) with length = #samples
#' @param cov Regression covariates in form [cov x samples]
#' @param SCORE The PRS, not included in `cov`
#' @param method Choose between 'default' or 'two-stage' for lm() method (see desc. in support functions below)
#' @return A [1 x 8] vector output from an lm() like below:
#' ['intercept', 'beta', 'SE', 't_value', 'pval', 'beta.conf.low', 'beta.conf.high', 'corr.rho']
#'
#' @example using `pbapply::pblapply` to parallelize run_lm() over all genes
#' num.cores = 10
#' lm.res <-
#' pblapply(tx_expr, # Expression vector list for `pbapply::pblapply`
#' run_lm, # This function
#' cov = cov, # Covariate matrix, as desribed above
#' SCORE = SCORE, # PRS
#' method = 'two-stage',# Choose between 'default' or 'two-stage'
#' cl = num.cores) # Number of cores to parallelize over
#'
#' lm.res <- simplify2array(lm.res, higher=F)
#' rownames(lm.res) <-
#' c('intercept',
#' 'beta',
#' 'SE',
#' 't_value',
#' 'pval',
#' 'conf.low',
#' 'conf.high',
#' 'corr.rho')
#' colnames(lm.res) <- get.gene_id_stable(gene.ids)
#'
#' # Sort results by p-value
#' lm.res <- as.data.frame(t(lm.res))
#' lm_res.sort <- lm.res[order(lm.res$pval), ]
#'
#'
#' Author: Vamsee Pillalamarri
run_lm <- function(expr, cov, SCORE, method='default') {
res <- switch (method,
"default" = run_lm_default(expr, cov, SCORE),
"two-stage" = run_lm_two_stage(expr, cov, SCORE)
)
return(res)
}
# run_lm() support functions
run_lm_default <- function(expr, cov, SCORE) {
expr <- as.numeric(expr)
expr_cov <- cbind(SCORE, expr, cov)
# # Run lm() normal procedure
lm.fit <- lm(expr ~ ., data = expr_cov)
lm.fit.summary <- summary(lm.fit)
# print(lm.fit.summary)
# Get corr
cor_expr_score <-
cor(expr, SCORE)
# Capture p-val, etc.
lm.res_ <-
as.data.frame(t(coef(lm.fit.summary)['SCORE',]))
intercept <- coef(lm.fit)[1]
names(intercept) <- NULL
lm.res_ <-
cbind(data.frame(intercept),
lm.res_,
t(confint(lm.fit)['SCORE', ]),
cor_expr_score)
return(as.matrix(lm.res_))
}
run_lm_two_stage <- function(expr, cov, SCORE) {
expr <- as.numeric(expr)
expr_cov <- cbind(SCORE, expr, cov)
# Run lm with fully-adjusted 2 stage regression residual / adjustment procedure
# https://aeolister.wordpress.com/2016/07/21/regressing-out-a-covariate-is-problematic/
# https://stat.ethz.ch/pipermail/r-help/2005-April/068856.html
# Partly adjusted model issues: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3201714/
# model 1: lm(Expr ~ Covariates)
# model 2: lm(PRS ~ Covariates)
# model 3: either:
# lm(rank normalized residuals of model 1 ~ residuals of model 2) OR
# lm(residuals of model 1 ~ residuals of model 2)
# This captures the conditional effect of PRS on adjusted gene expression
lm.fit1 <- lm(expr ~ . - SCORE, data = expr_cov) # regress expr onto cov
lm.fit2 <- lm(SCORE ~ . - expr, data = expr_cov) # regress prs onto cov
resid.dat <- data.frame(expr=resid(lm.fit1),
SCORE=resid(lm.fit2)) # don't rank normalize adj. gene expression residuals
# resid.dat <- data.frame(expr=rankNorm(resid(lm.fit1)),
# SCORE=resid(lm.fit2)) # do rank normalize adj. gene expression residuals
lm.fit3 <- lm(expr ~ SCORE, data = resid.dat) # regress adjusted variables onto each other
lm.fit.3.summary <- summary(lm.fit3)
cor_expr_score <-
with(resid.dat, cor(expr, SCORE))
# Capture p-val, etc.
lm.res_ <-
as.data.frame(t(coef(lm.fit.3.summary)['SCORE',]))
intercept <- coef(lm.fit3)[1]
names(intercept) <- NULL
lm.res_ <-
cbind(data.frame(intercept),
lm.res_,
t(confint(lm.fit3)['SCORE', ]),
cor_expr_score)
return(as.matrix(lm.res_))
}
# OLD Code ----
# # Run lm with partly-adjusted 2-stage "regressing out" procedure (potentially incorrect)
# # Partly-adjusted model issues: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3201714/
# # lm.fit <- lm(expr ~ . - SCORE, data = expr_cov)
# # #lm.fit <- lm(expr ~ ., data = expr_cov)
# # #lm.fit.summary <- summary(lm.fit)
# # lm.fit2 <- lm(residuals(lm.fit) ~ SCORE)
# # lm.fit.summary <- summary(lm.fit2)
#
# # Capture p-val, etc.
# lm.res_ <-
# as.data.frame(t(coef(lm.fit.summary)['SCORE',]))
# intercept <- coef(lm.fit2)[1]
# names(intercept) <- NULL
# lm.res_ <-
# cbind(data.frame(intercept),
# lm.res_,
# t(confint(lm.fit2)['SCORE', ]),
# cor_expr_score)
# lm_res[k, ] <- lm.res_