Skip to content

Commit

Permalink
Adds full GP functionality
Browse files Browse the repository at this point in the history
Some further questions:
- [ ] should we delete temp files after processing?
- [ ] is using the replicate ids the right way to go?

Some TODOs:
- [ ] documentation for dependency management (probably a
requirements.txt and a bash script to install using pip)

Addresses issue #11
  • Loading branch information
elijah-tai committed Feb 21, 2017
1 parent 0110646 commit 9f9ae61
Show file tree
Hide file tree
Showing 13 changed files with 346 additions and 1 deletion.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,9 @@
.Rhistory
.RData
.Ruserdata

# temporary files for GPs
python/temp*

# Python
__pycache__
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ Imports:
ggplot2,
ComplexHeatmap,
reshape2,
grDevices
grDevices,
rPython
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ export(computemRECIST)
export(creatXevaSet)
export(drugWaterfall)
export(experimentDesignSummary)
export(fitGP)
export(getExpDesign)
export(getGPStatistics)
export(getModels)
export(getMolecularProfiles)
export(pasteColTogather)
Expand Down
141 changes: 141 additions & 0 deletions R/gp_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
##------------------------------------------------------------------------------------------
##------------------------------------------------------------------------------------------
#' Fit a GP using Python's GPy package.
#'
#' \code{fitGP} fit a GP using Python's GPy package
#'
#' @param replicates A vector of multiple replicates that user would like to fit GP on
#' @param source Data source
#'
#' @return results A list of time, predicted mean, and variance of fit GP
#'
#' @examples
#' fitGP(c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3", "PHLC111_P7.706.B1", "PHLC111_P7.708.B3"), lpdx)
#' fitGP(c("PHLC111_P7.702.A2", "PHLC111_P7.704.A4", "PHLC111_P7.705.A5", "PHLC111_P7.707.B2"), lpdx)
#' @export
fitGP <- function(replicates, source) {
times <- c()
volumes <- c()

max_time_index <- 0
max_volume_index <- 0

for (i in 1:length(replicates)) {
df <- getExperiment(source, replicates[i])
times[[i]] <- df$time
volumes[[i]] <- df$volume

if (length(df$time) > max_time_index) {
max_time_index <- length(df$time)
}

if (length(df$volume) > max_volume_index) {
max_volume_index <- length(df$volume)
}
}

# reformat times and volumes
for (i in 1:length(times)) {
times[[i]] <- c(times[[i]], rep(NA, max_time_index - length(times[[i]])))
volumes[[i]] <- c(volumes[[i]], rep(NA, max_volume_index - length(volumes[[i]])))
}

# write all to tempfiles
write.table(data.frame(lapply(t(times), as.numeric)),
file="python/temptime.csv", row.names = FALSE, col.names = FALSE)
write.table(data.frame(lapply(volumes, as.numeric), stringsAsFactors = FALSE),
file="python/tempvolume.csv", row.names = FALSE, col.names = FALSE)

# run python script
system("python3 python/fit_single_gp.py")

# get results
results = read.csv("python/tempresults.txt", sep = " ")

return(results)
}

##------------------------------------------------------------------------------------------
##------------------------------------------------------------------------------------------
#' Get KL divergence of two GPs
#'
#' \code{getGPStatistics} Get the KL divergence of two GPs.
#'
#' @param controlReplicates Vector of control replicates
#' @param caseReplicates Vector of case replicates
#' @param source Data source of replicates
#'
#' @return Returns the KL divergence value for two GPs.
#'
#' @examples
#' getGPStatistics(c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3"), c("PHLC111_P7.705.A5", "PHLC111_P7.707.B2"), lpdx)
#'
#' @export
getGPStatistics <- function(controlReplicates, caseReplicates, source) {
control_times <- c()
case_times <- c()

control_volumes <- c()
case_volumes <- c()

max_time_index <- 0
max_volume_index <- 0

for (i in 1:length(controlReplicates)) {
df <- getExperiment(source, controlReplicates[i])
control_times[[i]] <- df$time
control_volumes[[i]] <- df$volume

if (length(df$time) > max_time_index) {
max_time_index <- length(df$time)
}

if (length(df$volume) > max_volume_index) {
max_volume_index <- length(df$volume)
}
}

for (i in 1:length(caseReplicates)) {
df <- getExperiment(source, caseReplicates[i])
case_times[[i]] <- df$time
case_volumes[[i]] <- df$volume

if (length(df$time) > max_time_index) {
max_time_index <- length(df$time)
}

if (length(df$volume) > max_volume_index) {
max_volume_index <- length(df$volume)
}
}

# reformat times and volumes
for (i in 1:length(control_times)) {
control_times[[i]] <- c(control_times[[i]], rep(NA, max_time_index - length(control_times[[i]])))
control_volumes[[i]] <- c(control_volumes[[i]], rep(NA, max_volume_index - length(control_volumes[[i]])))
}

# reformat times and volumes
for (i in 1:length(case_times)) {
case_times[[i]] <- c(case_times[[i]], rep(NA, max_time_index - length(case_times[[i]])))
case_volumes[[i]] <- c(case_volumes[[i]], rep(NA, max_volume_index - length(case_volumes[[i]])))
}

# write all to tempfiles
write.table(data.frame(lapply(t(control_times), as.numeric)),
file="python/temp_control_time.csv", row.names = FALSE, col.names = FALSE)
write.table(data.frame(lapply(control_volumes, as.numeric), stringsAsFactors = FALSE),
file="python/temp_control_volume.csv", row.names = FALSE, col.names = FALSE)
write.table(data.frame(lapply(t(case_times), as.numeric)),
file="python/temp_case_time.csv", row.names = FALSE, col.names = FALSE)
write.table(data.frame(lapply(case_volumes, as.numeric), stringsAsFactors = FALSE),
file="python/temp_case_volume.csv", row.names = FALSE, col.names = FALSE)

# run python script
system("python3 python/two_gp_statistics.py")

# get results
results = read.csv("python/temp_statistics_results.txt", sep = " ")

return(results)
}
24 changes: 24 additions & 0 deletions man/fitGP.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions man/getGPStatistics.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions python/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Python Gaussian Processes

Docs to come...
Empty file added python/__init__.py
Empty file.
27 changes: 27 additions & 0 deletions python/fit_single_gp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python
import pandas as pd
import numpy as np

from gp import normalize_data, fit_gaussian_process

if __name__ == '__main__':

# import CSV files
df_time = pd.read_csv('python/temptime.csv', delimiter=" ", header=None).dropna()
df_volume = pd.read_csv('python/tempvolume.csv', delimiter=" ", header=None).dropna()

# Replace any zeros in volume size with something that won't error out
df_volume.replace(0, 0.000001, inplace=True)
x, y, y_norm = normalize_data(df_time, df_volume)

fit_gp = fit_gaussian_process(x, y_norm, len(df_time.columns))

# Print out results of GP
increment_by = 0.25
predict_x = np.arange(0, max(x), increment_by)

with open("python/tempresults.txt", "w") as f:
print("x", "prediction", "variance", file=f)
for i in predict_x:
prediction_at_i = fit_gp.predict(np.asarray([[i]]))
print(i, prediction_at_i[0][0][0], prediction_at_i[1][0][0], file=f)
85 changes: 85 additions & 0 deletions python/gp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from GPy.models import GPRegression
from GPy.kern import RBF
import numpy as np

from scipy.integrate import quad
from scipy.stats import norm


def normalize_data(X, Y):
"""
Normalizes all growths using normalize_first_day_and_log_transform helper function.
:param X: Pandas dataframe of times
:param Y: Pandas dataframe of volumes
:return:
"""
X = X[0]
x = np.asarray([[day] for day in X])
y = np.asarray([[size for size in Y[replicate]] for replicate in Y])
y_norm = __normalize_first_day_and_log_transform(y)

return x, y, y_norm

def __normalize_first_day_and_log_transform(y):
"""
Normalize by dividing every y element-wise by the first day's median
and then taking the log.
:param y:
:return:
"""
if y.ndim == 1:
return np.log(y / np.median(y[0]))
else:
return np.log(y / np.median(y.T[0]))


def fit_gaussian_process(x, y_norm, num_replicates):
"""
:param x: Numpy array of times
:param y_norm: Numpy array of normalized tumor volumes
:param num_replicates: i.e. Number of columns in DF
:return:
"""

kernel = RBF(input_dim=1, variance=1., lengthscale=10.)

x = np.tile(x, (num_replicates, 1))
y = np.resize(y_norm, (y_norm.shape[0] * y_norm.shape[1], 1))

gp = GPRegression(x, y, kernel)
gp.optimize_restarts(num_restarts=9, messages=False)

return gp

def calculate_kl_divergence(control_x, case_x, gp_control, gp_case):
"""
Calculates the KL divergence between the GPs fit for both the
batched controls and batched cases.
:param control_x:
:param case_x:
:param gp_control:
:param gp_case:
:return: kl_divergence
"""

kl_divergence = None

def kl_integrand(t):
mean_control, var_control = gp_control.predict(np.asarray([[t]]))
mean_case, var_case = gp_case.predict(np.asarray([[t]]))

return np.log10(var_case / var_control) + ((var_control + (mean_control - mean_case) ** 2) / (2 * var_case))

# TODO: Need to replace zero with drug start day - this data needs to come in from Xeva
if len(control_x) > len(case_x):
kl_divergence = abs(quad(kl_integrand, 0, max(case_x))[0]
- max(case_x) / 2)[0]
else:
kl_divergence = abs(quad(kl_integrand, 0, max(control_x))[0]
- max(control_x) / 2)[0]

return kl_divergence
3 changes: 3 additions & 0 deletions python/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
GPy==1.5.6
numpy==1.12.0
scipy==0.18.1
27 changes: 27 additions & 0 deletions python/two_gp_statistics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pandas as pd
import numpy as np

from gp import normalize_data, fit_gaussian_process, calculate_kl_divergence

if __name__ == '__main__':

# import CSV files
df_control_time = pd.read_csv('python/temp_control_time.csv', delimiter=" ", header=None).dropna()
df_case_time = pd.read_csv('python/temp_case_time.csv', delimiter=" ", header=None).dropna()
df_control_volume = pd.read_csv('python/temp_control_volume.csv', delimiter=" ", header=None).dropna()
df_case_volume = pd.read_csv('python/temp_case_volume.csv', delimiter=" ", header=None).dropna()

# Replace any zeros in volume size with something that won't error out
df_control_volume.replace(0, 0.000001, inplace=True)
df_case_volume.replace(0, 0.000001, inplace=True)

control_x, control_y, control_y_norm = normalize_data(df_control_time, df_control_volume)
case_x, case_y, case_y_norm = normalize_data(df_case_time, df_case_volume)

control_gp = fit_gaussian_process(control_x, control_y_norm, len(df_control_time.columns))
case_gp = fit_gaussian_process(case_x, case_y_norm, len(df_case_time.columns))

kl_divergence = calculate_kl_divergence(control_x, case_x, control_gp, case_gp)

with open("python/temp_statistics_results.txt", "w") as f:
print(kl_divergence, file=f)
Binary file modified vignettes/getting-started.pdf
Binary file not shown.

0 comments on commit 9f9ae61

Please sign in to comment.