Adds full GP functionality

Some further questions: - [ ] should we delete temp files after processing? - [ ] is using the replicate ids the right way to go? Some TODOs: - [ ] documentation for dependency management (probably a requirements.txt and a bash script to install using pip) Addresses issue #11
bhklab · Feb 21, 2017 · 9f9ae61 · 9f9ae61
1 parent 0110646
commit 9f9ae61
Show file tree

Hide file tree

Showing 13 changed files with 346 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,9 @@
 .Rhistory
 .RData
 .Ruserdata
+
+# temporary files for GPs
+python/temp*
+
+# Python
+__pycache__
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -19,4 +19,5 @@ Imports:
   ggplot2,
   ComplexHeatmap,
   reshape2,
-  grDevices
+  grDevices,
+  rPython
diff --git a/NAMESPACE b/NAMESPACE
@@ -8,7 +8,9 @@ export(computemRECIST)
 export(creatXevaSet)
 export(drugWaterfall)
 export(experimentDesignSummary)
+export(fitGP)
 export(getExpDesign)
+export(getGPStatistics)
 export(getModels)
 export(getMolecularProfiles)
 export(pasteColTogather)

diff --git a/R/gp_functions.R b/R/gp_functions.R
@@ -0,0 +1,141 @@
+##------------------------------------------------------------------------------------------
+##------------------------------------------------------------------------------------------
+#' Fit a GP using Python's GPy package.
+#'
+#' \code{fitGP} fit a GP using Python's GPy package
+#'
+#' @param replicates A vector of multiple replicates that user would like to fit GP on
+#' @param source Data source
+#'
+#' @return results A list of time, predicted mean, and variance of fit GP
+#'
+#' @examples
+#' fitGP(c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3", "PHLC111_P7.706.B1", "PHLC111_P7.708.B3"), lpdx)
+#' fitGP(c("PHLC111_P7.702.A2", "PHLC111_P7.704.A4", "PHLC111_P7.705.A5", "PHLC111_P7.707.B2"), lpdx)
+#' @export
+fitGP <- function(replicates, source) {
+  times <- c()
+  volumes <- c()
+
+  max_time_index <- 0
+  max_volume_index <- 0
+
+  for (i in 1:length(replicates)) {
+    df <- getExperiment(source, replicates[i])
+    times[[i]] <- df$time
+    volumes[[i]] <- df$volume
+
+    if (length(df$time) > max_time_index) {
+      max_time_index <- length(df$time)
+    }
+
+    if (length(df$volume) > max_volume_index) {
+      max_volume_index <- length(df$volume)
+    }
+  }
+
+  # reformat times and volumes
+  for (i in 1:length(times)) {
+    times[[i]] <- c(times[[i]], rep(NA, max_time_index - length(times[[i]])))
+    volumes[[i]] <- c(volumes[[i]], rep(NA, max_volume_index - length(volumes[[i]])))
+  }
+
+  # write all to tempfiles
+  write.table(data.frame(lapply(t(times), as.numeric)),
+              file="python/temptime.csv", row.names = FALSE, col.names = FALSE)
+  write.table(data.frame(lapply(volumes, as.numeric), stringsAsFactors = FALSE),
+              file="python/tempvolume.csv", row.names = FALSE, col.names = FALSE)
+
+  # run python script
+  system("python3 python/fit_single_gp.py")
+
+  # get results
+  results = read.csv("python/tempresults.txt", sep = " ")
+
+  return(results)
+}
+
+##------------------------------------------------------------------------------------------
+##------------------------------------------------------------------------------------------
+#' Get KL divergence of two GPs
+#'
+#' \code{getGPStatistics} Get the KL divergence of two GPs.
+#'
+#' @param controlReplicates Vector of control replicates
+#' @param caseReplicates Vector of case replicates
+#' @param source Data source of replicates
+#'
+#' @return  Returns the KL divergence value for two GPs.
+#'
+#' @examples
+#' getGPStatistics(c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3"), c("PHLC111_P7.705.A5", "PHLC111_P7.707.B2"), lpdx)
+#'
+#' @export
+getGPStatistics <- function(controlReplicates, caseReplicates, source) {
+  control_times <- c()
+  case_times <- c()
+
+  control_volumes <- c()
+  case_volumes <- c()
+
+  max_time_index <- 0
+  max_volume_index <- 0
+
+  for (i in 1:length(controlReplicates)) {
+    df <- getExperiment(source, controlReplicates[i])
+    control_times[[i]] <- df$time
+    control_volumes[[i]] <- df$volume
+
+    if (length(df$time) > max_time_index) {
+      max_time_index <- length(df$time)
+    }
+
+    if (length(df$volume) > max_volume_index) {
+      max_volume_index <- length(df$volume)
+    }
+  }
+
+  for (i in 1:length(caseReplicates)) {
+    df <- getExperiment(source, caseReplicates[i])
+    case_times[[i]] <- df$time
+    case_volumes[[i]] <- df$volume
+
+    if (length(df$time) > max_time_index) {
+      max_time_index <- length(df$time)
+    }
+
+    if (length(df$volume) > max_volume_index) {
+      max_volume_index <- length(df$volume)
+    }
+  }
+
+  # reformat times and volumes
+  for (i in 1:length(control_times)) {
+    control_times[[i]] <- c(control_times[[i]], rep(NA, max_time_index - length(control_times[[i]])))
+    control_volumes[[i]] <- c(control_volumes[[i]], rep(NA, max_volume_index - length(control_volumes[[i]])))
+  }
+
+  # reformat times and volumes
+  for (i in 1:length(case_times)) {
+    case_times[[i]] <- c(case_times[[i]], rep(NA, max_time_index - length(case_times[[i]])))
+    case_volumes[[i]] <- c(case_volumes[[i]], rep(NA, max_volume_index - length(case_volumes[[i]])))
+  }
+
+  # write all to tempfiles
+  write.table(data.frame(lapply(t(control_times), as.numeric)),
+              file="python/temp_control_time.csv", row.names = FALSE, col.names = FALSE)
+  write.table(data.frame(lapply(control_volumes, as.numeric), stringsAsFactors = FALSE),
+              file="python/temp_control_volume.csv", row.names = FALSE, col.names = FALSE)
+  write.table(data.frame(lapply(t(case_times), as.numeric)),
+              file="python/temp_case_time.csv", row.names = FALSE, col.names = FALSE)
+  write.table(data.frame(lapply(case_volumes, as.numeric), stringsAsFactors = FALSE),
+              file="python/temp_case_volume.csv", row.names = FALSE, col.names = FALSE)
+
+  # run python script
+  system("python3 python/two_gp_statistics.py")
+
+  # get results
+  results = read.csv("python/temp_statistics_results.txt", sep = " ")
+
+  return(results)
+}
diff --git a/man/fitGP.Rd b/man/fitGP.Rd
diff --git a/man/getGPStatistics.Rd b/man/getGPStatistics.Rd
diff --git a/python/README.md b/python/README.md
@@ -0,0 +1,3 @@
+# Python Gaussian Processes
+
+Docs to come...
diff --git a/python/__init__.py b/python/__init__.py
diff --git a/python/fit_single_gp.py b/python/fit_single_gp.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+import pandas as pd
+import numpy as np
+
+from gp import normalize_data, fit_gaussian_process
+
+if __name__ == '__main__':
+
+    # import CSV files
+    df_time = pd.read_csv('python/temptime.csv', delimiter=" ", header=None).dropna()
+    df_volume = pd.read_csv('python/tempvolume.csv', delimiter=" ", header=None).dropna()
+
+    # Replace any zeros in volume size with something that won't error out
+    df_volume.replace(0, 0.000001, inplace=True)
+    x, y, y_norm = normalize_data(df_time, df_volume)
+
+    fit_gp = fit_gaussian_process(x, y_norm, len(df_time.columns))
+
+    # Print out results of GP
+    increment_by = 0.25
+    predict_x = np.arange(0, max(x), increment_by)
+
+    with open("python/tempresults.txt", "w") as f:
+        print("x", "prediction", "variance", file=f)
+        for i in predict_x:
+            prediction_at_i = fit_gp.predict(np.asarray([[i]]))
+            print(i, prediction_at_i[0][0][0], prediction_at_i[1][0][0], file=f)
diff --git a/python/gp.py b/python/gp.py
@@ -0,0 +1,85 @@
+from GPy.models import GPRegression
+from GPy.kern import RBF
+import numpy as np
+
+from scipy.integrate import quad
+from scipy.stats import norm
+
+
+def normalize_data(X, Y):
+    """
+    Normalizes all growths using normalize_first_day_and_log_transform helper function.
+
+    :param X: Pandas dataframe of times
+    :param Y: Pandas dataframe of volumes
+    :return:
+    """
+    X = X[0]
+    x = np.asarray([[day] for day in X])
+    y = np.asarray([[size for size in Y[replicate]] for replicate in Y])
+    y_norm = __normalize_first_day_and_log_transform(y)
+
+    return x, y, y_norm
+
+def __normalize_first_day_and_log_transform(y):
+    """
+    Normalize by dividing every y element-wise by the first day's median
+    and then taking the log.
+
+    :param y:
+    :return:
+    """
+    if y.ndim == 1:
+        return np.log(y / np.median(y[0]))
+    else:
+        return np.log(y / np.median(y.T[0]))
+
+
+def fit_gaussian_process(x, y_norm, num_replicates):
+    """
+
+    :param x: Numpy array of times
+    :param y_norm: Numpy array of normalized tumor volumes
+    :param num_replicates: i.e. Number of columns in DF
+    :return:
+    """
+
+    kernel = RBF(input_dim=1, variance=1., lengthscale=10.)
+
+    x = np.tile(x, (num_replicates, 1))
+    y = np.resize(y_norm, (y_norm.shape[0] * y_norm.shape[1], 1))
+
+    gp = GPRegression(x, y, kernel)
+    gp.optimize_restarts(num_restarts=9, messages=False)
+
+    return gp
+
+def calculate_kl_divergence(control_x, case_x, gp_control, gp_case):
+    """
+    Calculates the KL divergence between the GPs fit for both the
+    batched controls and batched cases.
+
+    :param control_x:
+    :param case_x:
+    :param gp_control:
+    :param gp_case:
+    :return: kl_divergence
+    """
+
+    kl_divergence = None
+
+    def kl_integrand(t):
+        mean_control, var_control = gp_control.predict(np.asarray([[t]]))
+        mean_case, var_case = gp_case.predict(np.asarray([[t]]))
+
+        return np.log10(var_case / var_control) + ((var_control + (mean_control - mean_case) ** 2) / (2 * var_case))
+
+    # TODO: Need to replace zero with drug start day - this data needs to come in from Xeva
+    if len(control_x) > len(case_x):
+        kl_divergence = abs(quad(kl_integrand, 0, max(case_x))[0]
+                                 - max(case_x) / 2)[0]
+    else:
+        kl_divergence = abs(quad(kl_integrand, 0, max(control_x))[0]
+                                 - max(control_x) / 2)[0]
+
+    return kl_divergence
diff --git a/python/requirements.txt b/python/requirements.txt
@@ -0,0 +1,3 @@
+GPy==1.5.6
+numpy==1.12.0
+scipy==0.18.1
diff --git a/python/two_gp_statistics.py b/python/two_gp_statistics.py
@@ -0,0 +1,27 @@
+import pandas as pd
+import numpy as np
+
+from gp import normalize_data, fit_gaussian_process, calculate_kl_divergence
+
+if __name__ == '__main__':
+
+    # import CSV files
+    df_control_time = pd.read_csv('python/temp_control_time.csv', delimiter=" ", header=None).dropna()
+    df_case_time = pd.read_csv('python/temp_case_time.csv', delimiter=" ", header=None).dropna()
+    df_control_volume = pd.read_csv('python/temp_control_volume.csv', delimiter=" ", header=None).dropna()
+    df_case_volume = pd.read_csv('python/temp_case_volume.csv', delimiter=" ", header=None).dropna()
+
+    # Replace any zeros in volume size with something that won't error out
+    df_control_volume.replace(0, 0.000001, inplace=True)
+    df_case_volume.replace(0, 0.000001, inplace=True)
+
+    control_x, control_y, control_y_norm = normalize_data(df_control_time, df_control_volume)
+    case_x, case_y, case_y_norm = normalize_data(df_case_time, df_case_volume)
+
+    control_gp = fit_gaussian_process(control_x, control_y_norm, len(df_control_time.columns))
+    case_gp = fit_gaussian_process(case_x, case_y_norm, len(df_case_time.columns))
+
+    kl_divergence = calculate_kl_divergence(control_x, case_x, control_gp, case_gp)
+
+    with open("python/temp_statistics_results.txt", "w") as f:
+        print(kl_divergence, file=f)
diff --git a/vignettes/getting-started.pdf b/vignettes/getting-started.pdf