diff --git a/.gitignore b/.gitignore
index 5b6a065..c801a94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,9 @@
 .Rhistory
 .RData
 .Ruserdata
+
+# temporary files for GPs
+python/temp*
+
+# Python
+__pycache__
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index 49694c5..78bfc66 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -19,4 +19,5 @@ Imports:
     ggplot2,
     ComplexHeatmap,
     reshape2,
-    grDevices
+    grDevices,
+    rPython
diff --git a/NAMESPACE b/NAMESPACE
index 089c92a..5fcf5ba 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,7 +8,9 @@ export(computemRECIST)
 export(creatXevaSet)
 export(drugWaterfall)
 export(experimentDesignSummary)
+export(fitGP)
 export(getExpDesign)
+export(getGPStatistics)
 export(getModels)
 export(getMolecularProfiles)
 export(pasteColTogather)
diff --git a/R/gp_functions.R b/R/gp_functions.R
new file mode 100644
index 0000000..87cd162
--- /dev/null
+++ b/R/gp_functions.R
@@ -0,0 +1,141 @@
+##------------------------------------------------------------------------------------------
+##------------------------------------------------------------------------------------------
+#' Fit a GP using Python's GPy package.
+#'
+#' \code{fitGP} fits a GP using Python's GPy package.
+#'
+#' @param replicates A vector of replicate IDs on which to fit a GP
+#' @param source Data source
+#'
+#' @return A data frame of time, predicted mean, and variance of the fitted GP
+#'
+#' @examples
+#' fitGP(c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3", "PHLC111_P7.706.B1", "PHLC111_P7.708.B3"), lpdx)
+#' fitGP(c("PHLC111_P7.702.A2", "PHLC111_P7.704.A4", "PHLC111_P7.705.A5", "PHLC111_P7.707.B2"), lpdx)
+#' @export
+fitGP <- function(replicates, source) {
+    times <- c()
+    volumes <- c()
+
+    max_time_index <- 0
+    max_volume_index <- 0
+
+    for (i in 1:length(replicates)) {
+        df <- getExperiment(source, replicates[i])
+        times[[i]] <- df$time
+        volumes[[i]] <- df$volume
+
+        if (length(df$time) > max_time_index) {
+            max_time_index <- length(df$time)
+        }
+
+        if (length(df$volume) > max_volume_index) {
+            max_volume_index <- length(df$volume)
+        }
+    }
+
+    # reformat times and volumes
+    for (i in 1:length(times)) {
+        times[[i]] <- c(times[[i]], rep(NA, max_time_index - length(times[[i]])))
+        volumes[[i]] <- c(volumes[[i]], rep(NA, max_volume_index - length(volumes[[i]])))
+    }
+
+    # write all to tempfiles
+    write.table(data.frame(lapply(t(times), as.numeric)),
+                file="python/temptime.csv", row.names = FALSE, col.names = FALSE)
+    write.table(data.frame(lapply(volumes, as.numeric), stringsAsFactors = FALSE),
+                file="python/tempvolume.csv", row.names = FALSE, col.names = FALSE)
+
+    # run python script
+    system("python3 python/fit_single_gp.py")
+
+    # get results
+    results <- read.csv("python/tempresults.txt", sep = " ")
+
+    return(results)
+}
+
+##------------------------------------------------------------------------------------------
+##------------------------------------------------------------------------------------------
+#' Get KL divergence of two GPs
+#'
+#' \code{getGPStatistics} computes the KL divergence between the GPs fit to the control and case replicates.
+#'
+#' @param controlReplicates Vector of control replicates
+#' @param caseReplicates Vector of case replicates
+#' @param source Data source of replicates
+#'
+#' @return Returns the KL divergence value between the two GPs.
+#'
+#' @examples
+#' getGPStatistics(c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3"), c("PHLC111_P7.705.A5", "PHLC111_P7.707.B2"), lpdx)
+#'
+#' @export
+getGPStatistics <- function(controlReplicates, caseReplicates, source) {
+    control_times <- c()
+    case_times <- c()
+
+    control_volumes <- c()
+    case_volumes <- c()
+
+    max_time_index <- 0
+    max_volume_index <- 0
+
+    for (i in 1:length(controlReplicates)) {
+        df <- getExperiment(source, controlReplicates[i])
+        control_times[[i]] <- df$time
+        control_volumes[[i]] <- df$volume
+
+        if (length(df$time) > max_time_index) {
+            max_time_index <- length(df$time)
+        }
+
+        if (length(df$volume) > max_volume_index) {
+            max_volume_index <- length(df$volume)
+        }
+    }
+
+    for (i in 1:length(caseReplicates)) {
+        df <- getExperiment(source, caseReplicates[i])
+        case_times[[i]] <- df$time
+        case_volumes[[i]] <- df$volume
+
+        if (length(df$time) > max_time_index) {
+            max_time_index <- length(df$time)
+        }
+
+        if (length(df$volume) > max_volume_index) {
+            max_volume_index <- length(df$volume)
+        }
+    }
+
+    # reformat times and volumes
+    for (i in 1:length(control_times)) {
+        control_times[[i]] <- c(control_times[[i]], rep(NA, max_time_index - length(control_times[[i]])))
+        control_volumes[[i]] <- c(control_volumes[[i]], rep(NA, max_volume_index - length(control_volumes[[i]])))
+    }
+
+    # reformat times and volumes
+    for (i in 1:length(case_times)) {
+        case_times[[i]] <- c(case_times[[i]], rep(NA, max_time_index - length(case_times[[i]])))
+        case_volumes[[i]] <- c(case_volumes[[i]], rep(NA, max_volume_index - length(case_volumes[[i]])))
+    }
+
+    # write all to tempfiles
+    write.table(data.frame(lapply(t(control_times), as.numeric)),
+                file="python/temp_control_time.csv", row.names = FALSE, col.names = FALSE)
+    write.table(data.frame(lapply(control_volumes, as.numeric), stringsAsFactors = FALSE),
+                file="python/temp_control_volume.csv", row.names = FALSE, col.names = FALSE)
+    write.table(data.frame(lapply(t(case_times), as.numeric)),
+                file="python/temp_case_time.csv", row.names = FALSE, col.names = FALSE)
+    write.table(data.frame(lapply(case_volumes, as.numeric), stringsAsFactors = FALSE),
+                file="python/temp_case_volume.csv", row.names = FALSE, col.names = FALSE)
+
+    # run python script
+    system("python3 python/two_gp_statistics.py")
+
+    # get results
+    results <- read.csv("python/temp_statistics_results.txt", sep = " ")
+
+    return(results)
+}
diff --git a/man/fitGP.Rd b/man/fitGP.Rd
new file mode 100644
index 0000000..2e74bd2
--- /dev/null
+++ b/man/fitGP.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gp_functions.R
+\name{fitGP}
+\alias{fitGP}
+\title{Fit a GP using Python's GPy package.}
+\usage{
+fitGP(replicates, source)
+}
+\arguments{
+\item{replicates}{A vector of replicate IDs on which to fit a GP}
+
+\item{source}{Data source}
+}
+\value{
+A data frame of time, predicted mean, and variance of the fitted GP
+}
+\description{
+\code{fitGP} fits a GP using Python's GPy package.
+}
+\examples{
+fitGP(c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3", "PHLC111_P7.706.B1", "PHLC111_P7.708.B3"), lpdx)
+fitGP(c("PHLC111_P7.702.A2", "PHLC111_P7.704.A4", "PHLC111_P7.705.A5", "PHLC111_P7.707.B2"), lpdx)
+}
+
diff --git a/man/getGPStatistics.Rd b/man/getGPStatistics.Rd
new file mode 100644
index 0000000..d7b5e0c
--- /dev/null
+++ b/man/getGPStatistics.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gp_functions.R
+\name{getGPStatistics}
+\alias{getGPStatistics}
+\title{Get KL divergence of two GPs}
+\usage{
+getGPStatistics(controlReplicates, caseReplicates, source)
+}
+\arguments{
+\item{controlReplicates}{Vector of control replicates}
+
+\item{caseReplicates}{Vector of case replicates}
+
+\item{source}{Data source of replicates}
+}
+\value{
+Returns the KL divergence value between the two GPs.
+}
+\description{
+\code{getGPStatistics} computes the KL divergence between the GPs fit to the control and case replicates.
+}
+\examples{
+getGPStatistics(c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3"), c("PHLC111_P7.705.A5", "PHLC111_P7.707.B2"), lpdx)
+
+}
+
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..4d6758f
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,3 @@
+# Python Gaussian Processes
+
+Docs to come...
\ No newline at end of file
diff --git a/python/__init__.py b/python/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/fit_single_gp.py b/python/fit_single_gp.py
new file mode 100644
index 0000000..8258001
--- /dev/null
+++ b/python/fit_single_gp.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+import pandas as pd
+import numpy as np
+
+from gp import normalize_data, fit_gaussian_process
+
+if __name__ == '__main__':
+
+    # import CSV files
+    df_time = pd.read_csv('python/temptime.csv', delimiter=" ", header=None).dropna()
+    df_volume = pd.read_csv('python/tempvolume.csv', delimiter=" ", header=None).dropna()
+
+    # Replace any zeros in volume size with something that won't error out
+    df_volume.replace(0, 0.000001, inplace=True)
+    x, y, y_norm = normalize_data(df_time, df_volume)
+
+    fit_gp = fit_gaussian_process(x, y_norm, len(df_time.columns))
+
+    # Print out results of GP
+    increment_by = 0.25
+    predict_x = np.arange(0, max(x), increment_by)
+
+    with open("python/tempresults.txt", "w") as f:
+        print("x", "prediction", "variance", file=f)
+        for i in predict_x:
+            prediction_at_i = fit_gp.predict(np.asarray([[i]]))
+            print(i, prediction_at_i[0][0][0], prediction_at_i[1][0][0], file=f)
diff --git a/python/gp.py b/python/gp.py
new file mode 100644
index 0000000..eedb255
--- /dev/null
+++ b/python/gp.py
@@ -0,0 +1,85 @@
+from GPy.models import GPRegression
+from GPy.kern import RBF
+import numpy as np
+
+from scipy.integrate import quad
+from scipy.stats import norm
+
+
+def normalize_data(X, Y):
+    """
+    Normalizes all growth curves using the __normalize_first_day_and_log_transform helper function.
+
+    :param X: Pandas dataframe of times
+    :param Y: Pandas dataframe of volumes
+    :return: times as a column vector, the raw volumes, and the normalized log volumes
+    """
+    X = X[0]
+    x = np.asarray([[day] for day in X])
+    y = np.asarray([[size for size in Y[replicate]] for replicate in Y])
+    y_norm = __normalize_first_day_and_log_transform(y)
+
+    return x, y, y_norm
+
+def __normalize_first_day_and_log_transform(y):
+    """
+    Normalize by dividing every y element-wise by the first day's median
+    and then taking the log.
+
+    :param y: Numpy array of volumes
+    :return: Numpy array of log-normalized volumes
+    """
+    if y.ndim == 1:
+        return np.log(y / np.median(y[0]))
+    else:
+        return np.log(y / np.median(y.T[0]))
+
+
+def fit_gaussian_process(x, y_norm, num_replicates):
+    """
+    Fit a GP with an RBF kernel to the normalized growth data of all replicates.
+    :param x: Numpy array of times
+    :param y_norm: Numpy array of normalized tumor volumes
+    :param num_replicates: Number of replicates, i.e. number of columns in the volume dataframe
+    :return: the fitted GPy GPRegression model
+    """
+
+    kernel = RBF(input_dim=1, variance=1., lengthscale=10.)
+
+    x = np.tile(x, (num_replicates, 1))
+    y = np.resize(y_norm, (y_norm.shape[0] * y_norm.shape[1], 1))
+
+    gp = GPRegression(x, y, kernel)
+    gp.optimize_restarts(num_restarts=9, messages=False)
+
+    return gp
+
+def calculate_kl_divergence(control_x, case_x, gp_control, gp_case):
+    """
+    Calculates the KL divergence between the GPs fit for both the
+    batched controls and batched cases.
+
+    :param control_x: Numpy array of control time points
+    :param case_x: Numpy array of case (treatment) time points
+    :param gp_control: Fitted GP for the batched controls
+    :param gp_case: Fitted GP for the batched cases
+    :return: kl_divergence
+    """
+
+    kl_divergence = None
+
+    def kl_integrand(t):
+        mean_control, var_control = gp_control.predict(np.asarray([[t]]))
+        mean_case, var_case = gp_case.predict(np.asarray([[t]]))
+
+        return np.log10(var_case / var_control) + ((var_control + (mean_control - mean_case) ** 2) / (2 * var_case))
+
+    # TODO: Need to replace zero with drug start day - this data needs to come in from Xeva
+    if len(control_x) > len(case_x):
+        kl_divergence = abs(quad(kl_integrand, 0, max(case_x))[0]
+                            - max(case_x) / 2)[0]
+    else:
+        kl_divergence = abs(quad(kl_integrand, 0, max(control_x))[0]
+                            - max(control_x) / 2)[0]
+
+    return kl_divergence
diff --git a/python/requirements.txt b/python/requirements.txt
new file mode 100644
index 0000000..e4aa978
--- /dev/null
+++ b/python/requirements.txt
@@ -0,0 +1,3 @@
+GPy==1.5.6
+numpy==1.12.0
+scipy==0.18.1
\ No newline at end of file
diff --git a/python/two_gp_statistics.py b/python/two_gp_statistics.py
new file mode 100644
index 0000000..d119ca3
--- /dev/null
+++ b/python/two_gp_statistics.py
@@ -0,0 +1,27 @@
+import pandas as pd
+import numpy as np
+
+from gp import normalize_data, fit_gaussian_process, calculate_kl_divergence
+
+if __name__ == '__main__':
+
+    # import CSV files
+    df_control_time = pd.read_csv('python/temp_control_time.csv', delimiter=" ", header=None).dropna()
+    df_case_time = pd.read_csv('python/temp_case_time.csv', delimiter=" ", header=None).dropna()
+    df_control_volume = pd.read_csv('python/temp_control_volume.csv', delimiter=" ", header=None).dropna()
+    df_case_volume = pd.read_csv('python/temp_case_volume.csv', delimiter=" ", header=None).dropna()
+
+    # Replace any zeros in volume size with something that won't error out
+    df_control_volume.replace(0, 0.000001, inplace=True)
+    df_case_volume.replace(0, 0.000001, inplace=True)
+
+    control_x, control_y, control_y_norm = normalize_data(df_control_time, df_control_volume)
+    case_x, case_y, case_y_norm = normalize_data(df_case_time, df_case_volume)
+
+    control_gp = fit_gaussian_process(control_x, control_y_norm, len(df_control_time.columns))
+    case_gp = fit_gaussian_process(case_x, case_y_norm, len(df_case_time.columns))
+
+    kl_divergence = calculate_kl_divergence(control_x, case_x, control_gp, case_gp)
+
+    with open("python/temp_statistics_results.txt", "w") as f:
+        print(kl_divergence, file=f)
diff --git a/vignettes/getting-started.pdf b/vignettes/getting-started.pdf
index 5a6dd75..b296811 100644
Binary files a/vignettes/getting-started.pdf and b/vignettes/getting-started.pdf differ
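Note on the statistic in python/gp.py: at each time point the control and case GPs give Gaussian posteriors, and the closed-form KL divergence between two univariate Gaussians is

    KL\left(\mathcal{N}(\mu_1, \sigma_1^2) \,\|\, \mathcal{N}(\mu_2, \sigma_2^2)\right)
        = \log\frac{\sigma_2}{\sigma_1}
        + \frac{\sigma_1^2 + (\mu_1 - \mu_2)^2}{2\sigma_2^2}
        - \frac{1}{2}

kl_integrand integrates a quantity of this form over the observation window (with the variance ratio inside the logarithm and base-10 logs), and the constant -1/2 term is handled afterwards by subtracting max(x)/2 from the value returned by quad.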
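The new exports hand data to Python through temporary CSV files and a system() call, so they assume Python 3 with the packages pinned in python/requirements.txt is on the PATH and that the working directory is the package root (all temp-file paths, e.g. "python/temptime.csv", are relative). A minimal usage sketch, assuming an Xeva-style dataset lpdx is already loaded in the session; the replicate IDs are the ones used in the roxygen examples:

    library(ggplot2)

    controls <- c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3")
    cases    <- c("PHLC111_P7.705.A5", "PHLC111_P7.707.B2")

    # fitGP() returns the table read back from python/tempresults.txt:
    # columns x (time), prediction (posterior mean) and variance.
    fit <- fitGP(controls, lpdx)

    # Plot the fitted mean with a +/- 2 standard deviation band.
    ggplot(fit, aes(x = x, y = prediction)) +
        geom_line() +
        geom_ribbon(aes(ymin = prediction - 2 * sqrt(variance),
                        ymax = prediction + 2 * sqrt(variance)), alpha = 0.3)

    # KL divergence between the GPs fit to the control and case arms.
    kl <- getGPStatistics(controls, cases, lpdx)

Because the hand-off uses fixed file names under python/, these calls are not safe to run concurrently from the same working directory.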