PLS overhaul
fradav committed Jun 3, 2021 · 1 parent f82254b · commit 7da165b
Showing 7 changed files with 105 additions and 59 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/standalone.yml
@@ -5,7 +5,7 @@ name: abcranger-build
on: [push]

env:
VCPKG_REVISION: "1be51cb4579e8f34317a0e53256095219ea85702"
VCPKG_REVISION: "5568f110b509a9fd90711978a7cb76bae75bb092"
CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
CIBW_SKIP: "cp27-* cp35-* pp* *-manylinux_i686 *-win32"
CIBW_BEFORE_ALL_LINUX: bash presetup.sh
11 changes: 3 additions & 8 deletions CMakeLists.txt
@@ -52,15 +52,10 @@ else()
target_link_libraries(ranger Threads::Threads)
endif()


project(abcranger VERSION 1.0.5 LANGUAGES CXX C)

include(pmm.cmake)

# if(MSVC)
# set(BOOSTREGEX "boost-regex")
# endif()

if(MSVC AND MAKE_STATIC_EXE)
set(VCPKG_TARGET_TRIPLET "x64-windows-static")
endif()
@@ -84,9 +79,9 @@ pmm(
# VERBOSE
# DEBUG
VCPKG
# REVISION e37cc662ee29852f45e85961124f62d91acb488a
REVISION 1be51cb4579e8f34317a0e53256095219ea85702
# REVISION 6185aa76504a5025f36754324abf307cc776f3da
# REVISION 1be51cb4579e8f34317a0e53256095219ea85702
# REVISION ed54efbb16ae8a457296c7a7e1462a9f54571e94
REVISION 5568f110b509a9fd90711978a7cb76bae75bb092
TRIPLET ${VCPKG_TARGET_TRIPLET}
REQUIRES range-v3 catch2 cxxopts fmt eigen3-head pybind11 ${TBBPKG} ${BOOSTREPKG} ${TESTDEPS}
PORTS "${PROJECT_SOURCE_DIR}/eigen3-head"
2 changes: 1 addition & 1 deletion eigen3-head/CONTROL
@@ -1,3 +1,3 @@
Source: eigen3-head
Version: 3.3.90
Version: 3.3.91
Description: C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.
8 changes: 5 additions & 3 deletions eigen3-head/portfile.cmake
@@ -10,9 +10,11 @@ endif()
vcpkg_from_gitlab(
GITLAB_URL https://gitlab.com
OUT_SOURCE_PATH SOURCE_PATH
REPO libeigen/eigen
REF 6b9c92fe7eff0dedb031cec38004c9c3667f3057
SHA512 09e9264f520851db2bd52d08683296d7c9738a9daa9a673b4e3bffb61279ac43a585ceb3373de17d1a77c59ec6bb66dd8d55eac4c53bce6b72509bed0eb75e50
REPO fradav/eigen
REF fd16e6d7e2bee0100cf48167723edfbf6b60e3ba
SHA512 b53b940cc4f5ca7f472b97ed631ef5518f88a2920eca79f6cc86b3a0672a81ddc996c2c6a0e3186b962aa3dba1f862a77131ec724158a03c01cd58c350b0a82c
# REF 6b9c92fe7eff0dedb031cec38004c9c3667f3057
# SHA512 09e9264f520851db2bd52d08683296d7c9738a9daa9a673b4e3bffb61279ac43a585ceb3373de17d1a77c59ec6bb66dd8d55eac4c53bce6b72509bed0eb75e50
HEAD_REF master
)

21 changes: 18 additions & 3 deletions src/EstimParam.cpp
@@ -25,6 +25,17 @@ using namespace ranger;
using namespace Eigen;
using namespace ranges;

/**
* @brief Main parameter estimation
*
* @tparam MatrixType
* @param myread input reftable
* @param statobs observed statistics (possibly multiple)
* @param opts command line options
* @param quiet mute output
* @param weights whether to output weights
* @return EstimParamResults structured results
*/
template <class MatrixType>
EstimParamResults EstimParam_fun(Reftable<MatrixType> &myread,
MatrixXd statobs,
@@ -100,7 +111,9 @@ EstimParamResults EstimParam_fun(Reftable<MatrixType> &myread,
MatrixXd Projection;
RowVectorXd mean, std;
size_t nComposante_sel;

if (!quiet) {
std::cout << "Computing PLS axis" << std::endl;
}
const std::string &pls_filename = outfile + ".plsvar";
std::ofstream pls_file;
if (!quiet)
@@ -109,7 +122,9 @@ EstimParamResults EstimParam_fun(Reftable<MatrixType> &myread,
auto validvars = filterConstantVars(myread.stats);
if (plsmaxvar == 0.0)
{
std::cout << "Using elbow method for selecting PLS axes" << std::endl;
if (!quiet) {
std::cout << "Using elbow method for selecting PLS axes" << std::endl;
}
VectorXd percentYvar = pls(myread.stats(all,validvars),
y,
(ncomp_total - (nstat - validvars.size())), Projection, mean, std, true);
@@ -138,7 +153,7 @@ EstimParamResults EstimParam_fun(Reftable<MatrixType> &myread,
}

if (!quiet)
std::cout << "Selecting only " << nComposante_sel << " pls components." << std::endl;
std::cout << std::endl << "Selecting only " << nComposante_sel << " pls components." << std::endl;

double sumPlsweights = Projection.col(0).array().abs().sum();
auto weightedPlsfirst = Projection.col(0) / sumPlsweights;
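
For orientation, the reworked PLS step in EstimParam.cpp now reads roughly as follows, as a condensed sketch assembled from the lines in this diff (plsmaxvar, ncomp_total, nstat and y are surrounding locals; this is illustrative, not a verbatim excerpt):

auto validvars = filterConstantVars(myread.stats);  // drop (near-)constant summary statistics
MatrixXd Projection;
RowVectorXd mean, std;
// plsmaxvar == 0.0 selects the elbow heuristic inside pls()
VectorXd percentYvar = pls(myread.stats(all, validvars), y,
                           ncomp_total - (nstat - validvars.size()),
                           Projection, mean, std, plsmaxvar == 0.0);
// the first PLS axis, normalized by its absolute sum, weights the statistics
double sumPlsweights = Projection.col(0).array().abs().sum();
auto weightedPlsfirst = Projection.col(0) / sumPlsweights;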
117 changes: 75 additions & 42 deletions src/pls-eigen.hpp
@@ -19,22 +19,46 @@
#include <list>
#include <algorithm>
#include <range/v3/all.hpp>
#include "tqdm.hpp"

using namespace Eigen;
using namespace std;
using namespace ranges;

/**
* @brief filters out constant variables
*
* @tparam Derived
* @param xr table to filter
 * @return std::vector<size_t> indices of valid variables
*/
template<class Derived>
std::vector<size_t> filterConstantVars(const MatrixBase<Derived>& xr) {
auto meanr = xr.colwise().mean();
auto stdr = ((xr.rowwise() - meanr).array().square().colwise().sum() / (xr.rows() - 1)).sqrt();
std::vector<size_t> validvars;
RowVectorXd meanr = xr.colwise().mean();
VectorXd stdr = ((xr.rowwise() - meanr).array().square().colwise().sum() / (xr.rows() - 1)).sqrt();
std::vector<size_t> validvars(xr.cols());
size_t m = 0;
for(size_t i = 0; i< xr.cols(); i++) {
if (stdr(i) >= 1.0e-8) validvars.push_back(i);
if (stdr(i) >= 1.0e-8) validvars[m++] = i;
}
validvars.resize(m);
return validvars;
}

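The returned indices plug into Eigen 3.4-style slicing, which is how EstimParam.cpp consumes them. A minimal self-contained sketch with hypothetical data (not part of the commit):

#include <Eigen/Dense>
#include "pls-eigen.hpp"  // this header, for filterConstantVars
using namespace Eigen;

int main() {
    MatrixXd stats = MatrixXd::Random(100, 50);
    stats.col(3).setConstant(1.0);               // one constant column, to be filtered out
    auto validvars = filterConstantVars(stats);  // indices of the 49 non-constant columns
    MatrixXd filtered = stats(all, validvars);   // keep only the valid variables
}
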
/**
* @brief Apply PLS on x regarding y
*
* @tparam Derived
* @tparam OtherDerived
* @param x input
* @param y output
* @param ncomp number of components expected
* @param Projection the projection matrix
* @param mean mean of variables in x
* @param std standard deviation of variables in x
* @param stopping elbow heuristic enabled
 * @return VectorXd explained variance for each computed component
*/
template<class Derived, class OtherDerived>
VectorXd pls(const MatrixBase<Derived>& x,
const MatrixBase<OtherDerived>& y,
@@ -44,53 +68,53 @@ VectorXd pls(const MatrixBase<Derived>& x,
RowVectorXd& std,
bool stopping = false)
{
auto n = x.rows();
auto p = x.cols();
size_t n = x.rows();
size_t p = x.cols();
mean = x.colwise().mean();
ncomp = std::min(std::min(n,p),ncomp);
std = ((x.rowwise() - mean).array().square().colwise().sum() / (x.rows() - 1)).sqrt();
MatrixXd X = (x.rowwise() - mean).array().rowwise() / std.array();
MatrixXd Xo = X;
MatrixXd XX = X.transpose() * X;
MatrixXd Y = MatrixXd::Zero(n, ncomp + 1);
MatrixXd P(p,ncomp);
MatrixXd R(p,ncomp);
VectorXd XY(n);
VectorXd w(p);
VectorXd r(p);
VectorXd t(p);
MatrixXd X0 = X;
MatrixXd Ptilde(ncomp,p);
MatrixXd Wstar(p,ncomp);
VectorXd res(ncomp);
size_t window_size = std::max(2_z,ncomp/10_z);

std::list<unsigned char> stopping_criterium(window_size,0);
// Y.col(0) = y.rowwise() - y.colwise().mean();
double ymean = y.mean();
Y.col(0).array() = ymean;
MatrixXd w_k, t_k, p_k, y_k;
y_k = y;
double SSTO = (y.array() - ymean).array().square().sum();
int m = 0;
// for (auto m = 0; m < ncomp; m++)
tqdm bar;

while (m < ncomp)
{
XY = X.transpose() * y;
SelfAdjointEigenSolver<MatrixXd> es( XY.transpose() * XY );
auto abs_eigenvalues = es.eigenvalues().array().abs();
size_t max_eigenvalue_indice = ranges::distance(abs_eigenvalues.begin(),ranges::max_element(abs_eigenvalues));
auto q = es.eigenvectors().col(max_eigenvalue_indice);
w = XY * q;
w /= sqrt((w.transpose()*w)(0,0));
r=w;
for (auto j=0; j<=m-1;j++)
{
r -= (P.col(j).transpose()*w)(0,0)*R.col(j);
}
R.col(m) = r;
t = Xo * r;
P.col(m) = XX *r/(t.transpose()*t)(0,0);
VectorXd Zm = X * XY;
double Znorm = Zm.array().square().sum();
double Thetam = Zm.dot(y) / Znorm;
Y.col(m + 1) = Y.col(m) + Thetam * Zm;
X -= Zm.rowwise().replicate(p) * ((Zm/Znorm).transpose() * X).asDiagonal();
res(m) = (Y.col(m + 1).array() - ymean).array().square().sum() / SSTO;
bar.progress(m,ncomp);
// $w_{k}=\frac{X_{k-1}^{T} y_{k-1}}{\left\|X_{k-1}^{T} y_{k-1}\right\|}$
w_k = X.transpose() * y; // (p)
w_k /= sqrt((w_k.transpose()*w_k)(0,0));
// $\mathbf{W}^{*}_{p \times K} = [w_1, \ldots, w_K]$
Wstar.col(m) = w_k;
// $t_{k}=X_{k-1}w_{k}$
t_k = X * w_k; // (n)
double t_k_s = (t_k.transpose() * t_k)(0,0);
// $p_{k}=\frac{X_{k-1}^{T} t_{k}}{t_{k}^{T} t_{k}}$
p_k = (X.transpose() * t_k) / t_k_s; // (p)
// $\widetilde{\mathbf{P}}_{K \times p}=\mathbf{t}\left[p_{1}, \ldots, p_{K}\right]$
Ptilde.row(m) = p_k.transpose();
// $q_{k}=\frac{y_{k-1}^{T} t_{k}}{t_{k}^{T} t_{k}}$
double q_k = (y_k.transpose() * t_k)(0,0) / t_k_s; // scalar
// $y_{k}=y_{k-1}-q_{k} t_{k}$
y_k -= q_k * t_k;
// $X_{k}=X_{k-1}-t_{k} p_{k}^{T}$
X -= (t_k * p_k.transpose());

// $$Yvar^m = 1 - \frac{\sum_{i=1}^{N}{(y_{k,i}-\bar{y})^2}}{\sum_{i=1}^{N}{(y_{i}-\bar{y})^2}}$$
res(m) = 1 - (y_k.array() - ymean).array().square().sum() / SSTO;

// Elbow heuristic: stop once the gain in explained variance flattens over a sliding window
if ((m >= 2) && stopping) {
auto lastdiff = res(m) - res(m-1);
auto lastmean = (res(m) + res(m-1))/2.0;
@@ -100,13 +124,22 @@ VectorXd pls(const MatrixBase<Derived>& x,
auto wcrit = ranges::accumulate(stopping_criterium,0);
if (wcrit == window_size) break;
}

m++;
}
if (m < ncomp) {
m--;
res = res(seq(0,m)).eval();
}
Projection = R;
// VectorXd res = (Y.block(0,1,n,ncomp).array() - ymean).array().square().colwise().sum() / SSTO;

// $\mathbf{W}=\mathbf{W}^{*}\left(\widetilde{\mathbf{P}} \mathbf{W}^{*}\right)^{-1}$
auto solver = (Ptilde*Wstar).completeOrthogonalDecomposition();
Projection = Wstar*solver.pseudoInverse();
return res;
}
}

// Tenenhaus, M. L’approche PLS. Revue de statistique appliquée 47, 5–40 (1999).
// Vancolen, S. La Régression PLS. Mémoire Postgrade en Statistiques, University of Neuchâtel (Switzerland) 1--28 (2004).
// Wold, S., Sjöström, M. & Eriksson, L. PLS-regression: a basic tool of chemometrics. Chemometrics and intelligent laboratory systems 58, 109–130 (2001).
// Ghislain's M2 thesis: https://plmbox.math.cnrs.fr/f/1192b14f90ea44a1b26a/
// Krämer, N. An overview on the shrinkage properties of partial least squares regression. Computational Statistics 22, 249–273 (2007).
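
Putting the new interface together, a minimal end-to-end sketch with hypothetical data (assumes this header; not part of the commit):

#include <Eigen/Dense>
#include "pls-eigen.hpp"
using namespace Eigen;

int main() {
    MatrixXd x = MatrixXd::Random(200, 30);  // 200 simulations, 30 summary statistics
    VectorXd y = VectorXd::Random(200);      // parameter of interest
    MatrixXd Projection;
    RowVectorXd mean, dev;                   // "dev" stands in for the std parameter
    // stopping = true enables the elbow heuristic
    VectorXd percentYvar = pls(x, y, 10, Projection, mean, dev, true);
    // percentYvar(k) is the share of Var(y) explained after k+1 components;
    // scores follow by standardizing with mean/dev and projecting, as in the header:
    MatrixXd scores = ((x.rowwise() - mean).array().rowwise() / dev.array()).matrix() * Projection;
}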
3 changes: 2 additions & 1 deletion test/CMakeLists.txt
@@ -40,12 +40,13 @@ target_sources(parse_parexpr-test PRIVATE ${ABCRANGER_SOURCE_DIR}/parse_parexpr.
target_sources(quantile-test PRIVATE ${ABCRANGER_SOURCE_DIR}/forestQuantiles.cpp)
target_sources(forestestimparam-ks-test PRIVATE ${ABCRANGER_SOURCE_DIR}/EstimParam.cpp)
target_sources(forestmodelchoice-ks-test PRIVATE ${ABCRANGER_SOURCE_DIR}/ModelChoice.cpp)

target_sources(pls-eigen-test PRIVATE ${ABCRANGER_SOURCE_DIR}/tqdm.cpp)

target_link_libraries(readreftable-test ${TBBLIB} ${BOOSTLIBS} HighFive hdf5::hdf5_cpp-static fmt::fmt)
target_link_libraries(forestestimparam-ks-test ${TBBLIB} cxxopts::cxxopts abcrangerlib fmt::fmt)
target_link_libraries(forestmodelchoice-ks-test ${TBBLIB} cxxopts::cxxopts abcrangerlib fmt::fmt)
target_link_libraries(cxxopts-test cxxopts::cxxopts)
target_link_libraries(pls-eigen-test ${TBBLIB} fmt::fmt)

target_sources(forestclass-test PRIVATE ${RANGER_SOURCE_DIR}/ForestClassification.cpp)
target_sources(forestreg-test PRIVATE ${RANGER_SOURCE_DIR}/ForestRegression.cpp)
