diff --git a/.github/workflows/standalone.yml b/.github/workflows/standalone.yml index 02337cb..4f8b33c 100644 --- a/.github/workflows/standalone.yml +++ b/.github/workflows/standalone.yml @@ -5,7 +5,7 @@ name: abcranger-build on: [push] env: - VCPKG_REVISION: "1be51cb4579e8f34317a0e53256095219ea85702" + VCPKG_REVISION: "5568f110b509a9fd90711978a7cb76bae75bb092" CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 CIBW_SKIP: "cp27-* cp35-* pp* *-manylinux_i686 *-win32" CIBW_BEFORE_ALL_LINUX: bash presetup.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d60885..13800aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,15 +52,10 @@ else() target_link_libraries(ranger Threads::Threads) endif() - project(abcranger VERSION 1.0.5 LANGUAGES CXX C) include(pmm.cmake) -# if(MSVC) -# set(BOOSTREGEX "boost-regex") -# endif() - if(MSVC AND MAKE_STATIC_EXE) set(VCPKG_TARGET_TRIPLET "x64-windows-static") endif() @@ -84,9 +79,9 @@ pmm( # VERBOSE # DEBUG VCPKG - # REVISION e37cc662ee29852f45e85961124f62d91acb488a - REVISION 1be51cb4579e8f34317a0e53256095219ea85702 - # REVISION 6185aa76504a5025f36754324abf307cc776f3da + # REVISION 1be51cb4579e8f34317a0e53256095219ea85702 + # REVISION ed54efbb16ae8a457296c7a7e1462a9f54571e94 + REVISION 5568f110b509a9fd90711978a7cb76bae75bb092 TRIPLET ${VCPKG_TARGET_TRIPLET} REQUIRES range-v3 catch2 cxxopts fmt eigen3-head pybind11 ${TBBPKG} ${BOOSTREPKG} ${TESTDEPS} PORTS "${PROJECT_SOURCE_DIR}/eigen3-head" diff --git a/eigen3-head/CONTROL b/eigen3-head/CONTROL index 39706de..de20a46 100644 --- a/eigen3-head/CONTROL +++ b/eigen3-head/CONTROL @@ -1,3 +1,3 @@ Source: eigen3-head -Version: 3.3.90 +Version: 3.3.91 Description: C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms. diff --git a/eigen3-head/portfile.cmake b/eigen3-head/portfile.cmake index e1cfc0b..4e8c1fc 100644 --- a/eigen3-head/portfile.cmake +++ b/eigen3-head/portfile.cmake @@ -10,9 +10,11 @@ endif() vcpkg_from_gitlab( GITLAB_URL https://gitlab.com OUT_SOURCE_PATH SOURCE_PATH - REPO libeigen/eigen - REF 6b9c92fe7eff0dedb031cec38004c9c3667f3057 - SHA512 09e9264f520851db2bd52d08683296d7c9738a9daa9a673b4e3bffb61279ac43a585ceb3373de17d1a77c59ec6bb66dd8d55eac4c53bce6b72509bed0eb75e50 + REPO fradav/eigen + REF fd16e6d7e2bee0100cf48167723edfbf6b60e3ba + SHA512 b53b940cc4f5ca7f472b97ed631ef5518f88a2920eca79f6cc86b3a0672a81ddc996c2c6a0e3186b962aa3dba1f862a77131ec724158a03c01cd58c350b0a82c +# REF 6b9c92fe7eff0dedb031cec38004c9c3667f3057 +# SHA512 09e9264f520851db2bd52d08683296d7c9738a9daa9a673b4e3bffb61279ac43a585ceb3373de17d1a77c59ec6bb66dd8d55eac4c53bce6b72509bed0eb75e50 HEAD_REF master ) diff --git a/src/EstimParam.cpp b/src/EstimParam.cpp index 8ccd5e4..bd1244b 100644 --- a/src/EstimParam.cpp +++ b/src/EstimParam.cpp @@ -25,6 +25,17 @@ using namespace ranger; using namespace Eigen; using namespace ranges; +/** + * @brief Main parameter estimation + * + * @tparam MatrixType + * @param myread input reftable + * @param statobs osbserved (possibly multiple) + * @param opts command line options + * @param quiet mute output + * @param weights if we should give weights + * @return EstimParamResults structured results + */ template EstimParamResults EstimParam_fun(Reftable &myread, MatrixXd statobs, @@ -100,7 +111,9 @@ EstimParamResults EstimParam_fun(Reftable &myread, MatrixXd Projection; RowVectorXd mean, std; size_t nComposante_sel; - + if (!quiet) { + std::cout << "Computing PLS axis" << std::endl; + } const std::string &pls_filename = outfile + ".plsvar"; std::ofstream pls_file; if (!quiet) @@ -109,7 +122,9 @@ EstimParamResults EstimParam_fun(Reftable &myread, auto validvars = filterConstantVars(myread.stats); if (plsmaxvar == 0.0) { - std::cout << "Using elbow method for selecting PLS axes" << std::endl; + if (!quiet) { + std::cout << "Using elbow method for selecting PLS axes" << std::endl; + } VectorXd percentYvar = pls(myread.stats(all,validvars), y, (ncomp_total - (nstat - validvars.size())), Projection, mean, std, true); @@ -138,7 +153,7 @@ EstimParamResults EstimParam_fun(Reftable &myread, } if (!quiet) - std::cout << "Selecting only " << nComposante_sel << " pls components." << std::endl; + std::cout << std::endl << "Selecting only " << nComposante_sel << " pls components." << std::endl; double sumPlsweights = Projection.col(0).array().abs().sum(); auto weightedPlsfirst = Projection.col(0) / sumPlsweights; diff --git a/src/pls-eigen.hpp b/src/pls-eigen.hpp index ccbe4b1..2bff4bf 100644 --- a/src/pls-eigen.hpp +++ b/src/pls-eigen.hpp @@ -19,22 +19,46 @@ #include #include #include +#include "tqdm.hpp" using namespace Eigen; using namespace std; using namespace ranges; +/** + * @brief filters out constant variables + * + * @tparam Derived + * @param xr table to filter + * @return std::vector indexes of valid vars + */ template std::vector filterConstantVars(const MatrixBase& xr) { - auto meanr = xr.colwise().mean(); - auto stdr = ((xr.rowwise() - meanr).array().square().colwise().sum() / (xr.rows() - 1)).sqrt();; - std::vector validvars; + RowVectorXd meanr = xr.colwise().mean(); + VectorXd stdr = ((xr.rowwise() - meanr).array().square().colwise().sum() / (xr.rows() - 1)).sqrt();; + std::vector validvars(xr.cols()); + size_t m = 0; for(size_t i = 0; i< xr.cols(); i++) { - if (stdr(i) >= 1.0e-8) validvars.push_back(i); + if (stdr(i) >= 1.0e-8) validvars[m++] = i; } + validvars.resize(m); return validvars; } +/** + * @brief Apply PLS on x regarding y + * + * @tparam Derived + * @tparam OtherDerived + * @param x input + * @param y output + * @param ncomp number of components expected + * @param Projection the projection matrix + * @param mean mean of variables in x + * @param std standard deviation of variables in x + * @param stopping elbow heuristic enabled + * @return VectorXd explained variance for each computed components + */ template VectorXd pls(const MatrixBase& x, const MatrixBase& y, @@ -44,53 +68,53 @@ VectorXd pls(const MatrixBase& x, RowVectorXd& std, bool stopping = false) { - auto n = x.rows(); - auto p = x.cols(); + size_t n = x.rows(); + size_t p = x.cols(); mean = x.colwise().mean(); + ncomp = std::min(std::min(n,p),ncomp); std = ((x.rowwise() - mean).array().square().colwise().sum() / (x.rows() - 1)).sqrt();; MatrixXd X = (x.rowwise() - mean).array().rowwise() / std.array(); - MatrixXd Xo = X; - MatrixXd XX = X.transpose() * X; - MatrixXd Y = MatrixXd::Zero(n, ncomp + 1); - MatrixXd P(p,ncomp); - MatrixXd R(p,ncomp); - VectorXd XY(n); - VectorXd w(p); - VectorXd r(p); - VectorXd t(p); + MatrixXd X0 = X; + MatrixXd Ptilde(ncomp,p); + MatrixXd Wstar(p,ncomp); VectorXd res(ncomp); size_t window_size = std::max(2_z,ncomp/10_z); std::list stopping_criterium(window_size,0); - // Y.col(0) = y.rowwise() - y.colwise().mean(); double ymean = y.mean(); - Y.col(0).array() = ymean; + MatrixXd w_k, t_k, p_k, y_k; + y_k = y; double SSTO = (y.array() - ymean).array().square().sum(); int m = 0; - // for (auto m = 0; m < ncomp; m++) + tqdm bar; + while (m < ncomp) { - XY = X.transpose() * y; - SelfAdjointEigenSolver es( XY.transpose() * XY ); - auto abs_eigenvalues = es.eigenvalues().array().abs(); - size_t max_eigenvalue_indice = ranges::distance(abs_eigenvalues.begin(),ranges::max_element(abs_eigenvalues)); - auto q = es.eigenvectors().col(max_eigenvalue_indice); - w = XY * q; - w /= sqrt((w.transpose()*w)(0,0)); - r=w; - for (auto j=0; j<=m-1;j++) - { - r -= (P.col(j).transpose()*w)(0,0)*R.col(j); - } - R.col(m) = r; - t = Xo * r; - P.col(m) = XX *r/(t.transpose()*t)(0,0); - VectorXd Zm = X * XY; - double Znorm = Zm.array().square().sum(); - double Thetam = Zm.dot(y) / Znorm; - Y.col(m + 1) = Y.col(m) + Thetam * Zm; - X -= Zm.rowwise().replicate(p) * ((Zm/Znorm).transpose() * X).asDiagonal(); - res(m) = (Y.col(m + 1).array() - ymean).array().square().sum() / SSTO; + bar.progress(m,ncomp); + // $w_{k}=\frac{X_{k-1}^{T} y_{k-1}}{\left\|X_{k-1}^{T} y_{k-1}\right\|}$ + w_k = X.transpose() * y; // (p) + w_k /= sqrt((w_k.transpose()*w_k)(0,0)); + // $\mathbf{W}^{*}_{p \times K} = [w_1, \ldots, w_K]$ + Wstar.col(m) = w_k; + // $t_{k}=X_{k-1}w_{k}$ + t_k = X * w_k; // (n) + double t_k_s = (t_k.transpose() * t_k)(0,0); + // $p_{k}=\frac{X_{k-1}^{T} t_{k}}{t_{k}^{T} t_{k}}$ + p_k = (X.transpose() * t_k) / t_k_s; // (p) + // $\widetilde{\mathbf{P}}_{K \times p}=\mathbf{t}\left[p_{1}, \ldots, p_{K}\right]$ + Ptilde.row(m) = p_k.transpose(); + // $q_{k}=\frac{y_{k-1}^{T} t_{k}}{t_{k}^{T} t_{k}}$ + double q_k = (y_k.transpose() * t_k)(0,0) / t_k_s; //(n,n) + // $y_{k}=y_{k-1}-q_{k} t_{k}$ + y_k -= q_k * t_k; + // $X_{k}=X_{k-1}-t_{k} p_{k}^{T}$ + X -= (t_k * p_k.transpose()); + + // $$Yvar^m = \frac{\sum_{i=1}^{N}{(\hat{y}^{m}_{i}-\bar{y})^2}}{\sum_{i=1}^{N}{(y_{i}-\hat{y})^2}}$$ + res(m) = 1 - (y_k.array() - ymean).array().square().sum() / SSTO; + + // Elbow heuristic + // $$Yvar^m = \frac{\sum_{i=1}^{N}{(\hat{y}^{m}_{i}-\bar{y})^2}}{\sum_{i=1}^{N}{(y_{i}-\hat{y})^2}}$$ if ((m >= 2) && stopping) { auto lastdiff = res(m) - res(m-1); auto lastmean = (res(m) + res(m-1))/2.0; @@ -100,13 +124,22 @@ VectorXd pls(const MatrixBase& x, auto wcrit = ranges::accumulate(stopping_criterium,0); if (wcrit == window_size) break; } + m++; } if (m < ncomp) { m--; res = res(seq(0,m)).eval(); } - Projection = R; -// VectorXd res = (Y.block(0,1,n,ncomp).array() - ymean).array().square().colwise().sum() / SSTO; + + // $\mathbf{W}=\mathbf{W}^{*}\left(\widetilde{\mathbf{P}} \mathbf{W}^{*}\right)^{-1}$ + auto solver = (Ptilde*Wstar).completeOrthogonalDecomposition(); + Projection = Wstar*solver.pseudoInverse(); return res; -} \ No newline at end of file +} + +// Tenenhaus, M. L’approche PLS. Revue de statistique appliquée 47, 5–40 (1999). +// Vancolen, S. La Régression PLS. Mémoire Postgrade en Statistiques, University of Neuchâtel (Switzerland) 1--28 (2004). +// Wold, S., Sjöström, M. & Eriksson, L. PLS-regression: a basic tool of chemometrics. Chemometrics and intelligent laboratory systems 58, 109–130 (2001). +// Mémoire m2 de ghislain : https://plmbox.math.cnrs.fr/f/1192b14f90ea44a1b26a/ +// Krämer, N. An overview on the shrinkage properties of partial least squares regression. Computational Statistics 22, 249–273 (2007). \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 56aa138..1d29421 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -40,12 +40,13 @@ target_sources(parse_parexpr-test PRIVATE ${ABCRANGER_SOURCE_DIR}/parse_parexpr. target_sources(quantile-test PRIVATE ${ABCRANGER_SOURCE_DIR}/forestQuantiles.cpp) target_sources(forestestimparam-ks-test PRIVATE ${ABCRANGER_SOURCE_DIR}/EstimParam.cpp) target_sources(forestmodelchoice-ks-test PRIVATE ${ABCRANGER_SOURCE_DIR}/ModelChoice.cpp) - +target_sources(pls-eigen-test PRIVATE ${ABCRANGER_SOURCE_DIR}/tqdm.cpp) target_link_libraries(readreftable-test ${TBBLIB} ${BOOSTLIBS} HighFive hdf5::hdf5_cpp-static fmt::fmt) target_link_libraries(forestestimparam-ks-test ${TBBLIB} cxxopts::cxxopts abcrangerlib fmt::fmt) target_link_libraries(forestmodelchoice-ks-test ${TBBLIB} cxxopts::cxxopts abcrangerlib fmt::fmt) target_link_libraries(cxxopts-test cxxopts::cxxopts) +target_link_libraries(pls-eigen-test ${TBBLIB} fmt::fmt) target_sources(forestclass-test PRIVATE ${RANGER_SOURCE_DIR}/ForestClassification.cpp) target_sources(forestreg-test PRIVATE ${RANGER_SOURCE_DIR}/ForestRegression.cpp)