diff --git a/.devpy/cmds.py b/.devpy/cmds.py index e250281c3..039bc761e 100644 --- a/.devpy/cmds.py +++ b/.devpy/cmds.py @@ -4,6 +4,7 @@ import click from devpy import util +from devpy.cmds.meson import get_site_packages @click.command() @@ -17,7 +18,7 @@ def docs(build_dir, clean=False): print(f"Removing `{doc_dir}`") shutil.rmtree(doc_dir) - site_path = util.get_site_packages(build_dir) + site_path = get_site_packages() if site_path is None: print("No built scikit-tree found; run `./dev.py build` first.") sys.exit(1) diff --git a/docs/references.bib b/docs/references.bib index daddb32f6..989138c7c 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -2,10 +2,10 @@ % Try to keep this list in alphabetical order based on citing name @article{Li2019manifold, - title={Manifold Oblique Random Forests: Towards Closing the Gap on Convolutional Deep Networks}, - author={Li, Adam and Perry, Ronan and Huynh, Chester and Tomita, Tyler M and Mehta, Ronak and Arroyo, Jesus and Patsolic, Jesse and Falk, Benjamin and Vogelstein, Joshua T}, - journal={arXiv preprint arXiv:1909.11799}, - year={2019} + title = {Manifold Oblique Random Forests: Towards Closing the Gap on Convolutional Deep Networks}, + author = {Li, Adam and Perry, Ronan and Huynh, Chester and Tomita, Tyler M and Mehta, Ronak and Arroyo, Jesus and Patsolic, Jesse and Falk, Benjamin and Vogelstein, Joshua T}, + journal = {arXiv preprint arXiv:1909.11799}, + year = {2019} } @article{Meghana2019_geodesicrf, @@ -17,4 +17,20 @@ @article{Meghana2019_geodesicrf publisher = {arXiv}, year = {2019}, copyright = {arXiv.org perpetual, non-exclusive license} -} \ No newline at end of file +} + +% Causal + +@article{Athey2016GeneralizedRF, + title = {Generalized random forests}, + author = {Susan Athey and Julie Tibshirani and Stefan Wager}, + journal = {The Annals of Statistics}, + year = {2016} +} + +@article{chernozhukov2018double, + title = {Double/debiased machine learning for treatment and structural parameters}, + author = {Chernozhukov, Victor and Chetverikov, Denis and Demirer, Mert and Duflo, Esther and Hansen, Christian and Newey, Whitney and Robins, James}, + year = {2018}, + publisher = {Oxford University Press Oxford, UK} +} diff --git a/sktree/causal/__init__.py b/sktree/causal/__init__.py new file mode 100644 index 000000000..a9624b99d --- /dev/null +++ b/sktree/causal/__init__.py @@ -0,0 +1 @@ +from .tree import CausalForest, CausalTree diff --git a/sktree/causal/_grf_criterion.pxd b/sktree/causal/_grf_criterion.pxd new file mode 100644 index 000000000..02b3fec1f --- /dev/null +++ b/sktree/causal/_grf_criterion.pxd @@ -0,0 +1,75 @@ +# Licensed under the MIT License. +# Original Authors: +# - EconML +# - Vasilis Syrgkanis +# +# Modified by: Adam Li + +import numpy as np +cimport numpy as np + +from sklearn.tree._tree cimport DTYPE_t # Type of X +from sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight +from sklearn.tree._tree cimport SIZE_t # Type for indices and counters + +from sklearn.tree._criterion cimport Criterion, RegressionCriterion + + +cdef class GeneralizedMomentCriterion(RegressionCriterion): +# The A random vector of the linear moment equation for each sample of size (n_samples, n_outputs) + # these are the "weights" that are pre-computed. 
+ cdef const DOUBLE_t[:, ::1] alpha + + # the approximate Jacobian evaluated at every single sample point + # random vector of the linear moment equation + # size (n_samples, n_outputs, n_outputs) + cdef const DOUBLE_t[:, :, ::1] pointJ + + cdef DOUBLE_t[:, ::1] rho # Proxy heterogeneity label: rho = E[J | X in Node]^{-1} m(J, A; theta(Node)) of shape (`n_samples`, `n_outputs`) + cdef DOUBLE_t[:, ::1] moment # Moment for each sample: m(J, A; theta(Node)) of shape (`n_samples`, `n_outputs`) + cdef DOUBLE_t[:, ::1] J # Node average jacobian: J(Node) = E[J | X in Node] of shape (n_outputs, n_outputs) + cdef DOUBLE_t[:, ::1] invJ # Inverse of node average jacobian: J(Node)^{-1} of shape (n_outputs, n_outputs) + cdef DOUBLE_t[:] parameter # Estimated node parameter: theta(Node) = E[J|X in Node]^{-1} E[A|X in Node] + cdef DOUBLE_t[:] parameter_pre # Preconditioned node parameter: theta_pre(Node) = E[A | X in Node] + + cdef SIZE_t n_relevant_outputs + + cdef int compute_sample_preparameter( + self, + DOUBLE_t[:] parameter_pre, + const DOUBLE_t[:, ::1] alpha, + DOUBLE_t weight, + SIZE_t sample_index, + ) nogil except -1 + cdef int compute_sample_parameter( + self, + DOUBLE_t[:] parameter, + DOUBLE_t[:] parameter_pre, + DOUBLE_t[:, ::1] invJ + ) nogil except -1 + cdef int compute_sample_moment( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] alpha, + DOUBLE_t[:] parameter, + const DOUBLE_t[:, :, ::1] pointJ, + SIZE_t sample_index + ) except -1 nogil + cdef int compute_sample_rho( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] invJ, + SIZE_t sample_index + ) except -1 nogil + cdef int compute_sample_jacobian( + self, + DOUBLE_t[:, ::1] J, + const DOUBLE_t[:, :, ::1] pointJ, + DOUBLE_t weight, + SIZE_t sample_index, + ) except -1 nogil + cdef int compute_node_inv_jacobian( + self, + DOUBLE_t[:, ::1] J, + DOUBLE_t[:, ::1] invJ, + ) except -1 nogil diff --git a/sktree/causal/_grf_criterion.pyx b/sktree/causal/_grf_criterion.pyx new file mode 100644 index 000000000..90872859a --- /dev/null +++ b/sktree/causal/_grf_criterion.pyx @@ -0,0 +1,725 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + +# +# This code contains a refactoring of the EconML GRF code, +# published under the following license and copyright: +# BSD 3-Clause License +# +# All rights reserved. + +from libc.stdlib cimport calloc +from libc.stdlib cimport free +from libc.string cimport memcpy +from libc.string cimport memset +from libc.math cimport fabs, sqrt + +import numpy as np +cimport numpy as np +np.import_array() + +from ._utils cimport matinv_, pinv_ + +cdef double INFINITY = np.inf + +# TODO: OPEN QUESTIONS +# 1. why is alpha = y * T and y * Z in CATE and iV forests? +# 2. why is pointJ = T x T and T x Z? + +cdef class GeneralizedMomentCriterion(RegressionCriterion): + """A generalization of the regression criterion in scikit-learn. + + Generalized criterion with moment equations was introduced in the generalized + random forest paper, which shows how many common forests are trained using this + framework of criterion. + + A criterion class that estimates local parameters defined via moment equations + of the form:: + + E[ m(J, A; theta(x)) | X=x] + + where our specific instance is a linear moment equation:: + + E[ J * theta(x) - A | X=x] = 0 + + where: + + - m(J, A; theta(x)) is the moment equation that is specified per modeling setup. 
+ - J is the Jacobian array of shape (n_outputs, n_outputs) per sample + - A is the alpha weights of shape (n_outputs) per sample + - theta(x) is the parameters we are interested in of shape (n_outputs) per sample + - X is the data matrix of shape (n_samples, n_features) + + We have the following given points: + + - alpha is the weights per sample of shape (n_samples, n_outputs) + - pointJ is the pointwise Jacobian array per sample of shape (n_samples, n_outputs, n_outputs) + + Then we have the following estimating equations: + + J(Node) := E[J[i] | X[i] in Node] = sum_{i in Node} w[i] J[i] + moment[i] := J[i] * theta(Node) - A[i] + rho[i] := - J(Node)^{-1} (J[i] * theta(Node) - A[i]) + theta_pre(node) := E[A[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] A[i] + theta(Node) := J(Node)^{-1} E[A[i] | X[i] in Node] = J(node)^{-1} theta_pre(node) + + Notes + ----- + Calculates impurity based on heterogeneity induced on the estimated parameters, based on + the proxy score defined in :footcite:`Athey2016GeneralizedRF`. + + Specifically, we compute node, proxy and children impurity similarly to a + mean-squared error setting, where these are in general "proxies" for the true + criterion: + + n_{C1} * n_{C2} / n_P^2 (\hat{\\theta}_{C1} - \hat{\\theta}_{C2})^2 + + as specified in Equation 5 of the paper :footcite:`Athey2016GeneralizedRF`. + The proxy is computed with Equation 9 of the paper: + + 1/n_{C1} (\sum_{i \in C1} \\rho_i)^2 + 1/n_{C2} (\sum_{i \in C2} \\rho_i)^2 + + where :math:`\\rho_i` is the pseudo-label for the ith sample. + """ + def __cinit__( + self, + SIZE_t n_outputs, + SIZE_t n_samples, + SIZE_t n_relevant_outputs, + SIZE_t n_y, + ): + """Initialize parameters for this criterion. Parent `__cinit__` is always called before children. + So we only perform extra initializations that were not perfomed by the parent classes. + + Parameters + ---------- + n_outputs : SIZE_t + The number of parameters/values to be estimated + n_samples : SIZE_t + The total number of rows in the 2d matrix y. + The total number of samples to fit on. + n_relevant_outputs : SIZE_t + We only care about the first n_relevant_outputs of these parameters/values + n_y : SIZE_t + The first n_y columns of the 2d matrix y, contain the raw labels y_{ik}, the rest are auxiliary variables + """ + # Most initializations are handled by __cinit__ of RegressionCriterion + # which is always called in cython. We initialize the extras. 
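As an illustrative aside (not part of the patch): a minimal NumPy sketch of the estimating equations described in the class docstring above, assuming a single node with sample weights `w`, alpha vectors `alpha`, and point Jacobians `pointJ`. The variable names mirror the criterion's buffers but the code itself is hypothetical.

```python
import numpy as np

# Toy node: n samples, d outputs (parameters to estimate)
rng = np.random.default_rng(0)
n, d = 50, 2
w = np.ones(n)                         # sample weights
alpha = rng.normal(size=(n, d))        # A[i] vectors
pointJ = np.tile(np.eye(d), (n, 1, 1)) + 0.1 * rng.normal(size=(n, d, d))  # J[i]

# J(node)     = sum_i w[i] J[i]        (un-normalized node Jacobian)
# theta_pre   = sum_i w[i] A[i]        (un-normalized preconditioned parameter)
# theta(node) = J(node)^{-1} theta_pre
J_node = np.einsum("i,ijk->jk", w, pointJ)
theta_pre = w @ alpha
theta = np.linalg.solve(J_node, theta_pre)

# moment[i] = J[i] theta(node) - A[i]
# rho[i]    = -(J(node) / weight(node))^{-1} moment[i]
moment = pointJ @ theta - alpha
rho = -moment @ np.linalg.inv(J_node / w.sum()).T
```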
+        if n_y > 1:
+            raise AttributeError("GeneralizedMomentCriterion currently only supports a scalar y")
+
+        self.proxy_children_impurity = True    # The children_impurity() only returns an approximate proxy
+
+        self.n_outputs = n_outputs
+        self.n_relevant_outputs = n_relevant_outputs
+        self.n_y = n_y
+
+        # Allocate memory for the proxy for y, which is rho in the generalized random forest.
+        # Since rho is node dependent it needs to be re-calculated and stored for each sample
+        # in the node for every node we are investigating
+        self.rho = np.zeros((n_samples, n_outputs), dtype=np.float64)
+        self.moment = np.zeros((n_samples, n_outputs), dtype=np.float64)
+        self.J = np.zeros((n_outputs, n_outputs), dtype=np.float64)
+        self.invJ = np.zeros((n_outputs, n_outputs), dtype=np.float64)
+        self.parameter = np.zeros(n_outputs, dtype=np.float64)
+        self.parameter_pre = np.zeros(n_outputs, dtype=np.float64)
+
+    cdef int init(
+        self,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight,
+        double weighted_n_samples,
+        const SIZE_t[:] sample_indices
+    ) nogil except -1:
+        """Initialize the criterion object with data.
+
+        Parameters
+        ----------
+        y : DOUBLE_t 2D memoryview of shape (n_samples, n_y + n_outputs + n_outputs * n_outputs)
+            The input y array.
+        sample_weight : ndarray, dtype=DOUBLE_t
+            The weight of each sample stored as a Cython memoryview.
+        weighted_n_samples : double
+            The total weight of the samples being considered.
+        sample_indices : ndarray, dtype=SIZE_t
+            A mask on the samples. Indices of the samples in X and y we want to use,
+            where sample_indices[start:end] correspond to the samples in this node.
+
+        Notes
+        -----
+        For the generalized criterion, the y array has columns associated with the
+        actual 'y' in the first `n_y` columns, the next `n_outputs` columns are associated
+        with the alpha vectors and the next `n_outputs * n_outputs` columns are
+        associated with the estimated sample Jacobian vectors.
+
+        `n_relevant_outputs` is a number less than or equal to `n_outputs`, which
+        tracks the relevant outputs.
+        """
+        cdef SIZE_t n_outputs = self.n_outputs
+        cdef SIZE_t n_y = self.n_y
+
+        # Initialize fields
+        self.sample_weight = sample_weight  # Store the sample_weight locally
+        self.sample_indices = sample_indices  # Store the sample index structure used and updated by the splitter
+        self.weighted_n_samples = weighted_n_samples  # Store total weight of all samples computed by splitter
+
+        self.y = y[:, :n_y]  # The first n_y columns of y are the original raw outcome
+        self.alpha = y[:, n_y:(n_y + n_outputs)]  # A[i] part of the moment is the next n_outputs columns
+        # J[i] part of the moment is the next n_outputs * n_outputs columns, stored in Fortran contiguous format
+        self.pointJ = y[:, (n_y + n_outputs):(n_y + n_outputs + n_outputs * n_outputs)]
+
+        return 0
+
+    cdef double node_impurity(
+        self
+    ) noexcept nogil:
+        """Evaluate the impurity of the current node, i.e. the impurity of
+        samples[start:end].
+
+        This sums up the relevant metric over all "relevant outputs" (`n_relevant_outputs`)
+        for samples in the node and then normalizes to get the average impurity of the node.
+        Similarly, `sum_total` stores an (`n_outputs`,) array and should have been computed a priori.
+ + This distinctly generalizes the scikit-learn Regression Criterion, as the `n_outputs` + contains both `n_relevant_outputs` and extra outputs that are nuisance parameters. But + otherwise, the node impurity calculation follows that of a regression. + """ + + cdef double* sum_total = self.sum_total + cdef double impurity + cdef SIZE_t k + + impurity = self.sq_sum_total / self.weighted_n_node_samples + for k in range(self.n_relevant_outputs): + impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + + return impurity / self.n_relevant_outputs + + cdef double proxy_impurity_improvement( + self + ) noexcept nogil: + """Compute a proxy of the impurity reduction. + + This method is used to speed up the search for the best split. It is a proxy quantity such that the + split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + + This sums up the relevant metric over all "relevant outputs" (`n_relevant_outputs`) + for samples in the node and then normalizes to get the average impurity of the node. + Similarly, `sum_left` and `sum_right` stores an (`n_outputs`,) array and should have + been computed apriori. + + This distinctly generalizes the scikit-learn Regression Criterion, as the `n_outputs` + contains both `n_relevant_outputs` and extra outputs that are nuisance parameters. But + otherwise, the node impurity calculation follows that of a regression. + """ + + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + + cdef SIZE_t k + cdef double proxy_impurity_left = 0.0 + cdef double proxy_impurity_right = 0.0 + + for k in range(self.n_relevant_outputs): + proxy_impurity_left += sum_left[k] * sum_left[k] + proxy_impurity_right += sum_right[k] * sum_right[k] + + return (proxy_impurity_left / self.weighted_n_left + + proxy_impurity_right / self.weighted_n_right) + + cdef void children_impurity( + self, + double* impurity_left, + double* impurity_right + ) noexept nogil: + """Evaluate the impurity in children nodes, i.e. the impurity of the + left child (samples[start:pos]) and the impurity the right child + (samples[pos:end]). 
Here we use the proxy child impurity: + impurity_child[k] = sum_{i in child} w[i] rho[i, k]^2 / weight(child) + - (sum_{i in child} w[i] * rho[i, k] / weight(child))^2 + impurity_child = sum_{k in n_relevant_outputs} impurity_child[k] / n_relevant_outputs + """ + cdef SIZE_t pos = self.pos + cdef SIZE_t start = self.start + cdef DOUBLE_t y_ik + + cdef double sq_sum_left = 0.0 + cdef double sq_sum_right + + cdef SIZE_t i, p, k, offset + cdef DOUBLE_t w = 1.0 + + # We calculate: sq_sum_left = sum_{i in child} w[i] rho[i, k]^2 + for p in range(start, pos): + i = self.sample_indices[p] + + if self.sample_weight is not None: + w = self.sample_weight[i] + + for k in range(self.n_relevant_outputs): + y_ik = self.rho[i, k] + sq_sum_left += w * y_ik * y_ik + # We calculate sq_sum_right = sq_sum_total - sq_sum_left + sq_sum_right = self.sq_sum_total - sq_sum_left + + # We normalize each sq_sum_child by the weight of that child + impurity_left[0] = sq_sum_left / self.weighted_n_left + impurity_right[0] = sq_sum_right / self.weighted_n_right + + # We subtract from the impurity_child, the quantity: + # sum_{k in n_relevant_outputs} (sum_{i in child} w[i] * rho[i, k] / weight(child))^2 + # = sum_{k in n_relevant_outputs} (sum_child[k] / weight(child))^2 + for k in range(self.n_relevant_outputs): + impurity_left[0] -= (self.sum_left[k] / self.weighted_n_left) ** 2.0 + impurity_right[0] -= (self.sum_right[k] / self.weighted_n_right) ** 2.0 + + impurity_left[0] /= self.n_relevant_outputs + impurity_right[0] /= self.n_relevant_outputs + + cdef int compute_sample_preparameter( + self, + DOUBLE_t[:] parameter_pre, + const DOUBLE_t[:, ::1] alpha, + DOUBLE_t weight, + SIZE_t sample_index, + ) nogil except -1: + """Calculate the un-normalized pre-conditioned parameter. + + theta_pre(node) := E[A[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] A[i] + theta(node) := J(node)^{-1} theta_pre(node) + + Parameters + ---------- + parameter_pre : DOUBLE_t memoryview of size (n_outputs,) + To be computed node un-normalized pre-conditioned parameter theta_pre(node). + alpha : DOUBLE_t 2D memoryview of size (n_samples, n_outputs) + The memory view that contains the A[i] for each sample i + weight : DOUBLE_t + The weight of the sample. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. + """ + cdef SIZE_t i, j, p + cdef DOUBLE_t w = 1.0 + + # compute parameter_pre per output dimension, j as the + # \sum_{i} w[i] * alpha[i, j] + for j in range(self.n_outputs): + parameter_pre[j] += weight * alpha[sample_index, j] + return 0 + + cdef int compute_sample_parameter( + self, + DOUBLE_t[:] parameter, + DOUBLE_t[:] parameter_pre, + DOUBLE_t[:, ::1] invJ + ) nogil except -1: + """Calculate the parameter. + + theta_pre(node) := E[A[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] A[i] + theta(node) := J(node)^{-1} theta_pre(node) + + Parameters + ---------- + parameter : DOUBLE_t memoryview of size (n_outputs,) + To be computed node parameter theta(node). + parameter_pre : DOUBLE_t memoryview of size (n_outputs,) + Already computed node un-normalized pre-conditioned parameter theta_pre(node) + invJ : DOUBLE_t 2D memoryview of size (n_outputs, n_outputs) + Unnormalized node jacobian inverse J(node)^{-1} in C-contiguous format. 
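As an aside, a minimal NumPy sketch of the proxy split score used by `proxy_impurity_improvement` and `children_impurity` above (Equation 9 of :footcite:`Athey2016GeneralizedRF`), assuming `rho` holds the pseudo-labels of the samples currently in the node and `w` their weights; the helper name is hypothetical.

```python
import numpy as np

def proxy_split_score(rho, w, left_mask):
    """Proxy criterion of Eq. 9: larger values indicate a better split.

    rho : (n_node_samples, n_relevant_outputs) pseudo-labels
    w : (n_node_samples,) sample weights
    left_mask : boolean mask selecting the left child
    """
    score = 0.0
    for mask in (left_mask, ~left_mask):
        s = (w[mask, None] * rho[mask]).sum(axis=0)   # sum_{i in child} w_i rho_i
        score += (s * s).sum() / w[mask].sum()
    return score
```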
+ """ + cdef SIZE_t i, j + + for j in range(self.n_outputs): + for i in range(self.n_outputs): + parameter[i] += invJ[i, j] * parameter_pre[j] + return 0 + + cdef int compute_sample_moment( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] alpha, + DOUBLE_t[:] parameter, + const DOUBLE_t[:, :, ::1] pointJ, + SIZE_t sample_index + ) except -1 nogil: + """Calculate the linear moment and rho for each sample i in the node. + + moment[i] := J[i] * theta(Node) - A[i] + rho[i] := - (J(Node) / weight(node))^{-1} * moment[i] + + Parameters + ---------- + moment : DOUBLE_t 2D memoryview of shape (`n_samples`, `n_outputs`) + Array of moments per sample in node. + alpha : DOUBLE_t[:, ::1] of size (n_samples, n_outputs) + The memory view that contains the A[i] for each sample i + parameter : DOUBLE_t memoryview of shape (`n_outputs`) + Array of computed parameters theta(x) per sample in node. + pointJ : DOUBLE_t[:, :, ::1] 3D memory of size (n_samples, n_outputs, n_outputs) + The memory view that contains the J[i] for each sample i, where J[i] is the + Jacobian array. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. + """ + cdef SIZE_t j, k + + # compute the moment for each output + for j in range(self.n_outputs): + moment[sample_index, j] = - alpha[sample_index, j] + for k in range(self.n_outputs): + moment[sample_index, j] += pointJ[sample_index, j, k] * parameter[k] + return 0 + + cdef int compute_sample_rho( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] invJ, + SIZE_t sample_index + ) except -1 nogil: + """Calculate the rho for each sample i in the node. + + moment[i] := J[i] * theta(Node) - A[i] + rho[i] := - (J(Node) / weight(node))^{-1} * moment[i] + + Parameters + ---------- + moment : DOUBLE_t 2D memoryview of shape (`n_samples`, `n_outputs`) + Array of moments per sample in node. + invJ : DOUBLE_t 2D memoryview of size (n_outputs, n_outputs) + Unnormalized node jacobian inverse J(node)^{-1} in C-contiguous format. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. + """ + cdef SIZE_t j, k + + # compute the moment for each output + for j in range(self.n_outputs): + rho[sample_index, j] = 0.0 + for k in range(self.n_outputs): + rho[sample_index, j] -= ( + invJ[j, k] * \ + moment[sample_index, k] * \ + self.weighted_n_node_samples + ) + return 0 + + cdef int compute_sample_jacobian( + self, + DOUBLE_t[:, ::1] J, + const DOUBLE_t[:, :, ::1] pointJ, + DOUBLE_t weight, + SIZE_t sample_index, + ) except -1 nogil: + """Calculate the un-normalized jacobian for a sample:: + + J(node) := E[J[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] J[i] + + and its inverse J(node)^{-1} (or revert to pseudo-inverse if the matrix is not invertible). For + dimensions n_outputs={1, 2}, we also add a small constant of 1e-6 on the diagonals of J, to ensure + invertibility. For larger dimensions we revert to pseudo-inverse if the matrix is not invertible. + + Parameters + ---------- + J : DOUBLE_t 2D memoryview (n_outputs, n_outputs) + Un-normalized jacobian J(node) in C-contiguous format + pointJ : DOUBLE_t[:, :, ::1] 3D memory of size (n_samples, n_outputs, n_outputs) + The memory view that contains the J[i] for each sample i, where J[i] is the + Jacobian array. + weight : DOUBLE_t + The weight of the sample. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. 
+ """ + cdef SIZE_t j, k + + # Calculate un-normalized empirical jacobian + for j in range(self.n_outputs): + for k in range(self.n_outputs): + J[j, k] += w * pointJ[sample_index, j, k] + + return 0 + + cdef int compute_node_inv_jacobian( + self, + DOUBLE_t[:, ::1] J, + DOUBLE_t[:, ::1] invJ, + ) except -1 nogil: + """Calculate the node inverse un-normalized jacobian:: + + Its inverse J(node)^{-1} (or revert to pseudo-inverse if the matrix is not invertible). For + dimensions n_outputs={1, 2}, we also add a small constant of 1e-6 on the diagonals of J, to ensure + invertibility. For larger dimensions we revert to pseudo-inverse if the matrix is not invertible. + + Parameters + ---------- + J : DOUBLE_t 2D memoryview (n_outputs, n_outputs) + Un-normalized jacobian J(node) in C-contiguous format + invJ : DOUBLE_t 2D memoryview of size (n_outputs, n_outputs) + Unnormalized node jacobian inverse J(node)^{-1} in C-contiguous format. + """ + cdef SIZE_t k + + # Calculae inverse and store it in invJ + if self.n_outputs <=2: + # Fast closed form inverse calculation + _fast_invJ(J, invJ, self.n_outputs, clip=1e-6) + else: + for k in range(self.n_outputs): + J[k, k] += 1e-6 # add small diagonal constant for invertibility + + # Slower matrix inverse via lapack package + if not matinv_(J, invJ, self.n_outputs): # if matrix is invertible use the inverse + pinv_(J, invJ, self.n_outputs, self.n_outputs) # else calculate pseudo-inverse + + for k in range(self.n_outputs): + J[k, k] -= 1e-6 # remove the invertibility constant + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion and update statistics. + + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + self.start = start + self.end = end + + self.n_node_samples = end - start + + self.sq_sum_total = 0.0 + self.weighted_n_node_samples = 0. 
+
+        cdef SIZE_t i
+        cdef SIZE_t p
+        cdef SIZE_t k
+        cdef DOUBLE_t y_ik
+        cdef DOUBLE_t w_y_ik
+        cdef DOUBLE_t w = 1.0
+
+        memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double))
+
+        # Init jacobian matrix to zero
+        self.J[:, :] = 0.0
+
+        # init parameter vectors to zero
+        self.parameter_pre[:] = 0.0
+        self.parameter[:] = 0.0
+
+        # perform the first loop to aggregate
+        for p in range(start, end):
+            i = self.sample_indices[p]
+
+            if self.sample_weight is not None:
+                w = self.sample_weight[i]
+            self.weighted_n_node_samples += w
+
+            # compute the Jacobian for this sample
+            self.compute_sample_jacobian(
+                self.J, self.pointJ, w, i)
+
+            # compute the pre-conditioned parameter
+            self.compute_sample_preparameter(
+                self.parameter_pre,
+                self.alpha,
+                w,
+                i
+            )
+
+        # compute the inverse-Jacobian
+        self.compute_node_inv_jacobian(self.J, self.invJ)
+
+        # compute the node parameter, theta(node) = J(node)^{-1} theta_pre(node)
+        self.compute_sample_parameter(
+            self.parameter,
+            self.parameter_pre,
+            self.invJ
+        )
+
+        for p in range(start, end):
+            i = self.sample_indices[p]
+            if self.sample_weight is not None:
+                w = self.sample_weight[i]
+
+            # next compute the moment for this sample
+            self.compute_sample_moment(
+                self.moment,
+                self.alpha,
+                self.parameter,
+                self.pointJ,
+                i
+            )
+
+            # compute the pseudo-label, rho
+            self.compute_sample_rho(
+                self.moment,
+                self.invJ,
+                i
+            )
+
+            # compute statistics for the current total sum and square sum
+            # of the metrics for criterion, in this case the pseudo-label
+            for k in range(self.n_outputs):
+                y_ik = self.rho[i, k]
+                w_y_ik = w * y_ik
+
+                self.sum_total[k] += w_y_ik
+
+                if k < self.n_relevant_outputs:
+                    self.sq_sum_total += w_y_ik * y_ik
+
+        # Reset to pos=start
+        self.reset()
+
+    cdef int update(
+        self,
+        SIZE_t new_pos
+    ) except -1 nogil:
+        """Update statistics by moving samples[pos:new_pos] to the left."""
+        cdef SIZE_t pos = self.pos
+        cdef SIZE_t end = self.end
+        cdef SIZE_t i, p, k
+        cdef DOUBLE_t w = 1.0
+
+        # Update statistics up to new_pos
+        #
+        # Given that
+        #   sum_left[x] + sum_right[x] = sum_total[x]
+        #   var_left[x] + var_right[x] = var_total[x]
+        # and that sum_total and var_total are known, we are going to update
+        # sum_left and var_left from the direction that requires the least amount
+        # of computations, i.e. from pos to new_pos or from end to new_pos.
+
+        # The invariance of the update is that:
+        #   sum_left[k] = sum_{i in Left} w[i] rho[i, k]
+        #   var_left[k] = sum_{i in Left} w[i] pointJ[i, k, k]
+        # and similarly for the right child. Notably, the second is un-normalized,
+        # so to be used for further calculations it needs to be normalized by the child weight.
+        if (new_pos - pos) <= (end - new_pos):
+            for p in range(pos, new_pos):
+                i = self.sample_indices[p]
+
+                if self.sample_weight is not None:
+                    w = self.sample_weight[i]
+
+                for k in range(self.n_outputs):
+                    # we add w[i] * rho[i, k] to sum_left[k]
+                    self.sum_left[k] += w * self.rho[i, k]
+                    # we add w[i] * J[i, k, k] to var_left[k]
+                    # self.var_left[k] += w * self.pointJ[i, k, k]
+
+                self.weighted_n_left += w
+        else:
+            self.reverse_reset()
+
+            for p in range(end - 1, new_pos - 1, -1):
+                i = self.sample_indices[p]
+
+                if self.sample_weight is not None:
+                    w = self.sample_weight[i]
+
+                for k in range(self.n_outputs):
+                    # we subtract w[i] * rho[i, k] from sum_left[k]
+                    self.sum_left[k] -= w * self.rho[i, k]
+                    # we subtract w[i] * J[i, k, k] from var_left[k]
+                    # self.var_left[k] -= w * self.pointJ[i, k, k]
+
+                self.weighted_n_left -= w
+
+        self.weighted_n_right = (self.weighted_n_node_samples -
+                                 self.weighted_n_left)
+        for k in range(self.n_outputs):
+            self.sum_right[k] = self.sum_total[k] - self.sum_left[k]
+            # self.var_right[k] = self.var_total[k] - self.var_left[k]
+
+        self.pos = new_pos
+        return 0
+
+
+    cdef void node_value(self, double* dest) nogil:
+        """Return the estimated node parameter of samples[start:end] into dest."""
+        memcpy(dest, &self.parameter[0], self.n_outputs * sizeof(double))
+
+    cdef void node_jacobian(self, double* dest) nogil:
+        """Return the node normalized Jacobian of samples[start:end] into dest in a C contiguous format."""
+        cdef SIZE_t i, j
+        # Jacobian is stored in f-contiguous format for fortran. We translate it to c-contiguous for
+        # user interfacing. Moreover, we normalize by weight(node).
+        cdef SIZE_t n_outputs = self.n_outputs
+        for i in range(n_outputs):
+            for j in range(n_outputs):
+                dest[i * n_outputs + j] = self.J[i, j] / self.weighted_n_node_samples
+
+    cdef void node_precond(self, double* dest) nogil:
+        """Return the normalized node preconditioned value of samples[start:end] into dest."""
+        cdef SIZE_t i
+        for i in range(self.n_outputs):
+            dest[i] = self.parameter_pre[i] / self.weighted_n_node_samples
+
+    cdef double min_eig_left(self) nogil:
+        """Calculate a proxy for the minimum eigenvalue of the jacobian of the left child.
+
+        Here we simply use the minimum absolute value of the diagonals of the jacobian.
+        This proxy needs to be super fast as this calculation happens for every candidate split,
+        so we cannot afford anything that is not a very simple calculation. We tried a power
+        iteration approximation algorithm implemented in `_utils.fast_min_eigv_()`, but even
+        such a power iteration algorithm is slow as it requires calculating a pseudo-inverse first.
+        """
+        cdef int i
+        cdef double min, abs
+        min = fabs(self.var_left[0])
+        for i in range(self.n_outputs):
+            abs = fabs(self.var_left[i])
+            if abs < min:
+                min = abs
+        # The `update()` method maintains that var_left[k] = J_left[k, k], where J_left is the
+        # un-normalized jacobian of the left child. Thus we normalize by weight(left)
+        return min / self.weighted_n_left
+
+    cdef double min_eig_right(self) nogil:
+        """Calculate a proxy for the minimum eigenvalue of the jacobian of the right child
+        (see min_eig_left for more details).
+ """ + cdef int i + cdef double min, abs + min = fabs(self.var_right[0]) + for i in range(self.n_outputs): + abs = fabs(self.var_right[i]) + if abs < min: + min = abs + return min / self.weighted_n_right + + +cdef void _fast_invJ(DOUBLE_t* J, DOUBLE_t* invJ, SIZE_t n, double clip) nogil: + """Fast inversion of a 2x2 array.""" + cdef double det + if n == 1: + invJ[0] = 1.0 / J[0] if fabs(J[0]) >= clip else 1/clip # Explicit inverse calculation + elif n == 2: + # Explicit inverse calculation + det = J[0] * J[3] - J[1] * J[2] + if fabs(det) < clip: + det = clip + invJ[0] = J[3] / det + invJ[1] = - J[1] / det + invJ[2] = - J[2] / det + invJ[3] = J[0] / det diff --git a/sktree/causal/_splitter.pxd b/sktree/causal/_splitter.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/sktree/causal/_splitter.pyx b/sktree/causal/_splitter.pyx new file mode 100644 index 000000000..e69de29bb diff --git a/sktree/causal/_utils.pxd b/sktree/causal/_utils.pxd new file mode 100644 index 000000000..55808d76e --- /dev/null +++ b/sktree/causal/_utils.pxd @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import numpy as np +cimport numpy as np + +ctypedef np.npy_float64 DTYPE_t # Type of X +ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight +ctypedef np.npy_intp SIZE_t # Type for indices and counters +ctypedef np.npy_int32 INT32_t # Signed 32 bit integer +ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer + +cpdef bint matinv(DOUBLE_t[::1, :] a, DOUBLE_t[::1, :] inv_a) nogil + +cdef bint matinv_(DOUBLE_t* a, DOUBLE_t* inv_a, int m) nogil + +cpdef void lstsq(DOUBLE_t[::1,:] a, DOUBLE_t[::1,:] b, DOUBLE_t[::1, :] sol, bint copy_b=*) nogil + +cdef void lstsq_(DOUBLE_t* a, DOUBLE_t* b, DOUBLE_t* sol, int m, int n, int ldb, int nrhs, bint copy_b=*) nogil + +cpdef void pinv(DOUBLE_t[::1,:] a, DOUBLE_t[::1, :] sol) nogil + +cdef void pinv_(DOUBLE_t* a, DOUBLE_t* sol, int m, int n) nogil + +cpdef double fast_max_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil + +cdef double fast_max_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil + +cpdef double fast_min_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil + +cdef double fast_min_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil \ No newline at end of file diff --git a/sktree/causal/_utils.pyx b/sktree/causal/_utils.pyx new file mode 100644 index 000000000..ac536fbbc --- /dev/null +++ b/sktree/causal/_utils.pyx @@ -0,0 +1,278 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +from libc.stdlib cimport free +from libc.stdlib cimport malloc +from libc.stdlib cimport calloc +from libc.stdlib cimport realloc +from libc.string cimport memcpy +from libc.math cimport log as ln +from libc.stdlib cimport abort + +from scipy.linalg.cython_lapack cimport dgelsy, dgetrf, dgetri, dgecon, dlacpy, dlange + +import numpy as np +cimport numpy as np +np.import_array() + +from sklearn.tree._utils cimport rand_int + + +rcond_ = np.finfo(np.float64).eps +cdef inline double RCOND = rcond_ + + +# ============================================================================= +# Linear Algebra Functions +# ============================================================================= + + +cpdef bint matinv(DOUBLE_t[::1, :] a, DOUBLE_t[::1, :] inv_a) nogil: + """ Compute matrix inverse and store it in inv_a. 
+ """ + cdef int m, n + m = a.shape[0] + if not (m == a.shape[1]): + raise ValueError("Can only invert square matrices!") + return matinv_(&a[0, 0], &inv_a[0, 0], m) + +cdef bint matinv_(DOUBLE_t* a, DOUBLE_t* inv_a, int m) nogil: + """ Compute matrix inverse of matrix a of size (m, m) and store it in inv_a. + """ + cdef: + int* pivot + DOUBLE_t* work + int lda, INFO, Lwork + bint failed + + lda = m + Lwork = m**2 + pivot = malloc(m * sizeof(int)) + work = malloc(Lwork * sizeof(DOUBLE_t)) + failed = False + if (pivot==NULL or work==NULL): + with gil: + raise MemoryError() + + try: + memcpy(inv_a, a, m * m * sizeof(DOUBLE_t)) + + #Conduct the LU factorization of the array a + dgetrf(&m, &m, inv_a, &lda, pivot, &INFO) + if not (INFO == 0): + failed = True + else: + #Now use the LU factorization and the pivot information to invert + dgetri(&m, inv_a, &lda, pivot, work, &Lwork, &INFO) + if not (INFO == 0): + failed = True + finally: + free(pivot) + free(work) + + return (not failed) + + + +cpdef void lstsq(DOUBLE_t[::1, :] a, DOUBLE_t[::1, :] b, DOUBLE_t[::1, :] sol, bint copy_b=True) nogil: + """ Compute solution to least squares problem min ||b - a sol||_2^2, + where a is a matrix of size (m, n), b is (m, nrhs). Store (n, nrhs) solution in sol. + The memory view b, must have at least max(m, n) rows. If m < n, then pad remainder with zeros. + If copy_b=True, then b is left unaltered on output. Otherwise b is altered by this call. + """ + cdef int m, n, nrhs + m = a.shape[0] + n = a.shape[1] + nrhs = b.shape[1] + ldb = b.shape[0] + if ldb < max(m, n): + with gil: + raise ValueError("Matrix b must have first dimension at least max(a.shape[0], a.shape[1]). " + "Please pad with zeros.") + if (sol.shape[0] != n) or (sol.shape[1] != nrhs): + with gil: + raise ValueError("Matrix sol must have dimensions (a.shape[1], b.shape[1]).") + lstsq_(&a[0, 0], &b[0, 0], &sol[0, 0], m, n, ldb, nrhs, copy_b) + + +cdef void lstsq_(DOUBLE_t* a, DOUBLE_t* b, DOUBLE_t* sol, int m, int n, int ldb, int nrhs, bint copy_b=True) nogil: + """ Compute solution to least squares problem min ||b - a sol||_2^2, + where a is a matrix of size (m, n), b is (m, nrhs). Store (n, nrhs) solution in sol. + The leading (row) dimension b, must be at least max(m, n). If m < n, then pad remainder with zeros. + If copy_b=True, then b is left unaltered on output. Otherwise b is altered by this call. + """ + cdef: + int lda, rank, info, lwork, n_out + double rcond + Py_ssize_t i, j + #array pointers + int* jpvt + double* work + double* b_copy + char* UPLO = 'O' #Any letter other then 'U' or 'L' will copy entire array + lda = m + if ldb < max(m, n): + with gil: + raise ValueError("Matrix b must have dimension at least max(a.shape[0], a.shape[1]). " + "Please pad with zeros.") + rcond = max(m, n) * RCOND + jpvt = calloc(n, sizeof(int)) + lwork = max(min(n, m) + 3 * n + 1, 2 * min(n, m) + nrhs) + work = malloc(lwork * sizeof(DOUBLE_t)) + + # TODO. can we avoid all this malloc and copying in our context? 
+ a_copy = calloc(lda * n, sizeof(DOUBLE_t)) + if copy_b: + b_copy = calloc(ldb * nrhs, sizeof(DOUBLE_t)) + else: + b_copy = b + try: + dlacpy(UPLO, &lda, &n, a, &lda, a_copy, &lda) + if copy_b: + dlacpy(UPLO, &ldb, &nrhs, b, &ldb, b_copy, &ldb) + + dgelsy(&m, &n, &nrhs, a_copy, &lda, b_copy, &ldb, + &jpvt[0], &rcond, &rank, &work[0], &lwork, &info) + + for i in range(n): + for j in range(nrhs): + sol[i + j * n] = b_copy[i + j * ldb] + + finally: + free(jpvt) + free(work) + free(a_copy) + if copy_b: + free(b_copy) + +cpdef void pinv(DOUBLE_t[::1,:] a, DOUBLE_t[::1, :] sol) nogil: + """ Compute pseudo-inverse of (m, n) matrix a and store it in (n, m) matrix sol. + Matrix a is left un-altered by this call. + """ + cdef int m = a.shape[0] + cdef int n = a.shape[1] + pinv_(&a[0, 0], &sol[0, 0], m, n) + +cdef void pinv_(DOUBLE_t* a, DOUBLE_t* sol, int m, int n) nogil: + """ Compute pseudo-inverse of (m, n) matrix a and store it in (n, m) matrix sol. + Matrix a is left un-altered by this call. + """ + # TODO. can we avoid this mallon in our context. Maybe create some fixed memory allocations? + cdef int ldb = max(m, n) + cdef double* b = calloc(ldb * m, sizeof(double)) + cdef Py_ssize_t i + for i in range(m): + b[i + i * ldb] = 1.0 + try: + lstsq_(a, b, sol, m, n, ldb, m, copy_b=False) + + finally: + free(b) + + +cpdef double fast_max_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil: + """ Calculate approximation of maximum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state + """ + return fast_max_eigv_(&A[0, 0], A.shape[0], reps, &random_state) + +cdef double fast_max_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil: + """ Calculate approximation of maximum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state + """ + cdef int t, i, j + cdef double normx, Anormx + cdef double* xnew + cdef double* xold + cdef double* temp + xnew = NULL + xold = NULL + + try: + xnew = calloc(n, sizeof(double)) + xold = calloc(n, sizeof(double)) + + if xnew == NULL or xold == NULL: + with gil: + raise MemoryError() + for i in range(n): + xold[i] = (1 - 2*rand_int(0, 2, random_state)) + for t in range(reps): + for i in range(n): + xnew[i] = 0 + for j in range(n): + xnew[i] += A[i + j * n] * xold[j] + temp = xold + xold = xnew + xnew = temp + normx = 0 + Anormx = 0 + for i in range(n): + normx += xnew[i] * xnew[i] + for j in range(n): + Anormx += xnew[i] * A[i + j * n] * xnew[j] + + return Anormx / normx + finally: + free(xnew) + free(xold) + + +cpdef double fast_min_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil: + """ Calculate approximation of minimum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state + """ + return fast_min_eigv_(&A[0, 0], A.shape[0], reps, &random_state) + +cdef double fast_min_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil: + """ Calculate approximation of minimum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state. 
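As an aside, a pure-NumPy sketch of the randomized power iteration used by `fast_max_eigv_`; the minimum-eigenvalue variant in `fast_min_eigv_` replaces the matrix-vector product with a least-squares solve against `A`. The function name below is hypothetical.

```python
import numpy as np

def power_iteration_max_eig(A, reps=10, seed=123):
    """Rayleigh-quotient estimate of the largest eigenvalue of A (NumPy analogue)."""
    rng = np.random.default_rng(seed)
    n = A.shape[0]
    x = rng.choice([-1.0, 1.0], size=n)   # random +/-1 start, as in fast_max_eigv_
    for _ in range(reps):
        x = A @ x                         # min-eig variant: x = solve(A, x) instead
    return (x @ A @ x) / (x @ x)
```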
+ """ + cdef int t, i, j + cdef double normx, Anormx + cdef double* xnew + cdef double* xold + cdef double* temp + cdef double* update + xnew = NULL + xold = NULL + + try: + xnew = calloc(n, sizeof(double)) + xold = calloc(n, sizeof(double)) + update = calloc(n, sizeof(double)) + + if xnew == NULL or xold == NULL or update == NULL: + with gil: + raise MemoryError() + for i in range(n): + xold[i] = (1 - 2*rand_int(0, 2, random_state)) + for t in range(reps): + lstsq_(A, xold, update, n, n, n, 1, copy_b=False) + for i in range(n): + xnew[i] = 0 + for j in range(n): + xnew[i] += update[i] + temp = xold + xold = xnew + xnew = temp + normx = 0 + Anormx = 0 + for i in range(n): + normx += xnew[i] * xnew[i] + for j in range(n): + Anormx += xnew[i] * A[i + j * n] * xnew[j] + + return Anormx / normx + finally: + free(xnew) + free(xold) + free(update) diff --git a/sktree/causal/base.py b/sktree/causal/base.py new file mode 100644 index 000000000..821f6d6e0 --- /dev/null +++ b/sktree/causal/base.py @@ -0,0 +1,7 @@ +# XXX: trying to sketch out a base causal API that extends "BaseEstimator" of sklearn +class BaseCausal: + def conditional_effect(self, X=None, *, T0, T1): + pass + + def marginal_effect(self, T, X=None): + pass diff --git a/sktree/causal/meson.build b/sktree/causal/meson.build new file mode 100644 index 000000000..784ca03f3 --- /dev/null +++ b/sktree/causal/meson.build @@ -0,0 +1,29 @@ +extensions = [ + '_grf_criterion', + '_splitter', + '_utils', +] + +foreach ext: extensions + py3.extension_module(ext, + cython_gen_cpp.process(ext + '.pyx'), + c_args: cython_c_args, + include_directories: [incdir_numpy], + install: true, + subdir: 'sktree/causal', + ) +endforeach + +python_sources = [ + '__init__.py', + 'tree.py', + 'meta.py', +] + +py3.install_sources( + python_sources, + subdir: 'sktree/causal' # Folder relative to site-packages to install to +) + +# TODO: comment in if we include tests +subdir('tests') \ No newline at end of file diff --git a/sktree/causal/meta.py b/sktree/causal/meta.py new file mode 100644 index 000000000..300dc5d98 --- /dev/null +++ b/sktree/causal/meta.py @@ -0,0 +1,399 @@ +import numbers +from itertools import product + +import numpy as np +from joblib import Parallel, delayed +from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier +from sklearn.linear_model import LassoCV, LogisticRegressionCV +from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, check_cv +from sklearn.utils import check_random_state +from sklearn.utils.validation import _check_fit_params, check_is_fitted, indexable + + +def _parallel_crossfit_nuisance( + estimator_treatment, + estimator_outcome, + X, + y, + t, + sample_weight, + train, + test, + verbose, + split_progress=None, # (split_idx, n_splits), +): + """Crossfitting nuisance functions in parallel. + + Parameters + ---------- + estimator_treatment : estimator object implementing 'fit' and 'predict' + The object to use to fit the data. + estimator_outcome : estimator object implementing 'fit' and 'predict' + The object to use to fit the data. + X : array-like of shape (n_samples, n_features) + The covariate data to fit. + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The outcome variable. + t : array-like of shape (n_samples,) or (n_samples, n_treatment_dims) or None + The treatment variable. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. 
Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + train : array-like of shape (n_train_samples,) + Indices of training samples. + test : array-like of shape (n_test_samples,) + Indices of test samples. + verbose : int + The verbosity level. + split_progress : {list, tuple} of int, default=None + A list or tuple of format (, ). + + Returns + ------- + result : dict with the following attributes + fit_time : float + Time spent for fitting in seconds. + score_time : float + Time spent for scoring in seconds. + estimator_outcome : estimator object + The fitted estimator. + estimator_treatment : estimator object + The fitted estimator. + fit_error : str or None + Traceback str if the fit failed, None if the fit succeeded. + test_idx : array-like of shape (n_test_samples,) + The indices of the test samples. + y_residuals : array-like of shape (n_test_samples,) + The residuals of ``y_test - estimator_outcome.predict(X_test)``. + t_residuals : array-like of shape (n_test_samples,) + The residuals of ``t_test - estimator_treatment.predict(X_test)``. + """ + progress_msg = "" + if verbose > 2: + if split_progress is not None: + progress_msg = f" {split_progress[0]+1}/{split_progress[1]}" + + if verbose > 1: + params_msg = "" + + if verbose > 9: + start_msg = f"[CV{progress_msg}] START {params_msg}" + print(f"{start_msg}{(80 - len(start_msg)) * '.'}") + + # fit the nuisance functions on the + X_train, X_test = X[train, :], X[test, :] + y_train, y_test = y[train, :], y[test, :] + t_train, t_test = t[train, :], t[test, :] + + y_pred = estimator_outcome.fit(X_train, y_train, sample_weight=sample_weight).predict(X_test) + t_pred = estimator_treatment.fit(X_train, t_train, sample_weight=sample_weight).predict(X_test) + + # compute the resulting residuals + y_residuals = y_test - y_pred + t_residuals = t_test - t_pred + + result = dict() + result["estimator_outcome"] = estimator_outcome + result["estimator_treatment"] = estimator_treatment + result["y_residuals"] = y_residuals + result["t_residuals"] = t_residuals + + return result + + +def check_outcome_estimator(estimator_outcome, y, random_state): + """Check outcome estimator and error-check. + + Parameters + ---------- + estimator_outcome : estimator object + The outcome estimator model. If None, then LassoCV will be used. + y : array-like of shape (n_samples, n_output) + The outcomes array. + random_state : int + Random seed. + + Returns + ------- + estimator_outcome : estimator object + The cloned outcome estimator model. + """ + if estimator_outcome is None: + return LassoCV(random_state=random_state) + else: + if not hasattr(estimator_outcome, "fit") or not hasattr(estimator_outcome, "predict"): + raise RuntimeError( + "All outcome estimator models must have the `fit` and `predict` functions implemented." + ) + + estimator_outcome = clone(estimator_outcome) + # XXX: run some checks on y being compatible with the estimator + if is_classifier(estimator_outcome) and not issubclass(y.type, numbers.Integral): + raise RuntimeError( + "Treatment array is not integers, but the treatment estimator is classification based. " + "If treatment array is discrete, then treatment estimator should be a classifier. " + "If treatment array is continuous, thent treatment estimator should be a regressor." 
+ ) + return estimator_outcome + + +def check_treatment_estimator(estimator_treatment, t, random_state): + """Check treatment estimator and error-check. + + Parameters + ---------- + estimator_treatment : estimator object + The treatment estimator model. If None, then LogisticRegressionCV will be used. + t : array-like of shape (n_samples, n_treatment_dims) + The treatment array. + random_state : int + Random seed. + + Returns + ------- + estimator_treatment : estimator object + The cloned treatment estimator model. + """ + if estimator_treatment is None: + estimator_treatment = LogisticRegressionCV(random_state=random_state) + else: + if not hasattr(estimator_treatment, "fit") or not hasattr(estimator_treatment, "predict"): + raise RuntimeError( + "All treatment estimator models must have the `fit` and `predict` functions implemented." + ) + + estimator_treatment = clone(estimator_treatment) + + # XXX: run some checks on t being compatible with the estimator. i.e. discrete -> classifier, otw regressor + if is_classifier(estimator_treatment) and not issubclass(t.type, numbers.Integral): + raise RuntimeError( + "Treatment array is not integers, but the treatment estimator is classification based. " + "If treatment array is discrete, then treatment estimator should be a classifier. " + "If treatment array is continuous, thent treatment estimator should be a regressor." + ) + + return estimator_treatment + + +class DML(MetaEstimatorMixin, BaseEstimator): + """A meta-estimator for performining double machine-learning. + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Estimator needs to provide a ``fit``, and ``predict_prob`` function. + estimator_outcome : estimator object, optional + The estimator model for the outcome, by default :class:`sklearn.linear_model.LassoCV`. + This is assumed to implement the scikit-learn estimator interface. Estimator needs + to provide a ``fit``, and ``predict_prob`` function. For more details, see Notes. + estimator_treatment : estimator object, optional + The estimator model for the treatment, by default :class:`sklearn.linear_model.LogisticRegressionCV`. + This is assumed to implement the scikit-learn estimator interface. Estimator needs to + provide a ``fit``, and ``predict_prob`` function. For more details, see Notes. + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + n_jobs : int, default=None + Number of jobs to run in parallel. 
Training the estimator and computing + the score are parallelized over the different training and test sets. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + estimator_outcome_ : list of estimator object + Fitted nuisance estimators of the outcome. + estimator_treatment_ : list of estimator object + Fitted nuisance estimators of the treatment. + estimator_final_ : estimator object + Fitted final-stage estimator. + + Notes + ----- + The CATE is modeled using the following equation: + + .. math :: + Y - \\E[Y | X, W] = \\Theta(X) \\cdot (T - \\E[T | X, W]) + \\epsilon + + where ``Y`` are the outcomes, ``X`` are the observed covariates that confound treatment, ``T`` + and outcomes. ``W`` are observed covariates that act as controls. :math:`\\epsilon`` is the noise + term of the data-generating model. The CATE is defined as :math:`Theta(X)`. + + Double machine learning (DML) follows a two stage process, where a set of nuisance functions + are estimated in the first stage in a crossfitting manner and a final stage estimates the CATE + model :footcite:`chernozhukov2018double`. See the User-guide for a description of two-stage learners, specifically on orthogonal + learning and doubly-robust learning. The two-stage procedure of DML follows the two steps: + + **1. Estimate nuisance functions** + There are two nuisance functions that need to be estimated, :math:`q(X, W) = E[Y | X, W]` and + :math:`f(X, W) = E[T | X, W]`, which are the models for the outcome and treatment + respectively. Here, one needs to run a regression (or classification model if treatments/outcomes + are discrete), that predicts outcomes and treatments given the covariates and control features. + + **2. Final stage regression of residuals on residuals** + Once :math:`q(X, W)` and :math:`f(X, W)` are estimated in the first stage, we compute + the residuals of the models with their observed outcome and treatment samples. + + .. math :: + \\tilde{Y} = Y - q(X, W) + \\tilde{T} = T - f(X, W) + + and then we fit a final stage model that takes the covariates, :math:`X` and :math:`\\tilde{T}`. That is + it uses the residuals of the treatment model and the covariates to predict the residuals of the outcome. + The fitted model represents the CATE. + + .. math :: + \\hat{\\theta} = \\arg\\min_{\\Theta}\ + \\E_n\\left[ (\\tilde{Y} - \\Theta(X) \\cdot \\tilde{T})^2 \\right] + + In the DML process, ``T`` and ``Y`` can be either discrete, or continuous and the resulting process will + be valid. For each stage of the DML process, one can use arbitrary sklearn-like estimators. + + **Estimation procedure with cross-fitting and cross-validation** + + In order to prevent overfitting to training data, the nuisance functions are estimated using training data + and the nuisance values (i.e. the residuals) are computed using the testing data. This is done on K-folds of + data, so the final residuals are computed from multiple model instantiations. If there is no cross-fitting, + then the nuisance functions are estimated and evaluated on the entire dataset. If there is at least K-fold + cross-fitting, then the data is split into K distinct groups and for each group, the model is trained on + the rest of the data, and evaluated on the held-out test group. 
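As an illustrative aside, the two-stage residual-on-residual procedure described above can be sketched with plain scikit-learn pieces; this is not the `DML` class itself, and the data-generating process (a constant effect of 2.0 with a continuous treatment) is made up for the example.

```python
import numpy as np
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.model_selection import cross_val_predict

rng = np.random.default_rng(0)
n = 500
X = rng.normal(size=(n, 5))
T = X[:, 0] + rng.normal(size=n)               # continuous treatment confounded by X
theta = 2.0                                    # constant treatment effect
Y = theta * T + X[:, 0] + rng.normal(size=n)

# Stage 1: cross-fitted nuisance estimates q(X) = E[Y | X] and f(X) = E[T | X]
y_res = Y - cross_val_predict(LassoCV(), X, Y, cv=5)
t_res = T - cross_val_predict(LassoCV(), X, T, cv=5)

# Stage 2: regress outcome residuals on treatment residuals to recover the effect
effect = LinearRegression(fit_intercept=False).fit(t_res.reshape(-1, 1), y_res)
print(effect.coef_)   # approximately [2.0]
```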
+ + The final model then takes as input the residuals of outcome, residuals of treatment and covariates to + estimate the CATE. + + References + ---------- + .. footbibliography:: + """ + + def __init__( + self, + estimator, + estimator_outcome=None, + estimator_treatment=None, + cv=None, + random_state=None, + n_jobs=None, + verbose=False, + ) -> None: + super().__init__() + self.estimator = estimator + self.estimator_outcome = estimator_outcome + self.estimator_treatment = estimator_treatment + self.cv = cv + self.n_jobs = n_jobs + self.random_state = random_state + self.verbose = verbose + + def fit(self, X, y, *, t=None, groups=None, **fit_params): + self._validate_params() + random_state = check_random_state(self.random_state) + + # run checks on input data + X, y = self._validate_data(X, y, multi_output=True, accept_sparse=False) + self._validate_data(y=t, multi_output=True, accept_sparse=False) + X, y, t, groups = indexable(X, y, t, groups) + fit_params = _check_fit_params(X, fit_params) + sample_weight = fit_params.get("sample_weight", None) + + # Determine output settings + _, self.n_features_in_ = X.shape + + # get cross-validation object + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) + n_splits = cv.get_n_splits(X, y, groups) + + # XXX: this should not be required. If the user wants, they can pass in + # a CV object + if cv != self.cv and isinstance(cv, (KFold, StratifiedKFold)): + cv.shuffle = True + cv.random_state = random_state + + final_stage_estimator = clone(self.estimator) + + # initialize the models for treatment and outcome + estimator_outcome = check_outcome_estimator( + estimator_outcome=self.estimator_outcome, y=y, random_state=random_state + ) + estimator_treatment = check_treatment_estimator( + estimator_treatment=self.estimator_treatment, t=t, random_state=random_state + ) + + # fit the nuisance functions in parallel + parallel = Parallel(n_jobs=self.n_jobs) + with parallel: + out = parallel( + delayed(_parallel_crossfit_nuisance)( + estimator_treatment, + estimator_outcome, + final_stage_estimator, + X=X, + y=y, + t=t, + sample_weight=sample_weight, + train=train, + test=test, + verbose=self.verbose, + split_progress=(split_idx, n_splits), + ) + for (split_idx, (train, test)) in enumerate(cv.split(X, y, groups)) + ) + + y_residuals = np.zeros(y.shape) + t_residuals = np.zeros(t.shape) + + for result in out: + test = result["test_idx"] + y_residuals[test, :] = result["y_residuals"] + t_residuals[test, :] = result["t_residuals"] + + # now fit the final stage estimator + final_stage_estimator.fit(X=X, y=y_residuals, t=t_residuals, **fit_params) + + def predict(self, X): + """Predict the conditional average treatment effect given covariates X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data. 
+ """ + pass + + def conditional_effect(self, X, t=None): + """Compute conditional effect of + + Parameters + ---------- + X : _type_ + _description_ + t : _type_, optional + _description_, by default None + """ + pass diff --git a/sktree/causal/tests/test_causal.py b/sktree/causal/tests/test_causal.py new file mode 100644 index 000000000..06a83b2f6 --- /dev/null +++ b/sktree/causal/tests/test_causal.py @@ -0,0 +1,20 @@ +import joblib +import numpy as np +import pytest +from numpy.testing import assert_almost_equal, assert_array_almost_equal, assert_array_equal +from sklearn import datasets +from sklearn.utils.estimator_checks import parametrize_with_checks + +from sktree.causal import CausalTree + + +@parametrize_with_checks( + [ + CausalTree(random_state=12), + ] +) +def test_sklearn_compatible_estimator(estimator, check): + # TODO: remove when we implement Regressor classes + if check.func.__name__ in ["check_requires_y_none"]: + pytest.skip() + check(estimator) diff --git a/sktree/causal/tests/test_utils.py b/sktree/causal/tests/test_utils.py new file mode 100644 index 000000000..b886f3bad --- /dev/null +++ b/sktree/causal/tests/test_utils.py @@ -0,0 +1,46 @@ +import numpy as np + +from sktree.causal._utils import matinv, lstsq, pinv, fast_max_eigv, fast_min_eigv + + +def test_fast_eigv(self): + rng = np.random.default_rng(123) + + n = 4 + for _ in range(10): + A = rng.normal(0, 1, size=(n, n)) + A = np.asfortranarray(A @ A.T) + apx = fast_min_eigv(A, 5, 123) + opt = np.min(np.linalg.eig(A)[0]) + np.testing.assert_allclose(apx, opt, atol=.01, rtol=.3) + apx = fast_max_eigv(A, 10, 123) + opt = np.max(np.linalg.eig(A)[0]) + np.testing.assert_allclose(apx, opt, atol=.5, rtol=.2) + + +def test_linalg(): + rng = np.random.default_rng(1234) + + for n, m, nrhs in [(3, 3, 3), (3, 2, 1), (3, 1, 2), (1, 4, 2), (3, 4, 5)]: + for _ in range(100): + A = rng.normal(0, 1, size=(n, m)) + y = rng.normal(0, 1, size=(n, nrhs)) + yf = y + if m > n: + yf = np.zeros((m, nrhs)) + yf[:n] = y + ours = np.asfortranarray(np.zeros((m, nrhs))) + lstsq(np.asfortranarray(A), np.asfortranarray(yf.copy()), ours, copy_b=True) + true = np.linalg.lstsq(A, y, rcond=np.finfo(np.float64).eps * max(n, m))[0] + np.testing.assert_allclose(ours, true, atol=.00001, rtol=.0) + + ours = np.asfortranarray(np.zeros(A.T.shape, dtype=np.float64)) + pinv(np.asfortranarray(A), ours) + true = np.linalg.pinv(A) + np.testing.assert_allclose(ours, true, atol=.00001, rtol=.0) + + if n == m: + ours = np.asfortranarray(np.zeros(A.T.shape, dtype=np.float64)) + matinv(np.asfortranarray(A), ours) + true = np.linalg.inv(A) + np.testing.assert_allclose(ours, true, atol=.00001, rtol=.0) diff --git a/sktree/causal/tree.py b/sktree/causal/tree.py new file mode 100644 index 000000000..f629523e9 --- /dev/null +++ b/sktree/causal/tree.py @@ -0,0 +1,391 @@ +from sklearn.base import BaseEstimator +from sklearn.ensemble._forest import BaseForest, ForestRegressor +from sklearn.tree import BaseDecisionTree, DecisionTreeClassifier, DecisionTreeRegressor + + +class CausalTree(BaseEstimator): + def __init__( + self, + tree=None, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + class_weight=None, + ccp_alpha=0.0, + honest_ratio=0.5, + **tree_params, + ) -> None: + """A causal tree. 
+ + A causal tree is a decision tree model that operates over not only + ``(X, y)`` tuple of arrays, but also includes the treatment groups. + It is an estimator model that has a graphical model of the form + :math:`T \\rightarrow Y` with :math:`T \\leftarrow X \\rightarrow Y`. + + Parameters + ---------- + tree : _type_, optional + _description_, by default None + criterion : str, optional + _description_, by default "gini" + splitter : str, optional + _description_, by default "best" + max_depth : _type_, optional + _description_, by default None + min_samples_split : int, optional + _description_, by default 2 + min_samples_leaf : int, optional + _description_, by default 1 + min_weight_fraction_leaf : float, optional + _description_, by default 0.0 + max_features : _type_, optional + _description_, by default None + random_state : _type_, optional + _description_, by default None + max_leaf_nodes : _type_, optional + _description_, by default None + min_impurity_decrease : float, optional + _description_, by default 0.0 + class_weight : _type_, optional + _description_, by default None + ccp_alpha : float, optional + _description_, by default 0.0 + honest_ratio : float, optional + _description_, by default 0.5 + + Notes + ----- + Compared to a normal decision tree model, which estimates the value :math:`E[Y | X]`, + a causal decision tree splits the data in such a way to estimate the value + :math:`E[Y | do(T), X]`. This is a causal effect, and thus requires causal assumptions. + The core assumption is that of unconfoundedness, where the outcomes, ``Y``, and the + treatments, ``T`` are not confounded by any other non-observed variables. That is ``X`` + captures all possible confounders. This is typically a strong assumption, but is realized + in well-designed observational studies and randomized control trials, where the treatment + assignment is randomized. + """ + self.tree = tree + self.criterion = criterion + self.splitter = splitter + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.class_weight = class_weight + self.ccp_alpha = ccp_alpha + self.honest_ratio = honest_ratio + self.tree_params = tree_params + + def fit(self, X, y, sample_weight=None, T=None): + # initialize the tree estimator + if self.tree is None: + self.estimator_ = DecisionTreeClassifier( + criterion=self.criterion, + splitter=self.splitter, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + random_state=self.random_state, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + class_weight=self.class_weight, + ccp_alpha=self.ccp_alpha, + honest_ratio=1.0, + **self.tree_params, + ) + else: + if not issubclass(self.tree, BaseDecisionTree): + raise RuntimeError( + "The type of tree must be a subclass of sklearn.tree.BaseDecisionTree." 
+                )
+
+            self.estimator_ = self.tree(
+                criterion=self.criterion,
+                splitter=self.splitter,
+                max_depth=self.max_depth,
+                min_samples_split=self.min_samples_split,
+                min_samples_leaf=self.min_samples_leaf,
+                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
+                max_features=self.max_features,
+                random_state=self.random_state,
+                max_leaf_nodes=self.max_leaf_nodes,
+                min_impurity_decrease=self.min_impurity_decrease,
+                class_weight=self.class_weight,
+                ccp_alpha=self.ccp_alpha,
+                honest_ratio=1.0,
+                **self.tree_params,
+            )
+
+        # fit the decision tree using only the treatment groups
+        self.estimator_.fit(X, y=T, sample_weight=sample_weight)
+
+        # re-fit the tree leaves using the outcomes
+        refit_leaves(self.estimator_, X, y)
+
+        return self
+
+    def apply(self, X):
+        return self.estimator_.apply(X)
+
+    def predict(self, X):
+        return self.estimator_.predict(X)
+
+    def predict_proba(self, X):
+        return self.estimator_.predict_proba(X)
+
+    def decision_path(self, X):
+        return self.estimator_.decision_path(X)
+
+    @property
+    def feature_importances_(self):
+        """Return the feature importances.
+
+        The importance of a feature is computed as the (normalized) total
+        reduction of the criterion brought by that feature.
+        It is also known as the Gini importance.
+
+        Warning: impurity-based feature importances can be misleading for
+        high cardinality features (many unique values). See
+        :func:`sklearn.inspection.permutation_importance` as an alternative.
+
+        Returns
+        -------
+        feature_importances_ : ndarray of shape (n_features,)
+            Normalized total reduction of criteria by feature
+            (Gini importance).
+        """
+        return self.estimator_.feature_importances_
+
+
+class ForestCausalRegressor(BaseForest):
+    @abstractmethod
+    def __init__(
+        self,
+        estimator,
+        n_estimators=100,
+        *,
+        estimator_params=tuple(),
+        bootstrap=False,
+        oob_score=False,
+        n_jobs=None,
+        random_state=None,
+        verbose=0,
+        warm_start=False,
+        max_samples=None,
+        base_estimator="deprecated",
+    ):
+        super().__init__(
+            estimator,
+            n_estimators=n_estimators,
+            estimator_params=estimator_params,
+            bootstrap=bootstrap,
+            oob_score=oob_score,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose,
+            warm_start=warm_start,
+            max_samples=max_samples,
+            base_estimator=base_estimator,
+        )
+
+    def predict(self, X):
+        """
+        Predict regression target for X.
+
+        The predicted regression target of an input sample is computed as the
+        mean predicted regression targets of the trees in the forest.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The input samples. Internally, its dtype will be converted to
+            ``dtype=np.float32``. If a sparse matrix is provided, it will be
+            converted into a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
+            The predicted values.
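+
+        Notes
+        -----
+        Per-tree predictions are not stored individually; each tree's
+        prediction is accumulated into a shared output buffer under a lock and
+        the sum is then divided by the number of fitted trees.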
+ """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64) + else: + y_hat = np.zeros((X.shape[0]), dtype=np.float64) + + # Parallel loop + lock = threading.Lock() + Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")( + delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) for e in self.estimators_ + ) + + y_hat /= len(self.estimators_) + + return y_hat + + @staticmethod + def _get_oob_predictions(tree, X): + """Compute the OOB predictions for an individual tree. + + Parameters + ---------- + tree : DecisionTreeRegressor object + A single decision tree regressor. + X : ndarray of shape (n_samples, n_features) + The OOB samples. + + Returns + ------- + y_pred : ndarray of shape (n_samples, 1, n_outputs) + The OOB associated predictions. + """ + y_pred = tree.predict(X, check_input=False) + if y_pred.ndim == 1: + # single output regression + y_pred = y_pred[:, np.newaxis, np.newaxis] + else: + # multioutput regression + y_pred = y_pred[:, np.newaxis, :] + return y_pred + + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + """Compute and set the OOB score and attributes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + scoring_function : callable, default=None + Scoring function for OOB score. Defaults to `r2_score`. + """ + self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze(axis=1) + if self.oob_prediction_.shape[-1] == 1: + # drop the n_outputs axis if there is a single output + self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1) + + if scoring_function is None: + scoring_function = r2_score + + self.oob_score_ = scoring_function(y, self.oob_prediction_) + + def _compute_partial_dependence_recursion(self, grid, target_features): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray of shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_features : ndarray of shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + + Returns + ------- + averaged_predictions : ndarray of shape (n_samples,) + The value of the partial dependence function on each grid point. + """ + grid = np.asarray(grid, dtype=DTYPE, order="C") + averaged_predictions = np.zeros(shape=grid.shape[0], dtype=np.float64, order="C") + + for tree in self.estimators_: + # Note: we don't sum in parallel because the GIL isn't released in + # the fast method. 
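+            # ``compute_partial_dependence`` adds each tree's contribution into
+            # ``averaged_predictions`` in place; the division below turns the
+            # accumulated sum into a forest average.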
+ tree.tree_.compute_partial_dependence(grid, target_features, averaged_predictions) + # Average over the forest + averaged_predictions /= len(self.estimators_) + + return averaged_predictions + + def _more_tags(self): + return {"multilabel": True} + + +# XXX: figure out how to make this either: +# - DML tree +# - DR tree +# - OrthoDML/DR tree +class CausalForest(ForestRegressor): + _parameter_constraints: dict = { + **ForestRegressor._parameter_constraints, + **DecisionTreeRegressor._parameter_constraints, + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + honest_ratio=0.5, + ): + super().__init__( + estimator=CausalTree(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "honest_ratio", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.honest_ratio = honest_ratio diff --git a/sktree/meson.build b/sktree/meson.build index 974e52d1f..de0076761 100644 --- a/sktree/meson.build +++ b/sktree/meson.build @@ -39,16 +39,12 @@ incdir_numpy = run_command(py3, ).stdout().strip() # inc_np = include_directories(incdir_numpy) +# only use non-deprecated Numpy API +numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' +cython_c_args = '-NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' + cc = meson.get_compiler('c') -# Don't use the deprecated NumPy C API. Define this to a fixed version instead of -# NPY_API_VERSION in order not to break compilation for released versions -# when NumPy introduces a new deprecation. Use in a meson.build file:: -# -# py3.extension_module('_name', -# 'source_fname', -# numpy_nodepr_api) -numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_9_API_VERSION' python_sources = [ '__init__.py', @@ -77,6 +73,7 @@ c_undefined_ok = ['-Wno-maybe-uninitialized'] # cython_c_args += ['-Wno-cpp'] # cython_cpp_args = cython_c_args +subdir('causal') subdir('tree') subdir('ensemble') subdir('tests') \ No newline at end of file diff --git a/sktree/tree/meson.build b/sktree/tree/meson.build index 6d2e330e2..4f9228a1c 100644 --- a/sktree/tree/meson.build +++ b/sktree/tree/meson.build @@ -15,13 +15,11 @@ foreach ext: extensions cython_gen_cpp.process(ext + '.pyx'), c_args: cython_c_args, include_directories: [incdir_numpy], - # override_options : ['cython_language=cpp'], install: true, subdir: 'sktree/tree', ) endforeach -# TODO: comment in _classes.py when we have a working Cython unsupervised tree with a Python API python_sources = [ '__init__.py', '_classes.py',