From 85bd5803a3a595860156926a6b0cb7525879e675 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 14 Mar 2023 14:39:37 -0400 Subject: [PATCH 1/4] Adding basic DML causal API Signed-off-by: Adam Li --- .devpy/cmds.py | 3 +- docs/references.bib | 17 +- sktree/causal/__init__.py | 0 sktree/causal/meson.build | 13 + sktree/causal/meta.py | 405 +++++++++++++++++++++++++++++ sktree/causal/tests/test_causal.py | 0 sktree/causal/tree.py | 320 +++++++++++++++++++++++ sktree/meson.build | 13 +- sktree/tree/meson.build | 2 - 9 files changed, 758 insertions(+), 15 deletions(-) create mode 100644 sktree/causal/__init__.py create mode 100644 sktree/causal/meson.build create mode 100644 sktree/causal/meta.py create mode 100644 sktree/causal/tests/test_causal.py create mode 100644 sktree/causal/tree.py diff --git a/.devpy/cmds.py b/.devpy/cmds.py index e250281c3..039bc761e 100644 --- a/.devpy/cmds.py +++ b/.devpy/cmds.py @@ -4,6 +4,7 @@ import click from devpy import util +from devpy.cmds.meson import get_site_packages @click.command() @@ -17,7 +18,7 @@ def docs(build_dir, clean=False): print(f"Removing `{doc_dir}`") shutil.rmtree(doc_dir) - site_path = util.get_site_packages(build_dir) + site_path = get_site_packages() if site_path is None: print("No built scikit-tree found; run `./dev.py build` first.") sys.exit(1) diff --git a/docs/references.bib b/docs/references.bib index daddb32f6..94825f1ec 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -2,10 +2,10 @@ % Try to keep this list in alphabetical order based on citing name @article{Li2019manifold, - title={Manifold Oblique Random Forests: Towards Closing the Gap on Convolutional Deep Networks}, - author={Li, Adam and Perry, Ronan and Huynh, Chester and Tomita, Tyler M and Mehta, Ronak and Arroyo, Jesus and Patsolic, Jesse and Falk, Benjamin and Vogelstein, Joshua T}, - journal={arXiv preprint arXiv:1909.11799}, - year={2019} + title = {Manifold Oblique Random Forests: Towards Closing the Gap on Convolutional Deep Networks}, + author = {Li, Adam and Perry, Ronan and Huynh, Chester and Tomita, Tyler M and Mehta, Ronak and Arroyo, Jesus and Patsolic, Jesse and Falk, Benjamin and Vogelstein, Joshua T}, + journal = {arXiv preprint arXiv:1909.11799}, + year = {2019} } @article{Meghana2019_geodesicrf, @@ -17,4 +17,13 @@ @article{Meghana2019_geodesicrf publisher = {arXiv}, year = {2019}, copyright = {arXiv.org perpetual, non-exclusive license} +} + +% Causal + +@article{chernozhukov2018double, + title = {Double/debiased machine learning for treatment and structural parameters}, + author = {Chernozhukov, Victor and Chetverikov, Denis and Demirer, Mert and Duflo, Esther and Hansen, Christian and Newey, Whitney and Robins, James}, + year = {2018}, + publisher = {Oxford University Press Oxford, UK} } \ No newline at end of file diff --git a/sktree/causal/__init__.py b/sktree/causal/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sktree/causal/meson.build b/sktree/causal/meson.build new file mode 100644 index 000000000..904e85457 --- /dev/null +++ b/sktree/causal/meson.build @@ -0,0 +1,13 @@ +python_sources = [ + '__init__.py', + 'tree.py', + 'meta.py', +] + +py3.install_sources( + python_sources, + subdir: 'sktree/causal' # Folder relative to site-packages to install to +) + +# TODO: comment in if we include tests +subdir('tests') \ No newline at end of file diff --git a/sktree/causal/meta.py b/sktree/causal/meta.py new file mode 100644 index 000000000..af599e445 --- /dev/null +++ b/sktree/causal/meta.py @@ -0,0 +1,405 @@ +from 
itertools import product +import numbers +import numpy as np +from sklearn.base import MetaEstimatorMixin, BaseEstimator, is_classifier, clone +from sklearn.utils import check_random_state +from sklearn.model_selection import GridSearchCV +from sklearn.model_selection import check_cv, KFold, StratifiedKFold +from sklearn.utils.validation import indexable, check_is_fitted, _check_fit_params + +from sklearn.linear_model import LassoCV, LogisticRegressionCV + + +from joblib import Parallel, delayed + +class BaseCausal: + def conditional_effect(self, X=None, *, T0, T1): + pass + + def marginal_effect(self, T, X=None): + pass + + +def _parallel_crossfit_nuisance(estimator_treatment, + estimator_outcome, + X, + y, + t, + sample_weight, + train, + test, + verbose, + split_progress=None, #(split_idx, n_splits), + ): + """Crossfitting nuisance functions in parallel. + + Parameters + ---------- + estimator_treatment : estimator object implementing 'fit' and 'predict' + The object to use to fit the data. + estimator_outcome : estimator object implementing 'fit' and 'predict' + The object to use to fit the data. + X : array-like of shape (n_samples, n_features) + The covariate data to fit. + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The outcome variable. + t : array-like of shape (n_samples,) or (n_samples, n_treatment_dims) or None + The treatment variable. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + train : array-like of shape (n_train_samples,) + Indices of training samples. + test : array-like of shape (n_test_samples,) + Indices of test samples. + verbose : int + The verbosity level. + split_progress : {list, tuple} of int, default=None + A list or tuple of format (, ). + + Returns + ------- + result : dict with the following attributes + fit_time : float + Time spent for fitting in seconds. + score_time : float + Time spent for scoring in seconds. + estimator_outcome : estimator object + The fitted estimator. + estimator_treatment : estimator object + The fitted estimator. + fit_error : str or None + Traceback str if the fit failed, None if the fit succeeded. + test_idx : array-like of shape (n_test_samples,) + The indices of the test samples. + y_residuals : array-like of shape (n_test_samples,) + The residuals of ``y_test - estimator_outcome.predict(X_test)``. + t_residuals : array-like of shape (n_test_samples,) + The residuals of ``t_test - estimator_treatment.predict(X_test)``. 
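For reference, a minimal scikit-learn sketch of what a single cross-fit split computes, assuming a continuous outcome, a binary treatment, and the default ``LassoCV``/``LogisticRegressionCV`` nuisance models (the toy data and variable names are illustrative only; the helper uses ``predict`` for the treatment model, while ``predict_proba`` would give the propensity instead)::

    import numpy as np
    from sklearn.linear_model import LassoCV, LogisticRegressionCV
    from sklearn.model_selection import KFold

    rng = np.random.default_rng(0)
    n, p = 200, 5
    X = rng.normal(size=(n, p))
    t = rng.binomial(1, 0.5, size=n)                  # binary treatment
    y = 2.0 * t + X[:, 0] + rng.normal(size=n)        # outcome

    train, test = next(KFold(n_splits=5, shuffle=True, random_state=0).split(X))

    # outcome nuisance E[Y | X]: fit on the training fold, evaluate on the held-out fold
    y_model = LassoCV(random_state=0).fit(X[train], y[train])
    y_residuals = y[test] - y_model.predict(X[test])

    # treatment nuisance E[T | X]: same pattern, with a classifier for a discrete treatment
    t_model = LogisticRegressionCV(random_state=0).fit(X[train], t[train])
    t_residuals = t[test] - t_model.predict(X[test])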
+ """ + progress_msg = "" + if verbose > 2: + if split_progress is not None: + progress_msg = f" {split_progress[0]+1}/{split_progress[1]}" + + if verbose > 1: + params_msg = "" + + if verbose > 9: + start_msg = f"[CV{progress_msg}] START {params_msg}" + print(f"{start_msg}{(80 - len(start_msg)) * '.'}") + + # fit the nuisance functions on the + X_train, X_test = X[train, :], X[test, :] + y_train, y_test = y[train, :], y[test, :] + t_train, t_test = t[train, :], t[test, :] + + y_pred = estimator_outcome.fit(X_train, y_train, sample_weight=sample_weight).predict(X_test) + t_pred = estimator_treatment.fit(X_train, t_train, sample_weight=sample_weight).predict(X_test) + + # compute the resulting residuals + y_residuals = y_test - y_pred + t_residuals = t_test - t_pred + + result = dict() + result['estimator_outcome'] = estimator_outcome + result['estimator_treatment'] = estimator_treatment + result['y_residuals'] = y_residuals + result['t_residuals'] = t_residuals + + return result + + +def check_outcome_estimator(estimator_outcome, y, random_state): + """Check outcome estimator and error-check. + + Parameters + ---------- + estimator_outcome : estimator object + The outcome estimator model. If None, then LassoCV will be used. + y : array-like of shape (n_samples, n_output) + The outcomes array. + random_state : int + Random seed. + + Returns + ------- + estimator_outcome : estimator object + The cloned outcome estimator model. + """ + if estimator_outcome is None: + return LassoCV(random_state=random_state) + else: + if not hasattr(estimator_outcome, 'fit') or not hasattr(estimator_outcome, 'predict'): + raise RuntimeError('All outcome estimator models must have the `fit` and `predict` functions implemented.') + + estimator_outcome = clone(estimator_outcome) + # XXX: run some checks on y being compatible with the estimator + if is_classifier(estimator_outcome) and not issubclass(y.type, numbers.Integral): + raise RuntimeError( + 'Treatment array is not integers, but the treatment estimator is classification based. ' + 'If treatment array is discrete, then treatment estimator should be a classifier. ' + 'If treatment array is continuous, thent treatment estimator should be a regressor.' + ) + return estimator_outcome + + +def check_treatment_estimator(estimator_treatment, t, random_state): + """Check treatment estimator and error-check. + + Parameters + ---------- + estimator_treatment : estimator object + The treatment estimator model. If None, then LogisticRegressionCV will be used. + t : array-like of shape (n_samples, n_treatment_dims) + The treatment array. + random_state : int + Random seed. + + Returns + ------- + estimator_treatment : estimator object + The cloned treatment estimator model. + """ + if estimator_treatment is None: + estimator_treatment = LogisticRegressionCV(random_state=random_state) + else: + if not hasattr(estimator_treatment, 'fit') or not hasattr(estimator_treatment, 'predict'): + raise RuntimeError('All treatment estimator models must have the `fit` and `predict` functions implemented.') + + estimator_treatment = clone(estimator_treatment) + + # XXX: run some checks on t being compatible with the estimator. i.e. discrete -> classifier, otw regressor + if is_classifier(estimator_treatment) and not issubclass(t.type, numbers.Integral): + raise RuntimeError( + 'Treatment array is not integers, but the treatment estimator is classification based. ' + 'If treatment array is discrete, then treatment estimator should be a classifier. 
' + 'If treatment array is continuous, thent treatment estimator should be a regressor.' + ) + + return estimator_treatment + + +class DML(MetaEstimatorMixin, BaseEstimator): + """A meta-estimator for performining double machine-learning. + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Estimator needs to provide a ``fit``, and ``predict_prob`` function. + estimator_outcome : estimator object, optional + The estimator model for the outcome, by default :class:`sklearn.linear_model.LassoCV`. + This is assumed to implement the scikit-learn estimator interface. Estimator needs + to provide a ``fit``, and ``predict_prob`` function. For more details, see Notes. + estimator_treatment : estimator object, optional + The estimator model for the treatment, by default :class:`sklearn.linear_model.LogisticRegressionCV`. + This is assumed to implement the scikit-learn estimator interface. Estimator needs to + provide a ``fit``, and ``predict_prob`` function. For more details, see Notes. + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the different training and test sets. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + estimator_outcome_ : list of estimator object + Fitted nuisance estimators of the outcome. + estimator_treatment_ : list of estimator object + Fitted nuisance estimators of the treatment. + estimator_final_ : estimator object + Fitted final-stage estimator. + + Notes + ----- + The CATE is modeled using the following equation: + + .. math :: + Y - \\E[Y | X, W] = \\Theta(X) \\cdot (T - \\E[T | X, W]) + \\epsilon + + where ``Y`` are the outcomes, ``X`` are the observed covariates that confound treatment, ``T`` + and outcomes. ``W`` are observed covariates that act as controls. :math:`\\epsilon`` is the noise + term of the data-generating model. The CATE is defined as :math:`Theta(X)`. + + Double machine learning (DML) follows a two stage process, where a set of nuisance functions + are estimated in the first stage in a crossfitting manner and a final stage estimates the CATE + model :footcite:`chernozhukov2018double`. 
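As a concrete illustration of the two stages, a sketch with simulated data and a constant treatment effect (out-of-fold predictions stand in for the full cross-fitting loop implemented below)::

    import numpy as np
    from sklearn.linear_model import LassoCV
    from sklearn.model_selection import cross_val_predict

    rng = np.random.default_rng(1)
    n, p = 2000, 5
    X = rng.normal(size=(n, p))
    T = X[:, 0] + rng.normal(size=n)                      # confounded, continuous treatment
    Y = 1.5 * T + 2.0 * X[:, 0] + rng.normal(size=n)      # true effect is 1.5

    # stage 1: out-of-fold estimates of q(X) = E[Y | X] and f(X) = E[T | X]
    y_hat = cross_val_predict(LassoCV(random_state=0), X, Y, cv=5)
    t_hat = cross_val_predict(LassoCV(random_state=0), X, T, cv=5)
    y_tilde, t_tilde = Y - y_hat, T - t_hat

    # stage 2: regress outcome residuals on treatment residuals; the slope estimates the effect
    theta_hat = np.dot(t_tilde, y_tilde) / np.dot(t_tilde, t_tilde)   # close to 1.5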
See the User-guide for a description of two-stage learners, specifically on orthogonal + learning and doubly-robust learning. The two-stage procedure of DML follows the two steps: + + **1. Estimate nuissance functions** + There are two nuissance functions that need to be estimated, :math:`q(X, W) = E[Y | X, W]` and + :math:`f(X, W) = E[T | X, W]`, which are the models for the outcome and treatment + respectively. Here, one needs to run a regression (or classification model if treatments/outcomes + are discrete), that predicts outcomes and treatments given the covariates and control features. + + **2. Final stage regression of residuals on residuals** + Once :math:`q(X, W)` and :math:`f(X, W)` are estimated in the first stage, we compute + the residuals of the models with their observed outcome and treatment samples. + + .. math :: + \\tilde{Y} = Y - q(X, W) + \\tilde{T} = T - f(X, W) + + and then we fit a final stage model that takes the covariates, :math:`X` and :math:`\\tilde{T}`. That is + it uses the residuals of the treatment model and the covariates to predict the residuals of the outcome. + The fitted model represents the CATE. + + .. math :: + \\hat{\\theta} = \\arg\\min_{\\Theta}\ + \\E_n\\left[ (\\tilde{Y} - \\Theta(X) \\cdot \\tilde{T})^2 \\right] + + In the DML process, ``T`` and ``Y`` can be either discrete, or continuous and the resulting process will + be valid. For each stage of the DML process, one can use arbitrary sklearn-like estimators. + + **Estimation procedure with cross-fitting and cross-validation** + + In order to prevent overfitting to training data, the nuisance functions are estimated using training data + and the nuisance values (i.e. the residuals) are computed using the testing data. This is done on K-folds of + data, so the final residuals are computed from multiple model instantiations. If there is no cross-fitting, + then the nuisance functions are estimated and evaluated on the entire dataset. If there is at least K-fold + cross-fitting, then the data is split into K distinct groups and for each group, the model is trained on + the rest of the data, and evaluated on the held-out test group. + + The final model then takes as input the residuals of outcome, residuals of treatment and covariates to + estimate the CATE. + + References + ---------- + .. footbibliography:: + """ + def __init__( + self, + estimator, + estimator_outcome=None, + estimator_treatment=None, + cv=None, + random_state=None, + n_jobs=None, + verbose=False, + ) -> None: + super().__init__() + self.estimator = estimator + self.estimator_outcome = estimator_outcome + self.estimator_treatment = estimator_treatment + self.cv = cv + self.n_jobs = n_jobs + self.random_state = random_state + self.verbose = verbose + + def fit(self, X, y, *, t=None, groups=None, **fit_params): + self._validate_params() + random_state = check_random_state(self.random_state) + + # run checks on input data + X, y = self._validate_data( + X, y, multi_output=True, accept_sparse=False + ) + self._validate_data( + y=t, multi_output=True, accept_sparse=False + ) + X, y, t, groups = indexable(X, y, t, groups) + fit_params = _check_fit_params(X, fit_params) + sample_weight = fit_params.get('sample_weight', None) + + # Determine output settings + _, self.n_features_in_ = X.shape + + # get cross-validation object + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) + n_splits = cv.get_n_splits(X, y, groups) + + # XXX: this should not be required. 
If the user wants, they can pass in + # a CV object + if cv != self.cv and isinstance(cv, (KFold, StratifiedKFold)): + cv.shuffle = True + cv.random_state = random_state + + final_stage_estimator = clone(self.estimator) + + # initialize the models for treatment and outcome + estimator_outcome = check_outcome_estimator(estimator_outcome=self.estimator_outcome, y=y, random_state=random_state) + estimator_treatment = check_treatment_estimator(estimator_treatment=self.estimator_treatment, t=t, random_state=random_state) + + # fit the nuissance functions in parallel + parallel = Parallel(n_jobs=self.n_jobs) + with parallel: + out = parallel( + delayed(_parallel_crossfit_nuisance)( + estimator_treatment, + estimator_outcome, + final_stage_estimator, + X=X, + y=y, + t=t, + sample_weight=sample_weight, + train=train, + test=test, + verbose=self.verbose, + split_progress=(split_idx, n_splits), + ) + for (split_idx, (train, test)) in enumerate(cv.split(X, y, groups)) + ) + + y_residuals = np.zeros(y.shape) + t_residuals = np.zeros(t.shape) + + for result in out: + test = result['test_idx'] + y_residuals[test, :] = result['y_residuals'] + t_residuals[test, :] = result['t_residuals'] + + # now fit the final stage estimator + final_stage_estimator.fit(X=X, y=y_residuals, t=t_residuals, **fit_params) + + def predict(self, X): + """Predict the conditional average treatment effect given covariates X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data. + """ + pass + + def conditional_effect(self, X, t=None): + """Compute conditional effect of + + Parameters + ---------- + X : _type_ + _description_ + t : _type_, optional + _description_, by default None + """ + pass + + \ No newline at end of file diff --git a/sktree/causal/tests/test_causal.py b/sktree/causal/tests/test_causal.py new file mode 100644 index 000000000..e69de29bb diff --git a/sktree/causal/tree.py b/sktree/causal/tree.py new file mode 100644 index 000000000..4e67d4047 --- /dev/null +++ b/sktree/causal/tree.py @@ -0,0 +1,320 @@ +from sklearn.base import BaseEstimator + +from sklearn.ensemble._forest import BaseForest, ForestRegressor +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor + + +class CausalTree(BaseEstimator): + def __init__(self, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + class_weight=None, + ccp_alpha=0.0, + honest_ratio=0.5, + ) -> None: + self.criterion = criterion + self.splitter = splitter + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.class_weight = class_weight + self.ccp_alpha = ccp_alpha + self.honest_ratio = honest_ratio + + def fit(self, X, y, sample_weight=None, T=None): + # initialize the tree estimator + self.estimator_ = DecisionTreeClassifier( + criterion=self.criterion, + splitter=self.splitter, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + random_state=self.random_state, + max_leaf_nodes=self.max_leaf_nodes, + 
min_impurity_decrease=self.min_impurity_decrease, + class_weight=self.class_weight, + ccp_alpha=self.ccp_alpha, + honest_ratio=1.0, + ) + + # fit the decision tree using only the treatment groups + self.estimator_.fit(X, T, sample_weight=sample_weight) + + # re-fit the tree leaves using the outcomes + refit_leaves(self.estimator_, X, y) + + def apply(self, X): + return self.estimator_.apply(X) + + def predict(self, X): + return self.estimator_.predict(X) + + def predict_proba(self, X): + return self.estimator_.predict_proba(X) + + def decision_path(self, X): + return self.estimator_.decision_path(X) + + @property + def feature_importances_(self): + """Return the feature importances. + + The importance of a feature is computed as the (normalized) total + reduction of the criterion brought by that feature. + It is also known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + Returns + ------- + feature_importances_ : ndarray of shape (n_features,) + Normalized total reduction of criteria by feature + (Gini importance). + """ + return self.estimator_.feature_importances_ + + + +class ForestCausalRegressor(BaseForest): + @abstractmethod + def __init__( + self, + estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + max_samples=None, + base_estimator="deprecated", + ): + super().__init__( + estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=max_samples, + base_estimator=base_estimator, + ) + + def predict(self, X): + """ + Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the trees in the forest. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The predicted values. + """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64) + else: + y_hat = np.zeros((X.shape[0]), dtype=np.float64) + + # Parallel loop + lock = threading.Lock() + Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")( + delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) + for e in self.estimators_ + ) + + y_hat /= len(self.estimators_) + + return y_hat + + @staticmethod + def _get_oob_predictions(tree, X): + """Compute the OOB predictions for an individual tree. + + Parameters + ---------- + tree : DecisionTreeRegressor object + A single decision tree regressor. + X : ndarray of shape (n_samples, n_features) + The OOB samples. + + Returns + ------- + y_pred : ndarray of shape (n_samples, 1, n_outputs) + The OOB associated predictions. 
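For intuition, a small NumPy sketch of how the forest-level prediction is formed and how a single tree's predictions are reshaped to ``(n_samples, 1, n_outputs)`` for OOB aggregation (illustrative only; the implementation accumulates into a shared buffer under a lock via joblib)::

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.tree import DecisionTreeRegressor

    X, y = make_regression(n_samples=100, n_features=4, random_state=0)
    trees = [DecisionTreeRegressor(max_depth=3, random_state=i).fit(X, y) for i in range(5)]

    # forest prediction: average of the per-tree predictions
    y_hat = np.mean([tree.predict(X) for tree in trees], axis=0)

    # per-tree reshape used for OOB aggregation
    pred = trees[0].predict(X)
    if pred.ndim == 1:
        pred = pred[:, np.newaxis, np.newaxis]   # single-output regression
    else:
        pred = pred[:, np.newaxis, :]            # multi-output regression
    # pred.shape == (100, 1, 1)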
+ """ + y_pred = tree.predict(X, check_input=False) + if y_pred.ndim == 1: + # single output regression + y_pred = y_pred[:, np.newaxis, np.newaxis] + else: + # multioutput regression + y_pred = y_pred[:, np.newaxis, :] + return y_pred + + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + """Compute and set the OOB score and attributes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + scoring_function : callable, default=None + Scoring function for OOB score. Defaults to `r2_score`. + """ + self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze(axis=1) + if self.oob_prediction_.shape[-1] == 1: + # drop the n_outputs axis if there is a single output + self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1) + + if scoring_function is None: + scoring_function = r2_score + + self.oob_score_ = scoring_function(y, self.oob_prediction_) + + def _compute_partial_dependence_recursion(self, grid, target_features): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray of shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_features : ndarray of shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + + Returns + ------- + averaged_predictions : ndarray of shape (n_samples,) + The value of the partial dependence function on each grid point. + """ + grid = np.asarray(grid, dtype=DTYPE, order="C") + averaged_predictions = np.zeros( + shape=grid.shape[0], dtype=np.float64, order="C" + ) + + for tree in self.estimators_: + # Note: we don't sum in parallel because the GIL isn't released in + # the fast method. 
+ tree.tree_.compute_partial_dependence( + grid, target_features, averaged_predictions + ) + # Average over the forest + averaged_predictions /= len(self.estimators_) + + return averaged_predictions + + def _more_tags(self): + return {"multilabel": True} + + +# XXX: figure out how to make this either: +# - DML tree +# - DR tree +# - OrthoDML/DR tree +class CausalForest(ForestRegressor): + _parameter_constraints: dict = { + **ForestRegressor._parameter_constraints, + **DecisionTreeRegressor._parameter_constraints, + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + honest_ratio=0.5, + ): + super().__init__( + estimator=CausalTree(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "honest_ratio", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.honest_ratio = honest_ratio diff --git a/sktree/meson.build b/sktree/meson.build index 974e52d1f..de0076761 100644 --- a/sktree/meson.build +++ b/sktree/meson.build @@ -39,16 +39,12 @@ incdir_numpy = run_command(py3, ).stdout().strip() # inc_np = include_directories(incdir_numpy) +# only use non-deprecated Numpy API +numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' +cython_c_args = '-NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' + cc = meson.get_compiler('c') -# Don't use the deprecated NumPy C API. Define this to a fixed version instead of -# NPY_API_VERSION in order not to break compilation for released versions -# when NumPy introduces a new deprecation. 
Use in a meson.build file:: -# -# py3.extension_module('_name', -# 'source_fname', -# numpy_nodepr_api) -numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_9_API_VERSION' python_sources = [ '__init__.py', @@ -77,6 +73,7 @@ c_undefined_ok = ['-Wno-maybe-uninitialized'] # cython_c_args += ['-Wno-cpp'] # cython_cpp_args = cython_c_args +subdir('causal') subdir('tree') subdir('ensemble') subdir('tests') \ No newline at end of file diff --git a/sktree/tree/meson.build b/sktree/tree/meson.build index 6d2e330e2..4f9228a1c 100644 --- a/sktree/tree/meson.build +++ b/sktree/tree/meson.build @@ -15,13 +15,11 @@ foreach ext: extensions cython_gen_cpp.process(ext + '.pyx'), c_args: cython_c_args, include_directories: [incdir_numpy], - # override_options : ['cython_language=cpp'], install: true, subdir: 'sktree/tree', ) endforeach -# TODO: comment in _classes.py when we have a working Cython unsupervised tree with a Python API python_sources = [ '__init__.py', '_classes.py', From 185560f50b0245f6b749db1bb225e4e89d9e2761 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 14 Mar 2023 17:01:18 -0400 Subject: [PATCH 2/4] Add additional docstring Signed-off-by: Adam Li --- sktree/causal/__init__.py | 1 + sktree/causal/base.py | 7 ++ sktree/causal/meta.py | 142 ++++++++++++++--------------- sktree/causal/tests/test_causal.py | 20 ++++ sktree/causal/tree.py | 131 ++++++++++++++++++++------ 5 files changed, 197 insertions(+), 104 deletions(-) create mode 100644 sktree/causal/base.py diff --git a/sktree/causal/__init__.py b/sktree/causal/__init__.py index e69de29bb..a9624b99d 100644 --- a/sktree/causal/__init__.py +++ b/sktree/causal/__init__.py @@ -0,0 +1 @@ +from .tree import CausalForest, CausalTree diff --git a/sktree/causal/base.py b/sktree/causal/base.py new file mode 100644 index 000000000..821f6d6e0 --- /dev/null +++ b/sktree/causal/base.py @@ -0,0 +1,7 @@ +# XXX: trying to sketch out a base causal API that extends "BaseEstimator" of sklearn +class BaseCausal: + def conditional_effect(self, X=None, *, T0, T1): + pass + + def marginal_effect(self, T, X=None): + pass diff --git a/sktree/causal/meta.py b/sktree/causal/meta.py index af599e445..300dc5d98 100644 --- a/sktree/causal/meta.py +++ b/sktree/causal/meta.py @@ -1,36 +1,27 @@ -from itertools import product import numbers -import numpy as np -from sklearn.base import MetaEstimatorMixin, BaseEstimator, is_classifier, clone -from sklearn.utils import check_random_state -from sklearn.model_selection import GridSearchCV -from sklearn.model_selection import check_cv, KFold, StratifiedKFold -from sklearn.utils.validation import indexable, check_is_fitted, _check_fit_params - -from sklearn.linear_model import LassoCV, LogisticRegressionCV - +from itertools import product +import numpy as np from joblib import Parallel, delayed - -class BaseCausal: - def conditional_effect(self, X=None, *, T0, T1): - pass - - def marginal_effect(self, T, X=None): - pass - - -def _parallel_crossfit_nuisance(estimator_treatment, - estimator_outcome, - X, - y, - t, - sample_weight, - train, - test, - verbose, - split_progress=None, #(split_idx, n_splits), - ): +from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier +from sklearn.linear_model import LassoCV, LogisticRegressionCV +from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, check_cv +from sklearn.utils import check_random_state +from sklearn.utils.validation import _check_fit_params, check_is_fitted, indexable + + +def _parallel_crossfit_nuisance( + estimator_treatment, + 
estimator_outcome, + X, + y, + t, + sample_weight, + train, + test, + verbose, + split_progress=None, # (split_idx, n_splits), +): """Crossfitting nuisance functions in parallel. Parameters @@ -92,7 +83,7 @@ def _parallel_crossfit_nuisance(estimator_treatment, start_msg = f"[CV{progress_msg}] START {params_msg}" print(f"{start_msg}{(80 - len(start_msg)) * '.'}") - # fit the nuisance functions on the + # fit the nuisance functions on the X_train, X_test = X[train, :], X[test, :] y_train, y_test = y[train, :], y[test, :] t_train, t_test = t[train, :], t[test, :] @@ -105,10 +96,10 @@ def _parallel_crossfit_nuisance(estimator_treatment, t_residuals = t_test - t_pred result = dict() - result['estimator_outcome'] = estimator_outcome - result['estimator_treatment'] = estimator_treatment - result['y_residuals'] = y_residuals - result['t_residuals'] = t_residuals + result["estimator_outcome"] = estimator_outcome + result["estimator_treatment"] = estimator_treatment + result["y_residuals"] = y_residuals + result["t_residuals"] = t_residuals return result @@ -133,19 +124,21 @@ def check_outcome_estimator(estimator_outcome, y, random_state): if estimator_outcome is None: return LassoCV(random_state=random_state) else: - if not hasattr(estimator_outcome, 'fit') or not hasattr(estimator_outcome, 'predict'): - raise RuntimeError('All outcome estimator models must have the `fit` and `predict` functions implemented.') + if not hasattr(estimator_outcome, "fit") or not hasattr(estimator_outcome, "predict"): + raise RuntimeError( + "All outcome estimator models must have the `fit` and `predict` functions implemented." + ) estimator_outcome = clone(estimator_outcome) # XXX: run some checks on y being compatible with the estimator if is_classifier(estimator_outcome) and not issubclass(y.type, numbers.Integral): raise RuntimeError( - 'Treatment array is not integers, but the treatment estimator is classification based. ' - 'If treatment array is discrete, then treatment estimator should be a classifier. ' - 'If treatment array is continuous, thent treatment estimator should be a regressor.' + "Treatment array is not integers, but the treatment estimator is classification based. " + "If treatment array is discrete, then treatment estimator should be a classifier. " + "If treatment array is continuous, thent treatment estimator should be a regressor." ) return estimator_outcome - + def check_treatment_estimator(estimator_treatment, t, random_state): """Check treatment estimator and error-check. @@ -167,17 +160,19 @@ def check_treatment_estimator(estimator_treatment, t, random_state): if estimator_treatment is None: estimator_treatment = LogisticRegressionCV(random_state=random_state) else: - if not hasattr(estimator_treatment, 'fit') or not hasattr(estimator_treatment, 'predict'): - raise RuntimeError('All treatment estimator models must have the `fit` and `predict` functions implemented.') - + if not hasattr(estimator_treatment, "fit") or not hasattr(estimator_treatment, "predict"): + raise RuntimeError( + "All treatment estimator models must have the `fit` and `predict` functions implemented." + ) + estimator_treatment = clone(estimator_treatment) # XXX: run some checks on t being compatible with the estimator. i.e. discrete -> classifier, otw regressor if is_classifier(estimator_treatment) and not issubclass(t.type, numbers.Integral): raise RuntimeError( - 'Treatment array is not integers, but the treatment estimator is classification based. 
' - 'If treatment array is discrete, then treatment estimator should be a classifier. ' - 'If treatment array is continuous, thent treatment estimator should be a regressor.' + "Treatment array is not integers, but the treatment estimator is classification based. " + "If treatment array is discrete, then treatment estimator should be a classifier. " + "If treatment array is continuous, thent treatment estimator should be a regressor." ) return estimator_treatment @@ -255,8 +250,8 @@ class DML(MetaEstimatorMixin, BaseEstimator): model :footcite:`chernozhukov2018double`. See the User-guide for a description of two-stage learners, specifically on orthogonal learning and doubly-robust learning. The two-stage procedure of DML follows the two steps: - **1. Estimate nuissance functions** - There are two nuissance functions that need to be estimated, :math:`q(X, W) = E[Y | X, W]` and + **1. Estimate nuisance functions** + There are two nuisance functions that need to be estimated, :math:`q(X, W) = E[Y | X, W]` and :math:`f(X, W) = E[T | X, W]`, which are the models for the outcome and treatment respectively. Here, one needs to run a regression (or classification model if treatments/outcomes are discrete), that predicts outcomes and treatments given the covariates and control features. @@ -296,16 +291,17 @@ class DML(MetaEstimatorMixin, BaseEstimator): ---------- .. footbibliography:: """ + def __init__( - self, - estimator, - estimator_outcome=None, - estimator_treatment=None, - cv=None, - random_state=None, - n_jobs=None, - verbose=False, - ) -> None: + self, + estimator, + estimator_outcome=None, + estimator_treatment=None, + cv=None, + random_state=None, + n_jobs=None, + verbose=False, + ) -> None: super().__init__() self.estimator = estimator self.estimator_outcome = estimator_outcome @@ -320,15 +316,11 @@ def fit(self, X, y, *, t=None, groups=None, **fit_params): random_state = check_random_state(self.random_state) # run checks on input data - X, y = self._validate_data( - X, y, multi_output=True, accept_sparse=False - ) - self._validate_data( - y=t, multi_output=True, accept_sparse=False - ) + X, y = self._validate_data(X, y, multi_output=True, accept_sparse=False) + self._validate_data(y=t, multi_output=True, accept_sparse=False) X, y, t, groups = indexable(X, y, t, groups) fit_params = _check_fit_params(X, fit_params) - sample_weight = fit_params.get('sample_weight', None) + sample_weight = fit_params.get("sample_weight", None) # Determine output settings _, self.n_features_in_ = X.shape @@ -346,10 +338,14 @@ def fit(self, X, y, *, t=None, groups=None, **fit_params): final_stage_estimator = clone(self.estimator) # initialize the models for treatment and outcome - estimator_outcome = check_outcome_estimator(estimator_outcome=self.estimator_outcome, y=y, random_state=random_state) - estimator_treatment = check_treatment_estimator(estimator_treatment=self.estimator_treatment, t=t, random_state=random_state) + estimator_outcome = check_outcome_estimator( + estimator_outcome=self.estimator_outcome, y=y, random_state=random_state + ) + estimator_treatment = check_treatment_estimator( + estimator_treatment=self.estimator_treatment, t=t, random_state=random_state + ) - # fit the nuissance functions in parallel + # fit the nuisance functions in parallel parallel = Parallel(n_jobs=self.n_jobs) with parallel: out = parallel( @@ -373,9 +369,9 @@ def fit(self, X, y, *, t=None, groups=None, **fit_params): t_residuals = np.zeros(t.shape) for result in out: - test = result['test_idx'] - y_residuals[test, 
:] = result['y_residuals'] - t_residuals[test, :] = result['t_residuals'] + test = result["test_idx"] + y_residuals[test, :] = result["y_residuals"] + t_residuals[test, :] = result["t_residuals"] # now fit the final stage estimator final_stage_estimator.fit(X=X, y=y_residuals, t=t_residuals, **fit_params) @@ -391,7 +387,7 @@ def predict(self, X): pass def conditional_effect(self, X, t=None): - """Compute conditional effect of + """Compute conditional effect of Parameters ---------- @@ -401,5 +397,3 @@ def conditional_effect(self, X, t=None): _description_, by default None """ pass - - \ No newline at end of file diff --git a/sktree/causal/tests/test_causal.py b/sktree/causal/tests/test_causal.py index e69de29bb..06a83b2f6 100644 --- a/sktree/causal/tests/test_causal.py +++ b/sktree/causal/tests/test_causal.py @@ -0,0 +1,20 @@ +import joblib +import numpy as np +import pytest +from numpy.testing import assert_almost_equal, assert_array_almost_equal, assert_array_equal +from sklearn import datasets +from sklearn.utils.estimator_checks import parametrize_with_checks + +from sktree.causal import CausalTree + + +@parametrize_with_checks( + [ + CausalTree(random_state=12), + ] +) +def test_sklearn_compatible_estimator(estimator, check): + # TODO: remove when we implement Regressor classes + if check.func.__name__ in ["check_requires_y_none"]: + pytest.skip() + check(estimator) diff --git a/sktree/causal/tree.py b/sktree/causal/tree.py index 4e67d4047..f629523e9 100644 --- a/sktree/causal/tree.py +++ b/sktree/causal/tree.py @@ -1,11 +1,12 @@ from sklearn.base import BaseEstimator - from sklearn.ensemble._forest import BaseForest, ForestRegressor -from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.tree import BaseDecisionTree, DecisionTreeClassifier, DecisionTreeRegressor class CausalTree(BaseEstimator): - def __init__(self, + def __init__( + self, + tree=None, criterion="gini", splitter="best", max_depth=None, @@ -19,7 +20,58 @@ def __init__(self, class_weight=None, ccp_alpha=0.0, honest_ratio=0.5, + **tree_params, ) -> None: + """A causal tree. + + A causal tree is a decision tree model that operates over not only + ``(X, y)`` tuple of arrays, but also includes the treatment groups. + It is an estimator model that has a graphical model of the form + :math:`T \\rightarrow Y` with :math:`T \\leftarrow X \\rightarrow Y`. + + Parameters + ---------- + tree : _type_, optional + _description_, by default None + criterion : str, optional + _description_, by default "gini" + splitter : str, optional + _description_, by default "best" + max_depth : _type_, optional + _description_, by default None + min_samples_split : int, optional + _description_, by default 2 + min_samples_leaf : int, optional + _description_, by default 1 + min_weight_fraction_leaf : float, optional + _description_, by default 0.0 + max_features : _type_, optional + _description_, by default None + random_state : _type_, optional + _description_, by default None + max_leaf_nodes : _type_, optional + _description_, by default None + min_impurity_decrease : float, optional + _description_, by default 0.0 + class_weight : _type_, optional + _description_, by default None + ccp_alpha : float, optional + _description_, by default 0.0 + honest_ratio : float, optional + _description_, by default 0.5 + + Notes + ----- + Compared to a normal decision tree model, which estimates the value :math:`E[Y | X]`, + a causal decision tree splits the data in such a way to estimate the value + :math:`E[Y | do(T), X]`. 
This is a causal effect, and thus requires causal assumptions. + The core assumption is that of unconfoundedness, where the outcomes, ``Y``, and the + treatments, ``T`` are not confounded by any other non-observed variables. That is ``X`` + captures all possible confounders. This is typically a strong assumption, but is realized + in well-designed observational studies and randomized control trials, where the treatment + assignment is randomized. + """ + self.tree = tree self.criterion = criterion self.splitter = splitter self.max_depth = max_depth @@ -33,27 +85,52 @@ def __init__(self, self.class_weight = class_weight self.ccp_alpha = ccp_alpha self.honest_ratio = honest_ratio + self.tree_params = tree_params def fit(self, X, y, sample_weight=None, T=None): # initialize the tree estimator - self.estimator_ = DecisionTreeClassifier( - criterion=self.criterion, - splitter=self.splitter, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - min_weight_fraction_leaf=self.min_weight_fraction_leaf, - max_features=self.max_features, - random_state=self.random_state, - max_leaf_nodes=self.max_leaf_nodes, - min_impurity_decrease=self.min_impurity_decrease, - class_weight=self.class_weight, - ccp_alpha=self.ccp_alpha, - honest_ratio=1.0, - ) - + if self.tree is None: + self.estimator_ = DecisionTreeClassifier( + criterion=self.criterion, + splitter=self.splitter, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + random_state=self.random_state, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + class_weight=self.class_weight, + ccp_alpha=self.ccp_alpha, + honest_ratio=1.0, + **self.tree_params, + ) + else: + if not issubclass(self.tree, BaseDecisionTree): + raise RuntimeError( + "The type of tree must be a subclass of sklearn.tree.BaseDecisionTree." + ) + + self.estimator_ = self.tree( + criterion=self.criterion, + splitter=self.splitter, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + random_state=self.random_state, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + class_weight=self.class_weight, + ccp_alpha=self.ccp_alpha, + honest_ratio=1.0, + **self.tree_params, + ) + # fit the decision tree using only the treatment groups - self.estimator_.fit(X, T, sample_weight=sample_weight) + self.estimator_.fit(X, y=T, sample_weight=sample_weight) # re-fit the tree leaves using the outcomes refit_leaves(self.estimator_, X, y) @@ -66,7 +143,7 @@ def predict(self, X): def predict_proba(self, X): return self.estimator_.predict_proba(X) - + def decision_path(self, X): return self.estimator_.decision_path(X) @@ -89,7 +166,6 @@ def feature_importances_(self): (Gini importance). 
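Note that ``refit_leaves`` is called in ``fit`` above but is not defined in this patch. One plausible reading, stated here only as an assumption rather than the implementation used, is that once the tree structure has been learned on ``(X, T)``, each leaf value is replaced by the difference in mean outcomes between treated and control samples that fall into it::

    import numpy as np

    def refit_leaves_sketch(fitted_tree, X, y, T):
        """Hypothetical helper: per-leaf difference of treated vs. control outcome means."""
        leaves = fitted_tree.apply(X)                # leaf id for every sample
        effects = {}
        for leaf in np.unique(leaves):
            in_leaf = leaves == leaf
            treated, control = y[in_leaf & (T == 1)], y[in_leaf & (T == 0)]
            # leave the estimate undefined if a leaf lacks one of the two groups
            if len(treated) and len(control):
                effects[leaf] = treated.mean() - control.mean()
            else:
                effects[leaf] = np.nan
        return effects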
""" return self.estimator_.feature_importances_ - class ForestCausalRegressor(BaseForest): @@ -158,8 +234,7 @@ def predict(self, X): # Parallel loop lock = threading.Lock() Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")( - delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) - for e in self.estimators_ + delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) for e in self.estimators_ ) y_hat /= len(self.estimators_) @@ -231,16 +306,12 @@ def _compute_partial_dependence_recursion(self, grid, target_features): The value of the partial dependence function on each grid point. """ grid = np.asarray(grid, dtype=DTYPE, order="C") - averaged_predictions = np.zeros( - shape=grid.shape[0], dtype=np.float64, order="C" - ) + averaged_predictions = np.zeros(shape=grid.shape[0], dtype=np.float64, order="C") for tree in self.estimators_: # Note: we don't sum in parallel because the GIL isn't released in # the fast method. - tree.tree_.compute_partial_dependence( - grid, target_features, averaged_predictions - ) + tree.tree_.compute_partial_dependence(grid, target_features, averaged_predictions) # Average over the forest averaged_predictions /= len(self.estimators_) From 165eba3938f9e4b3fc7df6ac5a16935f7dc05e3f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 22 Mar 2023 22:55:11 -0400 Subject: [PATCH 3/4] Cleaned up grf code Signed-off-by: Adam Li --- docs/references.bib | 9 +- sktree/causal/_grf_criterion.pxd | 157 +++++++ sktree/causal/_grf_criterion.pyx | 750 ++++++++++++++++++++++++++++++ sktree/causal/_splitter.pxd | 0 sktree/causal/_splitter.pyx | 0 sktree/causal/_utils.pxd | 31 ++ sktree/causal/_utils.pyx | 278 +++++++++++ sktree/causal/meson.build | 16 + sktree/causal/tests/test_utils.py | 46 ++ 9 files changed, 1286 insertions(+), 1 deletion(-) create mode 100644 sktree/causal/_grf_criterion.pxd create mode 100644 sktree/causal/_grf_criterion.pyx create mode 100644 sktree/causal/_splitter.pxd create mode 100644 sktree/causal/_splitter.pyx create mode 100644 sktree/causal/_utils.pxd create mode 100644 sktree/causal/_utils.pyx create mode 100644 sktree/causal/tests/test_utils.py diff --git a/docs/references.bib b/docs/references.bib index 94825f1ec..989138c7c 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -21,9 +21,16 @@ @article{Meghana2019_geodesicrf % Causal +@article{Athey2016GeneralizedRF, + title = {Generalized random forests}, + author = {Susan Athey and Julie Tibshirani and Stefan Wager}, + journal = {The Annals of Statistics}, + year = {2016} +} + @article{chernozhukov2018double, title = {Double/debiased machine learning for treatment and structural parameters}, author = {Chernozhukov, Victor and Chetverikov, Denis and Demirer, Mert and Duflo, Esther and Hansen, Christian and Newey, Whitney and Robins, James}, year = {2018}, publisher = {Oxford University Press Oxford, UK} -} \ No newline at end of file +} diff --git a/sktree/causal/_grf_criterion.pxd b/sktree/causal/_grf_criterion.pxd new file mode 100644 index 000000000..19f43ab4c --- /dev/null +++ b/sktree/causal/_grf_criterion.pxd @@ -0,0 +1,157 @@ +# Licensed under the MIT License. 
+# Original Authors: +# - EconML +# - Vasilis Syrgkanis +# +# Modified by: Adam Li + +import numpy as np +cimport numpy as np + +from sklearn.tree._tree cimport DTYPE_t # Type of X +from sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight +from sklearn.tree._tree cimport SIZE_t # Type for indices and counters + +from sklearn.tree._criterion cimport Criterion, RegressionCriterion + + +cdef class CausalCriterion(RegressionCriterion): + + +cdef class MomentCriterion(RegressionCriterion): +# The A random vector of the linear moment equation for each sample of size (n_samples, n_outputs) + # these are the "weights" that are pre-computed. + cdef const DOUBLE_t[:, ::1] alpha + + # the approximate Jacobian evaluated at every single sample point + # random vector of the linear moment equation + # size (n_samples, n_outputs, n_outputs) + cdef const DOUBLE_t[:, :, ::1] pointJ + + cdef DOUBLE_t[:, ::1] rho # Proxy heterogeneity label: rho = E[J | X in Node]^{-1} m(J, A; theta(Node)) of shape (`n_samples`, `n_outputs`) + cdef DOUBLE_t[:, ::1] moment # Moment for each sample: m(J, A; theta(Node)) of shape (`n_samples`, `n_outputs`) + cdef DOUBLE_t[:, ::1] J # Node average jacobian: J(Node) = E[J | X in Node] of shape (n_outputs, n_outputs) + cdef DOUBLE_t[:, ::1] invJ # Inverse of node average jacobian: J(Node)^{-1} of shape (n_outputs, n_outputs) + cdef DOUBLE_t[:] parameter # Estimated node parameter: theta(Node) = E[J|X in Node]^{-1} E[A|X in Node] + cdef DOUBLE_t[:] parameter_pre # Preconditioned node parameter: theta_pre(Node) = E[A | X in Node] + + cdef SIZE_t n_relevant_outputs + + cdef int compute_sample_preparameter( + self, + DOUBLE_t[:] parameter_pre, + const DOUBLE_t[:, ::1] alpha, + DOUBLE_t weight, + SIZE_t sample_index, + ) nogil except -1 + cdef int compute_sample_parameter( + self, + DOUBLE_t[:] parameter, + DOUBLE_t[:] parameter_pre, + DOUBLE_t[:, ::1] invJ + ) nogil except -1 + cdef int compute_sample_moment( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] alpha, + DOUBLE_t[:] parameter, + const DOUBLE_t[:, :, ::1] pointJ, + SIZE_t sample_index + ) except -1 nogil + cdef int compute_sample_rho( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] invJ, + SIZE_t sample_index + ) except -1 nogil + cdef int compute_sample_jacobian( + self, + DOUBLE_t[:, ::1] J, + const DOUBLE_t[:, :, ::1] pointJ, + DOUBLE_t weight, + SIZE_t sample_index, + ) except -1 nogil + cdef int compute_node_inv_jacobian( + self, + DOUBLE_t[:, ::1] J, + DOUBLE_t[:, ::1] invJ, + ) except -1 nogil + +cdef class LinearMomentCriterion(RegressionCriterion): + """ A criterion class that estimates local parameters defined via linear moment equations + of the form: + + E[ m(J, A; theta(x)) | X=x] = E[ J * theta(x) - A | X=x] = 0 + + Calculates impurity based on heterogeneity induced on the estimated parameters, based on the proxy score + defined in the Generalized Random Forest paper: + Athey, Susan, Julie Tibshirani, and Stefan Wager. "Generalized random forests." + The Annals of Statistics 47.2 (2019): 1148-1178 + https://arxiv.org/pdf/1610.01271.pdf + + **How do we utilize the abstract Criterion data structure?** + - `sum_left`, `sum_right` represent for a sample i and each output k, the rho[i, k] terms per output weighted + by sample_weight[i, k]. + - `sum_total` keeps track of the previous sums. + - `sq_sum_total` keeps track of the square sum total ``sample_weight[i, k] * rho[i, k] * rho[i, k]``. 
+ + - `n_revelant_outputs` should now store the relevant outputs, which should correspond to the dimensionality + of the treatment vector. + + """ + # The A random vector of the linear moment equation for each sample of size (n_samples, n_outputs) + # these are the "weights" that are pre-computed. + cdef const DOUBLE_t[:, ::1] alpha + + # the approximate Jacobian evaluated at every single sample point + # random vector of the linear moment equation + # size (n_samples, n_outputs, n_outputs) + cdef const DOUBLE_t[:, :, ::1] pointJ + + cdef DOUBLE_t[:, ::1] rho # Proxy heterogeneity label: rho = E[J | X in Node]^{-1} m(J, A; theta(Node)) of shape (`n_samples`, `n_outputs`) + cdef DOUBLE_t[:, ::1] moment # Moment for each sample: m(J, A; theta(Node)) of shape (`n_samples`, `n_outputs`) + cdef DOUBLE_t[:, ::1] J # Node average jacobian: J(Node) = E[J | X in Node] of shape (n_outputs, n_outputs) + cdef DOUBLE_t[:, ::1] invJ # Inverse of node average jacobian: J(Node)^{-1} of shape (n_outputs, n_outputs) + + cdef int compute_sample_preparameter( + self, + DOUBLE_t[:] parameter, + DOUBLE_t[:] parameter_pre, + DOUBLE_t[:, ::1] invJ, + SIZE_t[:] samples_index, + DOUBLE_t[:] sample_weight, + ) nogil except -1 + cdef int compute_sample_parameter( + self, + DOUBLE_t[:] parameter, + DOUBLE_t[:] parameter_pre, + DOUBLE_t[:, ::1] invJ + ) nogil except -1 + cdef int compute_sample_moment( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] parameter, + const DOUBLE_t[:, :, ::1] pointJ, + SIZE_t sample_idx, + ) except -1 nogil + + cdef int node_reset_jacobian(self, DOUBLE_t* J, DOUBLE_t* invJ, double* weighted_n_node_samples, + const DOUBLE_t[:, ::1] pointJ, + DOUBLE_t* sample_weight, + SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1 + cdef int node_reset_parameter(self, DOUBLE_t* parameter, DOUBLE_t* parameter_pre, + DOUBLE_t* invJ, + const DOUBLE_t[:, ::1] alpha, + DOUBLE_t* sample_weight, double weighted_n_node_samples, + SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1 + cdef int node_reset_rho(self, DOUBLE_t* rho, DOUBLE_t* moment, SIZE_t* node_index_mapping, + DOUBLE_t* parameter, DOUBLE_t* invJ, double weighted_n_node_samples, + const DOUBLE_t[:, ::1] pointJ, const DOUBLE_t[:, ::1] alpha, + DOUBLE_t* sample_weight, SIZE_t* samples, + SIZE_t start, SIZE_t end) nogil except -1 + cdef int node_reset_sums(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* rho, + DOUBLE_t* J, + DOUBLE_t* sample_weight, SIZE_t* samples, + DOUBLE_t* sum_total, DOUBLE_t* var_total, + DOUBLE_t* sq_sum_total, DOUBLE_t* y_sq_sum_total, + SIZE_t start, SIZE_t end) nogil except -1 diff --git a/sktree/causal/_grf_criterion.pyx b/sktree/causal/_grf_criterion.pyx new file mode 100644 index 000000000..35f6c344c --- /dev/null +++ b/sktree/causal/_grf_criterion.pyx @@ -0,0 +1,750 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + +# +# This code contains a refactoring of the EconML GRF code, +# published under the following license and copyright: +# BSD 3-Clause License +# +# All rights reserved. 
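To make the quantities above concrete, a NumPy sketch of the node-level statistics the criterion maintains, with :math:`J_i` and :math:`A_i` chosen as in the residualized DML moment (an illustrative choice; unit sample weights are assumed)::

    import numpy as np

    rng = np.random.default_rng(0)
    n, d = 200, 2                                    # samples in the node, n_outputs
    T = rng.normal(size=(n, d))                      # (residualized) treatments
    y = T @ np.array([1.0, -0.5]) + rng.normal(scale=0.1, size=n)

    # linear moment E[J theta - A | X] = 0 with J_i = T_i T_i^T and A_i = T_i y_i
    J = np.einsum("ni,nj->nij", T, T)
    A = T * y[:, None]

    J_node = J.sum(axis=0)                           # un-normalized node Jacobian J(node)
    theta_pre = A.sum(axis=0)                        # pre-conditioned parameter theta_pre(node)
    theta = np.linalg.solve(J_node, theta_pre)       # theta(node), here close to [1.0, -0.5]

    moment = J @ theta - A                           # m(J_i, A_i; theta(node)), shape (n, d)
    rho = -moment @ np.linalg.inv(J_node / n).T      # proxy labels rho_i fed to the MSE-style criterion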
+ +from libc.stdlib cimport calloc +from libc.stdlib cimport free +from libc.string cimport memcpy +from libc.string cimport memset +from libc.math cimport fabs, sqrt + +import numpy as np +cimport numpy as np +np.import_array() + +from ._utils cimport matinv_, pinv_ + +cdef double INFINITY = np.inf + + +cdef class MomentCriterion(RegressionCriterion): + cdef int compute_sample_preparameter( + self, + DOUBLE_t[:] parameter_pre, + const DOUBLE_t[:, ::1] alpha, + DOUBLE_t weight, + SIZE_t sample_index, + ) nogil except -1: + """Calculate the un-normalized pre-conditioned parameter. + + theta_pre(node) := E[A[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] A[i] + theta(node) := J(node)^{-1} theta_pre(node) + + Parameters + ---------- + parameter_pre : DOUBLE_t memoryview of size (n_outputs,) + To be computed node un-normalized pre-conditioned parameter theta_pre(node). + alpha : DOUBLE_t 2D memoryview of size (n_samples, n_outputs) + The memory view that contains the A[i] for each sample i + weight : DOUBLE_t + The weight of the sample. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. + """ + cdef SIZE_t i, j, p + cdef DOUBLE_t w = 1.0 + + # compute parameter_pre per output dimension, j as the + # \sum_{i} w[i] * alpha[i, j] + for j in range(self.n_outputs): + parameter_pre[j] += weight * alpha[sample_index, j] + return 0 + + cdef int compute_sample_parameter( + self, + DOUBLE_t[:] parameter, + DOUBLE_t[:] parameter_pre, + DOUBLE_t[:, ::1] invJ + ) nogil except -1: + """Calculate the parameter. + + theta_pre(node) := E[A[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] A[i] + theta(node) := J(node)^{-1} theta_pre(node) + + Parameters + ---------- + parameter : DOUBLE_t memoryview of size (n_outputs,) + To be computed node parameter theta(node). + parameter_pre : DOUBLE_t memoryview of size (n_outputs,) + Already computed node un-normalized pre-conditioned parameter theta_pre(node) + invJ : DOUBLE_t 2D memoryview of size (n_outputs, n_outputs) + Unnormalized node jacobian inverse J(node)^{-1} in C-contiguous format. + """ + cdef SIZE_t i, j + + for j in range(self.n_outputs): + for i in range(self.n_outputs): + parameter[i] += invJ[i, j] * parameter_pre[j] + return 0 + + cdef int compute_sample_moment( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] alpha, + DOUBLE_t[:] parameter, + const DOUBLE_t[:, :, ::1] pointJ, + SIZE_t sample_index + ) except -1 nogil: + """Calculate the linear moment and rho for each sample i in the node. + + moment[i] := J[i] * theta(Node) - A[i] + rho[i] := - (J(Node) / weight(node))^{-1} * moment[i] + + Parameters + ---------- + moment : DOUBLE_t 2D memoryview of shape (`n_samples`, `n_outputs`) + Array of moments per sample in node. + alpha : DOUBLE_t[:, ::1] of size (n_samples, n_outputs) + The memory view that contains the A[i] for each sample i + parameter : DOUBLE_t memoryview of shape (`n_outputs`) + Array of computed parameters theta(x) per sample in node. + pointJ : DOUBLE_t[:, :, ::1] 3D memory of size (n_samples, n_outputs, n_outputs) + The memory view that contains the J[i] for each sample i, where J[i] is the + Jacobian array. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. 
+ """ + cdef SIZE_t j, k + + # compute the moment for each output + for j in range(self.n_outputs): + moment[sample_index, j] = - alpha[sample_index, j] + for k in range(self.n_outputs): + moment[sample_index, j] += pointJ[sample_index, j, k] * parameter[k] + return 0 + + cdef int compute_sample_rho( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] invJ, + SIZE_t sample_index + ) except -1 nogil: + """Calculate the rho for each sample i in the node. + + moment[i] := J[i] * theta(Node) - A[i] + rho[i] := - (J(Node) / weight(node))^{-1} * moment[i] + + Parameters + ---------- + moment : DOUBLE_t 2D memoryview of shape (`n_samples`, `n_outputs`) + Array of moments per sample in node. + invJ : DOUBLE_t 2D memoryview of size (n_outputs, n_outputs) + Unnormalized node jacobian inverse J(node)^{-1} in C-contiguous format. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. + """ + cdef SIZE_t j, k + + # compute the moment for each output + for j in range(self.n_outputs): + rho[sample_index, j] = 0.0 + for k in range(self.n_outputs): + rho[sample_index, j] -= ( + invJ[j, k] * \ + moment[sample_index, k] * \ + self.weighted_n_node_samples + ) + return 0 + + cdef int compute_sample_jacobian( + self, + DOUBLE_t[:, ::1] J, + const DOUBLE_t[:, :, ::1] pointJ, + DOUBLE_t weight, + SIZE_t sample_index, + ) except -1 nogil: + """Calculate the un-normalized jacobian for a sample:: + + J(node) := E[J[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] J[i] + + and its inverse J(node)^{-1} (or revert to pseudo-inverse if the matrix is not invertible). For + dimensions n_outputs={1, 2}, we also add a small constant of 1e-6 on the diagonals of J, to ensure + invertibility. For larger dimensions we revert to pseudo-inverse if the matrix is not invertible. + + Parameters + ---------- + J : DOUBLE_t 2D memoryview (n_outputs, n_outputs) + Un-normalized jacobian J(node) in C-contiguous format + pointJ : DOUBLE_t[:, :, ::1] 3D memory of size (n_samples, n_outputs, n_outputs) + The memory view that contains the J[i] for each sample i, where J[i] is the + Jacobian array. + weight : DOUBLE_t + The weight of the sample. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. + """ + cdef SIZE_t j, k + + # Calculate un-normalized empirical jacobian + for k in range(self.n_outputs): + for j in range(self.n_outputs): + J[j, k] += w * pointJ[sample_index, j, k] + + return 0 + + cdef int compute_node_inv_jacobian( + self, + DOUBLE_t[:, ::1] J, + DOUBLE_t[:, ::1] invJ, + ) except -1 nogil: + """Calculate the node inverse un-normalized jacobian:: + + Its inverse J(node)^{-1} (or revert to pseudo-inverse if the matrix is not invertible). For + dimensions n_outputs={1, 2}, we also add a small constant of 1e-6 on the diagonals of J, to ensure + invertibility. For larger dimensions we revert to pseudo-inverse if the matrix is not invertible. + + Parameters + ---------- + J : DOUBLE_t 2D memoryview (n_outputs, n_outputs) + Un-normalized jacobian J(node) in C-contiguous format + invJ : DOUBLE_t 2D memoryview of size (n_outputs, n_outputs) + Unnormalized node jacobian inverse J(node)^{-1} in C-contiguous format. 
+ """ + cdef SIZE_t k + + # Calculae inverse and store it in invJ + if self.n_outputs <=2: + # Fast closed form inverse calculation + _fast_invJ(J, invJ, self.n_outputs, clip=1e-6) + else: + for k in range(self.n_outputs): + J[k, k] += 1e-6 # add small diagonal constant for invertibility + + # Slower matrix inverse via lapack package + if not matinv_(J, invJ, self.n_outputs): # if matrix is invertible use the inverse + pinv_(J, invJ, self.n_outputs, self.n_outputs) # else calculate pseudo-inverse + + for k in range(self.n_outputs): + J[k, k] -= 1e-6 # remove the invertibility constant + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion and update statistics. + + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + self.start = start + self.end = end + + self.n_node_samples = end - start + + self.sq_sum_total = 0.0 + self.weighted_n_node_samples = 0. + + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t y_ik + cdef DOUBLE_t w_y_ik + cdef DOUBLE_t w = 1.0 + + memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) + + # Init jacobian matrix to zero + self.J[:] = 0.0 + + # init parameter parameter to zero + self.parameter_pre[:] = 0.0 + self.parameter[:] = 0.0 + + # perform the first loop to aggregate + for p in range(start, end): + i = self.sample_indices[p] + + if self.sample_weight is not None: + w = self.sample_weight[i] + self.weighted_n_node_samples += w + + # compute the Jacobian for this sample + self.compute_sample_jacobian( + self.J, self.pointJ, w, i) + + # compute the pre-conditioned parameter + self.compute_sample_preparameter( + self.parameter_pre, + self.alpha, + w, + i + ) + + # compute the inverse-Jacobian + self.compute_node_inv_jacobian(self.J, self.invJ) + + for p in range(start, end): + i = self.sample_indices[p] + if self.sample_weight is not None: + w = self.sample_weight[i] + + self.compute_sample_parameter( + DOUBLE_t[:] parameter, + DOUBLE_t[:] parameter_pre, + DOUBLE_t[:, ::1] invJ + ) + # next compute the moment for this sample + self.compute_sample_moment( + self.moment, + self.alpha, + self.parameter, + self.pointJ, + i + ) + + # compute the pseudo-label, rho + self.compute_sample_rho( + self.moment, + self.invJ, + i + ) + + # compute statistics for the current total sum and square sum + # of the metrics for criterion, in this case the pseudo-label + for k in range(self.n_outputs): + y_ik = self.rho[i, k] + w_y_ik = w * y_ik + + self.sum_total[k] += w_y_ik + + if k < self.n_relevant_outputs + self.sq_sum_total += w_y_ik * y_ik + + # Reset to pos=start + self.reset() + + cdef int update( + self, + SIZE_t new_pos + ) except -1 nogil: + """Updated statistics by moving samples[pos:new_pos] to the left.""" + cdef SIZE_t pos = self.pos + cdef SIZE_t end = self.end + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + + # Update statistics up to new_pos + # + # Given that + # sum_left[x] + sum_right[x] = sum_total[x] + # var_left[x] + var_right[x] = var_total[x] + # and that sum_total 
and var_total are known, we are going to update + # sum_left and var_left from the direction that require the least amount + # of computations, i.e. from pos to new_pos or from end to new_pos. + + # The invariance of the update is that: + # sum_left[k] = sum_{i in Left} w[i] rho[i, k] + # var_left[k] = sum_{i in Left} w[i] pointJ[i, k, k] + # and similarly for the right child. Notably, the second is un-normalized, + # so to be used for further calculations it needs to be normalized by the child weight. + if (new_pos - pos) <= (end - new_pos): + for p in range(pos, new_pos): + i = self.sample_indices[p] + + if self.sample_weight is not None: + w = sample_weight[i] + + for k in range(self.n_outputs): + # we add w[i] * rho[i, k] to sum_left[k] + self.sum_left[k] += w * self.rho[i, k] + # we add w[i] * J[i, k, k] to var_left[k] + # self.var_left[k] += w * self.pointJ[i, k, k] + + self.weighted_n_left += w + else: + self.reverse_reset() + + for p in range(end - 1, new_pos - 1, -1): + i = self.sample_indices[p] + + if self.sample_weight is not None: + w = sample_weight[i] + + for k in range(self.n_outputs): + # we subtract w[i] * rho[i, k] from sum_left[k] + self.sum_left[k] -= w * self.rho[i, k] + # we subtract w[i] * J[i, k, k] from var_left[k] + # self.var_left[k] -= w * self.pointJ[i, k, k] + + self.weighted_n_left -= w + + self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_left) + for k in range(self.n_outputs): + self.sum_right[k] = self.sum_total[k] - self.sum_left[k] + self.var_right[k] = self.var_total[k] - self.var_left[k] + + self.pos = new_pos + return 0 + + +cdef class LinearMomentCriterion(RegressionCriterion): + r"""Criterion based on the solution of a linear moment equation. + + A criterion class that estimates local parameters defined via linear moment equations + of the form:: + + E[ m(J, A; theta(x)) | X=x] + + where our specific instance is a linear moment equation:: + + E[ J * theta(x) - A | X=x] = 0 + + where: + + - m( . ; theta(x)) is the moment parametrized by J and A + - J is the jacobian of the node: J(Node) = E[J | X in Node] + - A is the random vector of the linear moment equation for each sample: A[i] + - theta(x): parameters + - X=x: instance of covariates + + Calculates impurity based on heterogeneity induced on the estimated parameters, based on + the proxy score defined in :footcite:`Athey2016GeneralizedRF`. + + Calculates proxy labels for each sample:: + + rho[i] := - J(Node)^{-1} (J[i] * theta(Node) - A[i]) + J(Node) := E[J[i] | X[i] in Node] + theta(Node) := J(Node)^{-1} E[A[i] | X[i] in Node] + + Then uses as proxy_impurity_improvement for a split (Left, Right) the quantity:: + + sum_{k=1}^{n_relevant_outputs} E[rho[i, k] | X[i] in Left]^2 + E[rho[i, k] | X[i] in Right]^2 + + Stores as node impurity the quantity:: + + sum_{k=1}^{n_relevant_outputs} Var(rho[i, k] | X[i] in Node) + = sum_{k=1}^{n_relevant_outputs} E[rho[i, k]^2 | X[i] in Node] - E[rho[i, k] | X[i] in Node]^2 + + """ + + def __cinit__( + self, + SIZE_t n_outputs, + SIZE_t n_samples, + SIZE_t n_relevant_outputs, + SIZE_t n_y, + ): + """Initialize parameters for this criterion. Parent `__cinit__` is always called before children. + So we only perform extra initializations that were not perfomed by the parent classes. + + Parameters + ---------- + n_outputs : SIZE_t + The number of parameters/values to be estimated + n_samples : SIZE_t + The total number of rows in the 2d matrix y. + The total number of samples to fit on. 
+ n_relevant_outputs : SIZE_t + We only care about the first n_relevant_outputs of these parameters/values + n_y : SIZE_t + The first n_y columns of the 2d matrix y, contain the raw labels y_{ik}, the rest are auxiliary variables + """ + + # Most initializations are handled by __cinit__ of RegressionCriterion + # which is always called in cython. We initialize the extras. + if n_y > 1: + raise AttributeError("LinearMomentGRFCriterion currently only supports a scalar y") + + self.proxy_children_impurity = True # The children_impurity() only returns an approximate proxy + + self.n_relevant_outputs = n_relevant_outputs + self.n_y = n_y + + # Allocate memory for the proxy for y, which rho in the generalized random forest + # Since rho is node dependent it needs to be re-calculated and stored for each sample + # in the node for every node we are investigating + self.rho = np.zeros((n_samples, n_outputs), dtype=np.float64) + self.moment = np.zeros((n_samples, n_outputs), dtype=np.float64) + self.J = np.zeros((n_outputs, n_outputs), dtype=np.float64) + self.invJ = np.zeros((n_outputs, n_outputs), dtype=np.float64) + self.parameter = np.zeros(n_outputs, dtype=np.float64) + self.parameter_pre = np.zeros(n_outputs, dtype=np.float64) + + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) nogil except -1: + # Initialize fields + self.y = y + self.sample_weight = sample_weight + self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + cdef SIZE_t n_features = self.n_features + cdef SIZE_t n_outputs = self.n_outputs + cdef SIZE_t n_y = self.n_y + + self.y = y[:, :n_y] # The first n_y columns of y are the original raw outcome + self.alpha = y[:, n_y:(n_y + n_outputs)] # A[i] part of the moment is the next n_outputs columns + # J[i] part of the moment is the next n_outputs * n_outputs columns, stored in Fortran contiguous format + self.pointJ = y[:, (n_y + n_outputs):(n_y + n_outputs + n_outputs * n_outputs)] + self.sample_weight = sample_weight # Store the sample_weight locally + self.samples = samples # Store the sample index structure used and updated by the splitter + self.weighted_n_samples = weighted_n_samples # Store total weight of all samples computed by splitter + + return 0 + + cdef double node_impurity(self) nogil: + """Evaluate the impurity of the current node, i.e. the impurity of + samples[start:end]. We use as node_impurity the proxy quantity: + sum_{k=1}^{n_relevant_outputs} Var(rho[i, k] | i in Node) / n_relevant_outputs + = sum_{k=1}^{n_relevant_outputs} (E[rho[i, k]^2 | i in Node] - E[rho[i, k] | i in Node]^2) / n_relevant_outputs + """ + + cdef double* sum_total = self.sum_total + cdef double impurity + cdef SIZE_t k + + impurity = self.sq_sum_total / self.weighted_n_node_samples + for k in range(self.n_relevant_outputs): + impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + + return impurity / self.n_relevant_outputs + + cdef double proxy_impurity_improvement(self) nogil: + """Compute a proxy of the impurity reduction. + + This method is used to speed up the search for the best split. It is a proxy quantity such that the + split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. 
+ + Here we use the quantity: + sum_{k=1}^{n_relevant_outputs} sum_{child in {Left, Right}} weight(child) * E[rho[i, k] | i in child]^2 + Since: + E[rho[i, k] | i in child] = sum_{i in child} w[i] rhp[i, k] / weight(child) = sum_child[k] / weight(child) + This simplifies to: + sum_{k=1}^{n_relevant_outputs} sum_{child in {Left, Right}} sum_child[k]^2 / weight(child) + """ + + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + + cdef SIZE_t k + cdef double proxy_impurity_left = 0.0 + cdef double proxy_impurity_right = 0.0 + + for k in range(self.n_relevant_outputs): + proxy_impurity_left += sum_left[k] * sum_left[k] + proxy_impurity_right += sum_right[k] * sum_right[k] + + return (proxy_impurity_left / self.weighted_n_left + + proxy_impurity_right / self.weighted_n_right) + + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + """Evaluate the impurity in children nodes, i.e. the impurity of the + left child (samples[start:pos]) and the impurity the right child + (samples[pos:end]). Here we use the proxy child impurity: + impurity_child[k] = sum_{i in child} w[i] rho[i, k]^2 / weight(child) + - (sum_{i in child} w[i] * rho[i, k] / weight(child))^2 + impurity_child = sum_{k in n_relevant_outputs} impurity_child[k] / n_relevant_outputs + """ + + cdef DOUBLE_t* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + cdef SIZE_t* node_index_mapping = self.node_index_mapping + cdef SIZE_t pos = self.pos + cdef SIZE_t start = self.start + + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + cdef DOUBLE_t y_ik + + cdef double sq_sum_left = 0.0 + cdef double sq_sum_right + + cdef SIZE_t i, p, k, offset + cdef DOUBLE_t w = 1.0 + + # We calculate: sq_sum_left = sum_{i in child} w[i] rho[i, k]^2 + for p in range(start, pos): + i = samples[p] + offset = node_index_mapping[i] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_relevant_outputs): + y_ik = self.rho[offset * self.n_outputs + k] + sq_sum_left += w * y_ik * y_ik + # We calculate sq_sum_right = sq_sum_total - sq_sum_left + sq_sum_right = self.sq_sum_total - sq_sum_left + + # We normalize each sq_sum_child by the weight of that child + impurity_left[0] = sq_sum_left / self.weighted_n_left + impurity_right[0] = sq_sum_right / self.weighted_n_right + + # We subtract from the impurity_child, the quantity: + # sum_{k in n_relevant_outputs} (sum_{i in child} w[i] * rho[i, k] / weight(child))^2 + # = sum_{k in n_relevant_outputs} (sum_child[k] / weight(child))^2 + for k in range(self.n_relevant_outputs): + impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + + impurity_left[0] /= self.n_relevant_outputs + impurity_right[0] /= self.n_relevant_outputs + + cdef int update(self, SIZE_t new_pos) except -1 nogil: + """Updated statistics by moving samples[pos:new_pos] to the left.""" + cdef const DOUBLE_t[:] sample_weight = self.sample_weight + cdef const SIZE_t[:] sample_indices = self.sample_indices + + cdef SIZE_t* node_index_mapping = self.node_index_mapping + + cdef SIZE_t pos = self.pos + cdef SIZE_t end = self.end + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + + cdef SIZE_t offset + + # Update statistics up to new_pos + # + # Given that + # sum_left[x] + sum_right[x] = sum_total[x] + # var_left[x] + var_right[x] = var_total[x] + # and that sum_total and var_total are known, we are going to update + # sum_left and var_left 
from the direction that require the least amount + # of computations, i.e. from pos to new_pos or from end to new_pos. + + # The invariance of the update is that: + # sum_left[k] = sum_{i in Left} w[i] rho[i, k] + # var_left[k] = sum_{i in Left} w[i] pointJ[i, k, k] + # and similarly for the right child. Notably, the second is un-normalized, + # so to be used for further calculations it needs to be normalized by the child weight. + if (new_pos - pos) <= (end - new_pos): + for p in range(pos, new_pos): + i = samples[p] + offset = node_index_mapping[i] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + # we add w[i] * rho[i, k] to sum_left[k] + self.sum_left[k] += w * self.rho[i, k] + # we add w[i] * J[i, k, k] to var_left[k] + self.var_left[k] += w * self.pointJ[i, k, k] + + self.weighted_n_left += w + else: + self.reverse_reset() + + for p in range(end - 1, new_pos - 1, -1): + i = sample_indices[p] + offset = node_index_mapping[i] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + # we subtract w[i] * rho[i, k] from sum_left[k] + self.sum_left[k] -= w * self.rho[i, k] + # we subtract w[i] * J[i, k, k] from var_left[k] + self.var_left[k] -= w * self.pointJ[i, k, k] + + self.weighted_n_left -= w + + self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_left) + for k in range(self.n_outputs): + self.sum_right[k] = self.sum_total[k] - self.sum_left[k] + self.var_right[k] = self.var_total[k] - self.var_left[k] + + self.pos = new_pos + return 0 + + cdef void node_value(self, double* dest) nogil: + """Return the estimated node parameter of samples[start:end] into dest.""" + memcpy(dest, self.parameter, self.n_outputs * sizeof(double)) + + cdef void node_jacobian(self, double* dest) nogil: + """Return the node normalized Jacobian of samples[start:end] into dest in a C contiguous format.""" + cdef SIZE_t i, j + # Jacobian is stored in f-contiguous format for fortran. We translate it to c-contiguous for + # user interfacing. Moreover, we normalize by weight(node). + cdef SIZE_t n_outputs = self.n_outputs + for i in range(n_outputs): + for j in range(n_outputs): + dest[i * n_outputs + j] = self.J[i + j * n_outputs] / self.weighted_n_node_samples + + cdef void node_precond(self, double* dest) nogil: + """Return the normalized node preconditioned value of samples[start:end] into dest.""" + cdef SIZE_t i + for i in range(self.n_outputs): + dest[i] = self.parameter_pre[i] / self.weighted_n_node_samples + + + cdef double min_eig_left(self) nogil: + """ Calculate proxy for minimum eigenvalue of jacobian of left child. Here we simply + use the minimum absolute value of the diagonals of the jacobian. This proxy needs to be + super fast as this calculation happens for every candidate split. So we cannot afford + anything that is not very simple calculation. We tried a power iteration approximation + algorithm implemented in `_utils.fast_min_eigv_()`. But even such power iteration algorithm + is slow as it requires calculating a pseudo-inverse first. + """ + cdef int i + cdef double min, abs + min = fabs(self.var_left[0]) + for i in range(self.n_outputs): + abs = fabs(self.var_left[i]) + if abs < min: + min = abs + # The `update()` method maintains that var_left[k] = J_left[k, k], where J_left is the + # un-normalized jacobian of the left child. 
Thus we normalize by weight(left) + return min / self.weighted_n_left + + cdef double min_eig_right(self) nogil: + """ Calculate proxy for minimum eigenvalue of jacobian of right child + (see min_eig_left for more details). + """ + cdef int i + cdef double min, abs + min = fabs(self.var_right[0]) + for i in range(self.n_outputs): + abs = fabs(self.var_right[i]) + if abs < min: + min = abs + return min / self.weighted_n_right + + + + +cdef void _fast_invJ(DOUBLE_t* J, DOUBLE_t* invJ, SIZE_t n, double clip) nogil: + """Fast inversion of a 2x2 array.""" + cdef double det + if n == 1: + invJ[0] = 1.0 / J[0] if fabs(J[0]) >= clip else 1/clip # Explicit inverse calculation + elif n == 2: + # Explicit inverse calculation + det = J[0] * J[3] - J[1] * J[2] + if fabs(det) < clip: + det = clip + invJ[0] = J[3] / det + invJ[1] = - J[1] / det + invJ[2] = - J[2] / det + invJ[3] = J[0] / det diff --git a/sktree/causal/_splitter.pxd b/sktree/causal/_splitter.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/sktree/causal/_splitter.pyx b/sktree/causal/_splitter.pyx new file mode 100644 index 000000000..e69de29bb diff --git a/sktree/causal/_utils.pxd b/sktree/causal/_utils.pxd new file mode 100644 index 000000000..55808d76e --- /dev/null +++ b/sktree/causal/_utils.pxd @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import numpy as np +cimport numpy as np + +ctypedef np.npy_float64 DTYPE_t # Type of X +ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight +ctypedef np.npy_intp SIZE_t # Type for indices and counters +ctypedef np.npy_int32 INT32_t # Signed 32 bit integer +ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer + +cpdef bint matinv(DOUBLE_t[::1, :] a, DOUBLE_t[::1, :] inv_a) nogil + +cdef bint matinv_(DOUBLE_t* a, DOUBLE_t* inv_a, int m) nogil + +cpdef void lstsq(DOUBLE_t[::1,:] a, DOUBLE_t[::1,:] b, DOUBLE_t[::1, :] sol, bint copy_b=*) nogil + +cdef void lstsq_(DOUBLE_t* a, DOUBLE_t* b, DOUBLE_t* sol, int m, int n, int ldb, int nrhs, bint copy_b=*) nogil + +cpdef void pinv(DOUBLE_t[::1,:] a, DOUBLE_t[::1, :] sol) nogil + +cdef void pinv_(DOUBLE_t* a, DOUBLE_t* sol, int m, int n) nogil + +cpdef double fast_max_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil + +cdef double fast_max_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil + +cpdef double fast_min_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil + +cdef double fast_min_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil \ No newline at end of file diff --git a/sktree/causal/_utils.pyx b/sktree/causal/_utils.pyx new file mode 100644 index 000000000..ac536fbbc --- /dev/null +++ b/sktree/causal/_utils.pyx @@ -0,0 +1,278 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
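# NOTE (illustrative reference, added for exposition; not part of the patch): a plain NumPy
# statement of what the LAPACK-backed helpers declared in _utils.pxd above are expected to
# return, which may help when reading the implementations below. The reference_* names are
# illustrative and assume a symmetric input where eigenvalues are requested.
import numpy as np

def reference_matinv(a):        # ~ matinv / matinv_
    return np.linalg.inv(a)

def reference_lstsq(a, b):      # ~ lstsq / lstsq_: argmin_sol ||b - a @ sol||_2^2
    rcond = np.finfo(np.float64).eps * max(a.shape)
    return np.linalg.lstsq(a, b, rcond=rcond)[0]

def reference_pinv(a):          # ~ pinv / pinv_ (computed via least squares vs. identity)
    return np.linalg.pinv(a)

def reference_extreme_eigs(a):  # ~ fast_min_eigv / fast_max_eigv (exact, not randomized)
    eigs = np.linalg.eigvalsh(a)
    return eigs[0], eigs[-1]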
+ +from libc.stdlib cimport free +from libc.stdlib cimport malloc +from libc.stdlib cimport calloc +from libc.stdlib cimport realloc +from libc.string cimport memcpy +from libc.math cimport log as ln +from libc.stdlib cimport abort + +from scipy.linalg.cython_lapack cimport dgelsy, dgetrf, dgetri, dgecon, dlacpy, dlange + +import numpy as np +cimport numpy as np +np.import_array() + +from sklearn.tree._utils cimport rand_int + + +rcond_ = np.finfo(np.float64).eps +cdef inline double RCOND = rcond_ + + +# ============================================================================= +# Linear Algebra Functions +# ============================================================================= + + +cpdef bint matinv(DOUBLE_t[::1, :] a, DOUBLE_t[::1, :] inv_a) nogil: + """ Compute matrix inverse and store it in inv_a. + """ + cdef int m, n + m = a.shape[0] + if not (m == a.shape[1]): + raise ValueError("Can only invert square matrices!") + return matinv_(&a[0, 0], &inv_a[0, 0], m) + +cdef bint matinv_(DOUBLE_t* a, DOUBLE_t* inv_a, int m) nogil: + """ Compute matrix inverse of matrix a of size (m, m) and store it in inv_a. + """ + cdef: + int* pivot + DOUBLE_t* work + int lda, INFO, Lwork + bint failed + + lda = m + Lwork = m**2 + pivot = malloc(m * sizeof(int)) + work = malloc(Lwork * sizeof(DOUBLE_t)) + failed = False + if (pivot==NULL or work==NULL): + with gil: + raise MemoryError() + + try: + memcpy(inv_a, a, m * m * sizeof(DOUBLE_t)) + + #Conduct the LU factorization of the array a + dgetrf(&m, &m, inv_a, &lda, pivot, &INFO) + if not (INFO == 0): + failed = True + else: + #Now use the LU factorization and the pivot information to invert + dgetri(&m, inv_a, &lda, pivot, work, &Lwork, &INFO) + if not (INFO == 0): + failed = True + finally: + free(pivot) + free(work) + + return (not failed) + + + +cpdef void lstsq(DOUBLE_t[::1, :] a, DOUBLE_t[::1, :] b, DOUBLE_t[::1, :] sol, bint copy_b=True) nogil: + """ Compute solution to least squares problem min ||b - a sol||_2^2, + where a is a matrix of size (m, n), b is (m, nrhs). Store (n, nrhs) solution in sol. + The memory view b, must have at least max(m, n) rows. If m < n, then pad remainder with zeros. + If copy_b=True, then b is left unaltered on output. Otherwise b is altered by this call. + """ + cdef int m, n, nrhs + m = a.shape[0] + n = a.shape[1] + nrhs = b.shape[1] + ldb = b.shape[0] + if ldb < max(m, n): + with gil: + raise ValueError("Matrix b must have first dimension at least max(a.shape[0], a.shape[1]). " + "Please pad with zeros.") + if (sol.shape[0] != n) or (sol.shape[1] != nrhs): + with gil: + raise ValueError("Matrix sol must have dimensions (a.shape[1], b.shape[1]).") + lstsq_(&a[0, 0], &b[0, 0], &sol[0, 0], m, n, ldb, nrhs, copy_b) + + +cdef void lstsq_(DOUBLE_t* a, DOUBLE_t* b, DOUBLE_t* sol, int m, int n, int ldb, int nrhs, bint copy_b=True) nogil: + """ Compute solution to least squares problem min ||b - a sol||_2^2, + where a is a matrix of size (m, n), b is (m, nrhs). Store (n, nrhs) solution in sol. + The leading (row) dimension b, must be at least max(m, n). If m < n, then pad remainder with zeros. + If copy_b=True, then b is left unaltered on output. Otherwise b is altered by this call. 
+ """ + cdef: + int lda, rank, info, lwork, n_out + double rcond + Py_ssize_t i, j + #array pointers + int* jpvt + double* work + double* b_copy + char* UPLO = 'O' #Any letter other then 'U' or 'L' will copy entire array + lda = m + if ldb < max(m, n): + with gil: + raise ValueError("Matrix b must have dimension at least max(a.shape[0], a.shape[1]). " + "Please pad with zeros.") + rcond = max(m, n) * RCOND + jpvt = calloc(n, sizeof(int)) + lwork = max(min(n, m) + 3 * n + 1, 2 * min(n, m) + nrhs) + work = malloc(lwork * sizeof(DOUBLE_t)) + + # TODO. can we avoid all this malloc and copying in our context? + a_copy = calloc(lda * n, sizeof(DOUBLE_t)) + if copy_b: + b_copy = calloc(ldb * nrhs, sizeof(DOUBLE_t)) + else: + b_copy = b + try: + dlacpy(UPLO, &lda, &n, a, &lda, a_copy, &lda) + if copy_b: + dlacpy(UPLO, &ldb, &nrhs, b, &ldb, b_copy, &ldb) + + dgelsy(&m, &n, &nrhs, a_copy, &lda, b_copy, &ldb, + &jpvt[0], &rcond, &rank, &work[0], &lwork, &info) + + for i in range(n): + for j in range(nrhs): + sol[i + j * n] = b_copy[i + j * ldb] + + finally: + free(jpvt) + free(work) + free(a_copy) + if copy_b: + free(b_copy) + +cpdef void pinv(DOUBLE_t[::1,:] a, DOUBLE_t[::1, :] sol) nogil: + """ Compute pseudo-inverse of (m, n) matrix a and store it in (n, m) matrix sol. + Matrix a is left un-altered by this call. + """ + cdef int m = a.shape[0] + cdef int n = a.shape[1] + pinv_(&a[0, 0], &sol[0, 0], m, n) + +cdef void pinv_(DOUBLE_t* a, DOUBLE_t* sol, int m, int n) nogil: + """ Compute pseudo-inverse of (m, n) matrix a and store it in (n, m) matrix sol. + Matrix a is left un-altered by this call. + """ + # TODO. can we avoid this mallon in our context. Maybe create some fixed memory allocations? + cdef int ldb = max(m, n) + cdef double* b = calloc(ldb * m, sizeof(double)) + cdef Py_ssize_t i + for i in range(m): + b[i + i * ldb] = 1.0 + try: + lstsq_(a, b, sol, m, n, ldb, m, copy_b=False) + + finally: + free(b) + + +cpdef double fast_max_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil: + """ Calculate approximation of maximum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state + """ + return fast_max_eigv_(&A[0, 0], A.shape[0], reps, &random_state) + +cdef double fast_max_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil: + """ Calculate approximation of maximum eigenvalue via randomized power iteration algorithm. 
+ See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state + """ + cdef int t, i, j + cdef double normx, Anormx + cdef double* xnew + cdef double* xold + cdef double* temp + xnew = NULL + xold = NULL + + try: + xnew = calloc(n, sizeof(double)) + xold = calloc(n, sizeof(double)) + + if xnew == NULL or xold == NULL: + with gil: + raise MemoryError() + for i in range(n): + xold[i] = (1 - 2*rand_int(0, 2, random_state)) + for t in range(reps): + for i in range(n): + xnew[i] = 0 + for j in range(n): + xnew[i] += A[i + j * n] * xold[j] + temp = xold + xold = xnew + xnew = temp + normx = 0 + Anormx = 0 + for i in range(n): + normx += xnew[i] * xnew[i] + for j in range(n): + Anormx += xnew[i] * A[i + j * n] * xnew[j] + + return Anormx / normx + finally: + free(xnew) + free(xold) + + +cpdef double fast_min_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil: + """ Calculate approximation of minimum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state + """ + return fast_min_eigv_(&A[0, 0], A.shape[0], reps, &random_state) + +cdef double fast_min_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil: + """ Calculate approximation of minimum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state. + """ + cdef int t, i, j + cdef double normx, Anormx + cdef double* xnew + cdef double* xold + cdef double* temp + cdef double* update + xnew = NULL + xold = NULL + + try: + xnew = calloc(n, sizeof(double)) + xold = calloc(n, sizeof(double)) + update = calloc(n, sizeof(double)) + + if xnew == NULL or xold == NULL or update == NULL: + with gil: + raise MemoryError() + for i in range(n): + xold[i] = (1 - 2*rand_int(0, 2, random_state)) + for t in range(reps): + lstsq_(A, xold, update, n, n, n, 1, copy_b=False) + for i in range(n): + xnew[i] = 0 + for j in range(n): + xnew[i] += update[i] + temp = xold + xold = xnew + xnew = temp + normx = 0 + Anormx = 0 + for i in range(n): + normx += xnew[i] * xnew[i] + for j in range(n): + Anormx += xnew[i] * A[i + j * n] * xnew[j] + + return Anormx / normx + finally: + free(xnew) + free(xold) + free(update) diff --git a/sktree/causal/meson.build b/sktree/causal/meson.build index 904e85457..784ca03f3 100644 --- a/sktree/causal/meson.build +++ b/sktree/causal/meson.build @@ -1,3 +1,19 @@ +extensions = [ + '_grf_criterion', + '_splitter', + '_utils', +] + +foreach ext: extensions + py3.extension_module(ext, + cython_gen_cpp.process(ext + '.pyx'), + c_args: cython_c_args, + include_directories: [incdir_numpy], + install: true, + subdir: 'sktree/causal', + ) +endforeach + python_sources = [ '__init__.py', 'tree.py', diff --git a/sktree/causal/tests/test_utils.py b/sktree/causal/tests/test_utils.py new file mode 100644 index 000000000..b886f3bad --- /dev/null +++ b/sktree/causal/tests/test_utils.py @@ -0,0 +1,46 @@ +import numpy as np + +from sktree.causal._utils import matinv, lstsq, pinv, fast_max_eigv, fast_min_eigv + + +def test_fast_eigv(self): + rng = np.random.default_rng(123) + + n = 4 + for _ in range(10): + A = rng.normal(0, 1, size=(n, n)) + A = np.asfortranarray(A @ A.T) + apx = fast_min_eigv(A, 5, 123) + opt = np.min(np.linalg.eig(A)[0]) + np.testing.assert_allclose(apx, opt, atol=.01, rtol=.3) 
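# NOTE (illustrative sketch, added for exposition; not part of the patch): a pure-NumPy
# version of the randomized power iteration used by fast_max_eigv_ and the inverse
# iteration used by fast_min_eigv_ above. The +/-1 random start and the Rayleigh-quotient
# estimate mirror the Cython code; everything else is simplified for readability.
import numpy as np

def approx_max_eig(A, reps=10, seed=123):
    rng = np.random.default_rng(seed)
    x = 1.0 - 2.0 * rng.integers(0, 2, size=A.shape[0])   # random +/-1 start vector
    for _ in range(reps):
        x = A @ x                                          # power iteration step
    return (x @ A @ x) / (x @ x)                           # Rayleigh quotient estimate

def approx_min_eig(A, reps=10, seed=123):
    rng = np.random.default_rng(seed)
    x = 1.0 - 2.0 * rng.integers(0, 2, size=A.shape[0])
    for _ in range(reps):
        x = np.linalg.lstsq(A, x, rcond=None)[0]           # inverse iteration step
    return (x @ A @ x) / (x @ x)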
+ apx = fast_max_eigv(A, 10, 123) + opt = np.max(np.linalg.eig(A)[0]) + np.testing.assert_allclose(apx, opt, atol=.5, rtol=.2) + + +def test_linalg(): + rng = np.random.default_rng(1234) + + for n, m, nrhs in [(3, 3, 3), (3, 2, 1), (3, 1, 2), (1, 4, 2), (3, 4, 5)]: + for _ in range(100): + A = rng.normal(0, 1, size=(n, m)) + y = rng.normal(0, 1, size=(n, nrhs)) + yf = y + if m > n: + yf = np.zeros((m, nrhs)) + yf[:n] = y + ours = np.asfortranarray(np.zeros((m, nrhs))) + lstsq(np.asfortranarray(A), np.asfortranarray(yf.copy()), ours, copy_b=True) + true = np.linalg.lstsq(A, y, rcond=np.finfo(np.float64).eps * max(n, m))[0] + np.testing.assert_allclose(ours, true, atol=.00001, rtol=.0) + + ours = np.asfortranarray(np.zeros(A.T.shape, dtype=np.float64)) + pinv(np.asfortranarray(A), ours) + true = np.linalg.pinv(A) + np.testing.assert_allclose(ours, true, atol=.00001, rtol=.0) + + if n == m: + ours = np.asfortranarray(np.zeros(A.T.shape, dtype=np.float64)) + matinv(np.asfortranarray(A), ours) + true = np.linalg.inv(A) + np.testing.assert_allclose(ours, true, atol=.00001, rtol=.0) From b21c1c1bdec367f5a33a2dad614905ac1dffd84b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 23 Mar 2023 11:02:28 -0400 Subject: [PATCH 4/4] Add finished work until questions are answered for grf Signed-off-by: Adam Li --- sktree/causal/_grf_criterion.pxd | 84 +---- sktree/causal/_grf_criterion.pyx | 551 +++++++++++++++---------------- 2 files changed, 264 insertions(+), 371 deletions(-) diff --git a/sktree/causal/_grf_criterion.pxd b/sktree/causal/_grf_criterion.pxd index 19f43ab4c..02b3fec1f 100644 --- a/sktree/causal/_grf_criterion.pxd +++ b/sktree/causal/_grf_criterion.pxd @@ -15,10 +15,7 @@ from sklearn.tree._tree cimport SIZE_t # Type for indices and counters from sklearn.tree._criterion cimport Criterion, RegressionCriterion -cdef class CausalCriterion(RegressionCriterion): - - -cdef class MomentCriterion(RegressionCriterion): +cdef class GeneralizedMomentCriterion(RegressionCriterion): # The A random vector of the linear moment equation for each sample of size (n_samples, n_outputs) # these are the "weights" that are pre-computed. cdef const DOUBLE_t[:, ::1] alpha @@ -76,82 +73,3 @@ cdef class MomentCriterion(RegressionCriterion): DOUBLE_t[:, ::1] J, DOUBLE_t[:, ::1] invJ, ) except -1 nogil - -cdef class LinearMomentCriterion(RegressionCriterion): - """ A criterion class that estimates local parameters defined via linear moment equations - of the form: - - E[ m(J, A; theta(x)) | X=x] = E[ J * theta(x) - A | X=x] = 0 - - Calculates impurity based on heterogeneity induced on the estimated parameters, based on the proxy score - defined in the Generalized Random Forest paper: - Athey, Susan, Julie Tibshirani, and Stefan Wager. "Generalized random forests." - The Annals of Statistics 47.2 (2019): 1148-1178 - https://arxiv.org/pdf/1610.01271.pdf - - **How do we utilize the abstract Criterion data structure?** - - `sum_left`, `sum_right` represent for a sample i and each output k, the rho[i, k] terms per output weighted - by sample_weight[i, k]. - - `sum_total` keeps track of the previous sums. - - `sq_sum_total` keeps track of the square sum total ``sample_weight[i, k] * rho[i, k] * rho[i, k]``. - - - `n_revelant_outputs` should now store the relevant outputs, which should correspond to the dimensionality - of the treatment vector. - - """ - # The A random vector of the linear moment equation for each sample of size (n_samples, n_outputs) - # these are the "weights" that are pre-computed. 
- cdef const DOUBLE_t[:, ::1] alpha - - # the approximate Jacobian evaluated at every single sample point - # random vector of the linear moment equation - # size (n_samples, n_outputs, n_outputs) - cdef const DOUBLE_t[:, :, ::1] pointJ - - cdef DOUBLE_t[:, ::1] rho # Proxy heterogeneity label: rho = E[J | X in Node]^{-1} m(J, A; theta(Node)) of shape (`n_samples`, `n_outputs`) - cdef DOUBLE_t[:, ::1] moment # Moment for each sample: m(J, A; theta(Node)) of shape (`n_samples`, `n_outputs`) - cdef DOUBLE_t[:, ::1] J # Node average jacobian: J(Node) = E[J | X in Node] of shape (n_outputs, n_outputs) - cdef DOUBLE_t[:, ::1] invJ # Inverse of node average jacobian: J(Node)^{-1} of shape (n_outputs, n_outputs) - - cdef int compute_sample_preparameter( - self, - DOUBLE_t[:] parameter, - DOUBLE_t[:] parameter_pre, - DOUBLE_t[:, ::1] invJ, - SIZE_t[:] samples_index, - DOUBLE_t[:] sample_weight, - ) nogil except -1 - cdef int compute_sample_parameter( - self, - DOUBLE_t[:] parameter, - DOUBLE_t[:] parameter_pre, - DOUBLE_t[:, ::1] invJ - ) nogil except -1 - cdef int compute_sample_moment( - self, - DOUBLE_t[:, ::1] moment, - DOUBLE_t[:, ::1] parameter, - const DOUBLE_t[:, :, ::1] pointJ, - SIZE_t sample_idx, - ) except -1 nogil - - cdef int node_reset_jacobian(self, DOUBLE_t* J, DOUBLE_t* invJ, double* weighted_n_node_samples, - const DOUBLE_t[:, ::1] pointJ, - DOUBLE_t* sample_weight, - SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1 - cdef int node_reset_parameter(self, DOUBLE_t* parameter, DOUBLE_t* parameter_pre, - DOUBLE_t* invJ, - const DOUBLE_t[:, ::1] alpha, - DOUBLE_t* sample_weight, double weighted_n_node_samples, - SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1 - cdef int node_reset_rho(self, DOUBLE_t* rho, DOUBLE_t* moment, SIZE_t* node_index_mapping, - DOUBLE_t* parameter, DOUBLE_t* invJ, double weighted_n_node_samples, - const DOUBLE_t[:, ::1] pointJ, const DOUBLE_t[:, ::1] alpha, - DOUBLE_t* sample_weight, SIZE_t* samples, - SIZE_t start, SIZE_t end) nogil except -1 - cdef int node_reset_sums(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* rho, - DOUBLE_t* J, - DOUBLE_t* sample_weight, SIZE_t* samples, - DOUBLE_t* sum_total, DOUBLE_t* var_total, - DOUBLE_t* sq_sum_total, DOUBLE_t* y_sq_sum_total, - SIZE_t start, SIZE_t end) nogil except -1 diff --git a/sktree/causal/_grf_criterion.pyx b/sktree/causal/_grf_criterion.pyx index 35f6c344c..90872859a 100644 --- a/sktree/causal/_grf_criterion.pyx +++ b/sktree/causal/_grf_criterion.pyx @@ -23,8 +23,267 @@ from ._utils cimport matinv_, pinv_ cdef double INFINITY = np.inf +# TODO: OPEN QUESTIONS +# 1. why is alpha = y * T and y * Z in CATE and iV forests? +# 2. why is pointJ = T x T and T x Z? + +cdef class GeneralizedMomentCriterion(RegressionCriterion): + """A generalization of the regression criterion in scikit-learn. + + Generalized criterion with moment equations was introduced in the generalized + random forest paper, which shows how many common forests are trained using this + framework of criterion. + + A criterion class that estimates local parameters defined via moment equations + of the form:: + + E[ m(J, A; theta(x)) | X=x] + + where our specific instance is a linear moment equation:: + + E[ J * theta(x) - A | X=x] = 0 + + where: + + - m(J, A; theta(x)) is the moment equation that is specified per modeling setup. 
+ - J is the Jacobian array of shape (n_outputs, n_outputs) per sample + - A is the alpha weights of shape (n_outputs) per sample + - theta(x) is the parameters we are interested in of shape (n_outputs) per sample + - X is the data matrix of shape (n_samples, n_features) + + We have the following given points: + + - alpha is the weights per sample of shape (n_samples, n_outputs) + - pointJ is the pointwise Jacobian array per sample of shape (n_samples, n_outputs, n_outputs) + + Then we have the following estimating equations: + + J(Node) := E[J[i] | X[i] in Node] = sum_{i in Node} w[i] J[i] + moment[i] := J[i] * theta(Node) - A[i] + rho[i] := - J(Node)^{-1} (J[i] * theta(Node) - A[i]) + theta_pre(node) := E[A[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] A[i] + theta(Node) := J(Node)^{-1} E[A[i] | X[i] in Node] = J(node)^{-1} theta_pre(node) + + Notes + ----- + Calculates impurity based on heterogeneity induced on the estimated parameters, based on + the proxy score defined in :footcite:`Athey2016GeneralizedRF`. + + Specifically, we compute node, proxy and children impurity similarly to a + mean-squared error setting, where these are in general "proxies" for the true + criterion: + + n_{C1} * n_{C2} / n_P^2 (\hat{\\theta}_{C1} - \hat{\\theta}_{C2})^2 + + as specified in Equation 5 of the paper :footcite:`Athey2016GeneralizedRF`. + The proxy is computed with Equation 9 of the paper: + + 1/n_{C1} (\sum_{i \in C1} \\rho_i)^2 + 1/n_{C2} (\sum_{i \in C2} \\rho_i)^2 + + where :math:`\\rho_i` is the pseudo-label for the ith sample. + """ + def __cinit__( + self, + SIZE_t n_outputs, + SIZE_t n_samples, + SIZE_t n_relevant_outputs, + SIZE_t n_y, + ): + """Initialize parameters for this criterion. Parent `__cinit__` is always called before children. + So we only perform extra initializations that were not perfomed by the parent classes. + + Parameters + ---------- + n_outputs : SIZE_t + The number of parameters/values to be estimated + n_samples : SIZE_t + The total number of rows in the 2d matrix y. + The total number of samples to fit on. + n_relevant_outputs : SIZE_t + We only care about the first n_relevant_outputs of these parameters/values + n_y : SIZE_t + The first n_y columns of the 2d matrix y, contain the raw labels y_{ik}, the rest are auxiliary variables + """ + # Most initializations are handled by __cinit__ of RegressionCriterion + # which is always called in cython. We initialize the extras. 
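# NOTE (illustrative sketch, added for exposition; not part of the patch): the class
# docstring above defines the node impurity as the variance of the pseudo-labels rho and
# the Equation-9 proxy split score as the squared child sums of rho. A compact NumPy
# rendering of both, assuming rho is already restricted to the relevant outputs; the
# function and argument names are illustrative.
import numpy as np

def proxy_impurity_improvement(rho, weights, is_left):
    """sum_k sum_{child in {L, R}} (sum_{i in child} w_i rho_{ik})^2 / weight(child)."""
    score = 0.0
    for mask in (is_left, ~is_left):
        w_child = weights[mask].sum()
        sum_child = weights[mask] @ rho[mask]        # per-output weighted sums of rho
        score += (sum_child ** 2).sum() / w_child
    return score

def node_impurity(rho, weights):
    """sum_k Var_w(rho[:, k]) averaged over the relevant outputs."""
    w = weights / weights.sum()
    mean = w @ rho                                   # E[rho_k | node]
    return ((w @ rho ** 2) - mean ** 2).sum() / rho.shape[1]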
+ if n_y > 1: + raise AttributeError("LinearMomentGRFCriterion currently only supports a scalar y") + + self.proxy_children_impurity = True # The children_impurity() only returns an approximate proxy + + self.n_outputs = + self.n_relevant_outputs = n_relevant_outputs + self.n_y = n_y + + # Allocate memory for the proxy for y, which rho in the generalized random forest + # Since rho is node dependent it needs to be re-calculated and stored for each sample + # in the node for every node we are investigating + self.rho = np.zeros((n_samples, n_outputs), dtype=np.float64) + self.moment = np.zeros((n_samples, n_outputs), dtype=np.float64) + self.J = np.zeros((n_outputs, n_outputs), dtype=np.float64) + self.invJ = np.zeros((n_outputs, n_outputs), dtype=np.float64) + self.parameter = np.zeros(n_outputs, dtype=np.float64) + self.parameter_pre = np.zeros(n_outputs, dtype=np.float64) + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) nogil except -1: + """Initialize the criterion object with data. + + Parameters + ---------- + y : DOUBLE_t 2D memoryview of shape (n_samples, n_y + n_outputs + n_outputs * n_outputs) + The input y array. + sample_weight : ndarray, dtype=DOUBLE_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : double + The total weight of the samples being considered + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + + Notes + ----- + For generalized criterion, the y array has columns associated with the + actual 'y' in the first `n_y` columns, and the next `n_outputs` columns are associated + with the alpha vectors and the next `n_outputs * n_outputs` columns are + associated with the estimated sample Jacobian vectors. + + `n_relevant_outputs` is a number less than or equal to `n_outputs`, which + tracks the relevant outputs. + """ + # Initialize fields + self.y = y + self.sample_weight = sample_weight + self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + cdef SIZE_t n_features = self.n_features + cdef SIZE_t n_outputs = self.n_outputs + cdef SIZE_t n_y = self.n_y + + self.y = y[:, :n_y] # The first n_y columns of y are the original raw outcome + self.alpha = y[:, n_y:(n_y + n_outputs)] # A[i] part of the moment is the next n_outputs columns + # J[i] part of the moment is the next n_outputs * n_outputs columns, stored in Fortran contiguous format + self.pointJ = y[:, (n_y + n_outputs):(n_y + n_outputs + n_outputs * n_outputs)] + self.sample_weight = sample_weight # Store the sample_weight locally + self.samples = samples # Store the sample index structure used and updated by the splitter + self.weighted_n_samples = weighted_n_samples # Store total weight of all samples computed by splitter + + return 0 + + cdef double node_impurity( + self + ) noexcept nogil: + """Evaluate the impurity of the current node, i.e. the impurity of + samples[start:end]. + + This sums up the relevant metric over all "relevant outputs" (`n_relevant_outputs`) + for samples in the node and then normalizes to get the average impurity of the node. + Similarly, `sum_total` stores an (`n_outputs`,) array and should have been computed apriori. 
+ + This distinctly generalizes the scikit-learn Regression Criterion, as the `n_outputs` + contains both `n_relevant_outputs` and extra outputs that are nuisance parameters. But + otherwise, the node impurity calculation follows that of a regression. + """ + + cdef double* sum_total = self.sum_total + cdef double impurity + cdef SIZE_t k + + impurity = self.sq_sum_total / self.weighted_n_node_samples + for k in range(self.n_relevant_outputs): + impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + + return impurity / self.n_relevant_outputs + + cdef double proxy_impurity_improvement( + self + ) noexcept nogil: + """Compute a proxy of the impurity reduction. + + This method is used to speed up the search for the best split. It is a proxy quantity such that the + split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + + This sums up the relevant metric over all "relevant outputs" (`n_relevant_outputs`) + for samples in the node and then normalizes to get the average impurity of the node. + Similarly, `sum_left` and `sum_right` stores an (`n_outputs`,) array and should have + been computed apriori. + + This distinctly generalizes the scikit-learn Regression Criterion, as the `n_outputs` + contains both `n_relevant_outputs` and extra outputs that are nuisance parameters. But + otherwise, the node impurity calculation follows that of a regression. + """ + + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + + cdef SIZE_t k + cdef double proxy_impurity_left = 0.0 + cdef double proxy_impurity_right = 0.0 + + for k in range(self.n_relevant_outputs): + proxy_impurity_left += sum_left[k] * sum_left[k] + proxy_impurity_right += sum_right[k] * sum_right[k] + + return (proxy_impurity_left / self.weighted_n_left + + proxy_impurity_right / self.weighted_n_right) + + cdef void children_impurity( + self, + double* impurity_left, + double* impurity_right + ) noexept nogil: + """Evaluate the impurity in children nodes, i.e. the impurity of the + left child (samples[start:pos]) and the impurity the right child + (samples[pos:end]). 
Here we use the proxy child impurity: + impurity_child[k] = sum_{i in child} w[i] rho[i, k]^2 / weight(child) + - (sum_{i in child} w[i] * rho[i, k] / weight(child))^2 + impurity_child = sum_{k in n_relevant_outputs} impurity_child[k] / n_relevant_outputs + """ + cdef SIZE_t pos = self.pos + cdef SIZE_t start = self.start + cdef DOUBLE_t y_ik + + cdef double sq_sum_left = 0.0 + cdef double sq_sum_right + + cdef SIZE_t i, p, k, offset + cdef DOUBLE_t w = 1.0 + + # We calculate: sq_sum_left = sum_{i in child} w[i] rho[i, k]^2 + for p in range(start, pos): + i = self.sample_indices[p] + + if self.sample_weight is not None: + w = self.sample_weight[i] + + for k in range(self.n_relevant_outputs): + y_ik = self.rho[i, k] + sq_sum_left += w * y_ik * y_ik + # We calculate sq_sum_right = sq_sum_total - sq_sum_left + sq_sum_right = self.sq_sum_total - sq_sum_left + + # We normalize each sq_sum_child by the weight of that child + impurity_left[0] = sq_sum_left / self.weighted_n_left + impurity_right[0] = sq_sum_right / self.weighted_n_right + + # We subtract from the impurity_child, the quantity: + # sum_{k in n_relevant_outputs} (sum_{i in child} w[i] * rho[i, k] / weight(child))^2 + # = sum_{k in n_relevant_outputs} (sum_child[k] / weight(child))^2 + for k in range(self.n_relevant_outputs): + impurity_left[0] -= (self.sum_left[k] / self.weighted_n_left) ** 2.0 + impurity_right[0] -= (self.sum_right[k] / self.weighted_n_right) ** 2.0 + + impurity_left[0] /= self.n_relevant_outputs + impurity_right[0] /= self.n_relevant_outputs -cdef class MomentCriterion(RegressionCriterion): cdef int compute_sample_preparameter( self, DOUBLE_t[:] parameter_pre, @@ -183,8 +442,8 @@ cdef class MomentCriterion(RegressionCriterion): cdef SIZE_t j, k # Calculate un-normalized empirical jacobian - for k in range(self.n_outputs): - for j in range(self.n_outputs): + for j in range(self.n_outputs): + for k in range(self.n_outputs): J[j, k] += w * pointJ[sample_index, j, k] return 0 @@ -396,288 +655,6 @@ cdef class MomentCriterion(RegressionCriterion): return 0 -cdef class LinearMomentCriterion(RegressionCriterion): - r"""Criterion based on the solution of a linear moment equation. - - A criterion class that estimates local parameters defined via linear moment equations - of the form:: - - E[ m(J, A; theta(x)) | X=x] - - where our specific instance is a linear moment equation:: - - E[ J * theta(x) - A | X=x] = 0 - - where: - - - m( . ; theta(x)) is the moment parametrized by J and A - - J is the jacobian of the node: J(Node) = E[J | X in Node] - - A is the random vector of the linear moment equation for each sample: A[i] - - theta(x): parameters - - X=x: instance of covariates - - Calculates impurity based on heterogeneity induced on the estimated parameters, based on - the proxy score defined in :footcite:`Athey2016GeneralizedRF`. 
- - Calculates proxy labels for each sample:: - - rho[i] := - J(Node)^{-1} (J[i] * theta(Node) - A[i]) - J(Node) := E[J[i] | X[i] in Node] - theta(Node) := J(Node)^{-1} E[A[i] | X[i] in Node] - - Then uses as proxy_impurity_improvement for a split (Left, Right) the quantity:: - - sum_{k=1}^{n_relevant_outputs} E[rho[i, k] | X[i] in Left]^2 + E[rho[i, k] | X[i] in Right]^2 - - Stores as node impurity the quantity:: - - sum_{k=1}^{n_relevant_outputs} Var(rho[i, k] | X[i] in Node) - = sum_{k=1}^{n_relevant_outputs} E[rho[i, k]^2 | X[i] in Node] - E[rho[i, k] | X[i] in Node]^2 - - """ - - def __cinit__( - self, - SIZE_t n_outputs, - SIZE_t n_samples, - SIZE_t n_relevant_outputs, - SIZE_t n_y, - ): - """Initialize parameters for this criterion. Parent `__cinit__` is always called before children. - So we only perform extra initializations that were not perfomed by the parent classes. - - Parameters - ---------- - n_outputs : SIZE_t - The number of parameters/values to be estimated - n_samples : SIZE_t - The total number of rows in the 2d matrix y. - The total number of samples to fit on. - n_relevant_outputs : SIZE_t - We only care about the first n_relevant_outputs of these parameters/values - n_y : SIZE_t - The first n_y columns of the 2d matrix y, contain the raw labels y_{ik}, the rest are auxiliary variables - """ - - # Most initializations are handled by __cinit__ of RegressionCriterion - # which is always called in cython. We initialize the extras. - if n_y > 1: - raise AttributeError("LinearMomentGRFCriterion currently only supports a scalar y") - - self.proxy_children_impurity = True # The children_impurity() only returns an approximate proxy - - self.n_relevant_outputs = n_relevant_outputs - self.n_y = n_y - - # Allocate memory for the proxy for y, which rho in the generalized random forest - # Since rho is node dependent it needs to be re-calculated and stored for each sample - # in the node for every node we are investigating - self.rho = np.zeros((n_samples, n_outputs), dtype=np.float64) - self.moment = np.zeros((n_samples, n_outputs), dtype=np.float64) - self.J = np.zeros((n_outputs, n_outputs), dtype=np.float64) - self.invJ = np.zeros((n_outputs, n_outputs), dtype=np.float64) - self.parameter = np.zeros(n_outputs, dtype=np.float64) - self.parameter_pre = np.zeros(n_outputs, dtype=np.float64) - - - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices - ) nogil except -1: - # Initialize fields - self.y = y - self.sample_weight = sample_weight - self.sample_indices = sample_indices - self.weighted_n_samples = weighted_n_samples - - cdef SIZE_t n_features = self.n_features - cdef SIZE_t n_outputs = self.n_outputs - cdef SIZE_t n_y = self.n_y - - self.y = y[:, :n_y] # The first n_y columns of y are the original raw outcome - self.alpha = y[:, n_y:(n_y + n_outputs)] # A[i] part of the moment is the next n_outputs columns - # J[i] part of the moment is the next n_outputs * n_outputs columns, stored in Fortran contiguous format - self.pointJ = y[:, (n_y + n_outputs):(n_y + n_outputs + n_outputs * n_outputs)] - self.sample_weight = sample_weight # Store the sample_weight locally - self.samples = samples # Store the sample index structure used and updated by the splitter - self.weighted_n_samples = weighted_n_samples # Store total weight of all samples computed by splitter - - return 0 - - cdef double node_impurity(self) nogil: - """Evaluate the impurity of the current node, i.e. 
the impurity of - samples[start:end]. We use as node_impurity the proxy quantity: - sum_{k=1}^{n_relevant_outputs} Var(rho[i, k] | i in Node) / n_relevant_outputs - = sum_{k=1}^{n_relevant_outputs} (E[rho[i, k]^2 | i in Node] - E[rho[i, k] | i in Node]^2) / n_relevant_outputs - """ - - cdef double* sum_total = self.sum_total - cdef double impurity - cdef SIZE_t k - - impurity = self.sq_sum_total / self.weighted_n_node_samples - for k in range(self.n_relevant_outputs): - impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 - - return impurity / self.n_relevant_outputs - - cdef double proxy_impurity_improvement(self) nogil: - """Compute a proxy of the impurity reduction. - - This method is used to speed up the search for the best split. It is a proxy quantity such that the - split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms - of the impurity decrease for a given split. The absolute impurity improvement is only computed by the - impurity_improvement method once the best split has been found. - - Here we use the quantity: - sum_{k=1}^{n_relevant_outputs} sum_{child in {Left, Right}} weight(child) * E[rho[i, k] | i in child]^2 - Since: - E[rho[i, k] | i in child] = sum_{i in child} w[i] rhp[i, k] / weight(child) = sum_child[k] / weight(child) - This simplifies to: - sum_{k=1}^{n_relevant_outputs} sum_{child in {Left, Right}} sum_child[k]^2 / weight(child) - """ - - cdef double* sum_left = self.sum_left - cdef double* sum_right = self.sum_right - - cdef SIZE_t k - cdef double proxy_impurity_left = 0.0 - cdef double proxy_impurity_right = 0.0 - - for k in range(self.n_relevant_outputs): - proxy_impurity_left += sum_left[k] * sum_left[k] - proxy_impurity_right += sum_right[k] * sum_right[k] - - return (proxy_impurity_left / self.weighted_n_left + - proxy_impurity_right / self.weighted_n_right) - - cdef void children_impurity(self, double* impurity_left, - double* impurity_right) nogil: - """Evaluate the impurity in children nodes, i.e. the impurity of the - left child (samples[start:pos]) and the impurity the right child - (samples[pos:end]). 
Here we use the proxy child impurity: - impurity_child[k] = sum_{i in child} w[i] rho[i, k]^2 / weight(child) - - (sum_{i in child} w[i] * rho[i, k] / weight(child))^2 - impurity_child = sum_{k in n_relevant_outputs} impurity_child[k] / n_relevant_outputs - """ - - cdef DOUBLE_t* sample_weight = self.sample_weight - cdef SIZE_t* samples = self.samples - cdef SIZE_t* node_index_mapping = self.node_index_mapping - cdef SIZE_t pos = self.pos - cdef SIZE_t start = self.start - - cdef double* sum_left = self.sum_left - cdef double* sum_right = self.sum_right - cdef DOUBLE_t y_ik - - cdef double sq_sum_left = 0.0 - cdef double sq_sum_right - - cdef SIZE_t i, p, k, offset - cdef DOUBLE_t w = 1.0 - - # We calculate: sq_sum_left = sum_{i in child} w[i] rho[i, k]^2 - for p in range(start, pos): - i = samples[p] - offset = node_index_mapping[i] - - if sample_weight != NULL: - w = sample_weight[i] - - for k in range(self.n_relevant_outputs): - y_ik = self.rho[offset * self.n_outputs + k] - sq_sum_left += w * y_ik * y_ik - # We calculate sq_sum_right = sq_sum_total - sq_sum_left - sq_sum_right = self.sq_sum_total - sq_sum_left - - # We normalize each sq_sum_child by the weight of that child - impurity_left[0] = sq_sum_left / self.weighted_n_left - impurity_right[0] = sq_sum_right / self.weighted_n_right - - # We subtract from the impurity_child, the quantity: - # sum_{k in n_relevant_outputs} (sum_{i in child} w[i] * rho[i, k] / weight(child))^2 - # = sum_{k in n_relevant_outputs} (sum_child[k] / weight(child))^2 - for k in range(self.n_relevant_outputs): - impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 - - impurity_left[0] /= self.n_relevant_outputs - impurity_right[0] /= self.n_relevant_outputs - - cdef int update(self, SIZE_t new_pos) except -1 nogil: - """Updated statistics by moving samples[pos:new_pos] to the left.""" - cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef const SIZE_t[:] sample_indices = self.sample_indices - - cdef SIZE_t* node_index_mapping = self.node_index_mapping - - cdef SIZE_t pos = self.pos - cdef SIZE_t end = self.end - cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 - - cdef SIZE_t offset - - # Update statistics up to new_pos - # - # Given that - # sum_left[x] + sum_right[x] = sum_total[x] - # var_left[x] + var_right[x] = var_total[x] - # and that sum_total and var_total are known, we are going to update - # sum_left and var_left from the direction that require the least amount - # of computations, i.e. from pos to new_pos or from end to new_pos. - - # The invariance of the update is that: - # sum_left[k] = sum_{i in Left} w[i] rho[i, k] - # var_left[k] = sum_{i in Left} w[i] pointJ[i, k, k] - # and similarly for the right child. Notably, the second is un-normalized, - # so to be used for further calculations it needs to be normalized by the child weight. 
- if (new_pos - pos) <= (end - new_pos): - for p in range(pos, new_pos): - i = samples[p] - offset = node_index_mapping[i] - - if sample_weight != NULL: - w = sample_weight[i] - - for k in range(self.n_outputs): - # we add w[i] * rho[i, k] to sum_left[k] - self.sum_left[k] += w * self.rho[i, k] - # we add w[i] * J[i, k, k] to var_left[k] - self.var_left[k] += w * self.pointJ[i, k, k] - - self.weighted_n_left += w - else: - self.reverse_reset() - - for p in range(end - 1, new_pos - 1, -1): - i = sample_indices[p] - offset = node_index_mapping[i] - - if sample_weight != NULL: - w = sample_weight[i] - - for k in range(self.n_outputs): - # we subtract w[i] * rho[i, k] from sum_left[k] - self.sum_left[k] -= w * self.rho[i, k] - # we subtract w[i] * J[i, k, k] from var_left[k] - self.var_left[k] -= w * self.pointJ[i, k, k] - - self.weighted_n_left -= w - - self.weighted_n_right = (self.weighted_n_node_samples - - self.weighted_n_left) - for k in range(self.n_outputs): - self.sum_right[k] = self.sum_total[k] - self.sum_left[k] - self.var_right[k] = self.var_total[k] - self.var_left[k] - - self.pos = new_pos - return 0 - cdef void node_value(self, double* dest) nogil: """Return the estimated node parameter of samples[start:end] into dest.""" memcpy(dest, self.parameter, self.n_outputs * sizeof(double)) @@ -690,7 +667,7 @@ cdef class LinearMomentCriterion(RegressionCriterion): cdef SIZE_t n_outputs = self.n_outputs for i in range(n_outputs): for j in range(n_outputs): - dest[i * n_outputs + j] = self.J[i + j * n_outputs] / self.weighted_n_node_samples + dest[i * n_outputs + j] = self.J[i, j] / self.weighted_n_node_samples cdef void node_precond(self, double* dest) nogil: """Return the normalized node preconditioned value of samples[start:end] into dest.""" @@ -732,8 +709,6 @@ cdef class LinearMomentCriterion(RegressionCriterion): return min / self.weighted_n_right - - cdef void _fast_invJ(DOUBLE_t* J, DOUBLE_t* invJ, SIZE_t n, double clip) nogil: """Fast inversion of a 2x2 array.""" cdef double det
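# NOTE (illustrative sketch, added for exposition; not part of the patch): a plain-Python
# rendering of the clipped closed-form inverse that _fast_invJ computes for the 1x1 and
# 2x2 cases; the clip guards against a (near-)singular node Jacobian during splitting,
# while larger dimensions fall back to the LAPACK (pseudo-)inverse shown earlier.
import numpy as np

def fast_inv_small(J, clip=1e-6):
    J = np.asarray(J, dtype=float)
    if J.shape == (1, 1):
        a = J[0, 0]
        return np.array([[1.0 / a if abs(a) >= clip else 1.0 / clip]])
    if J.shape == (2, 2):
        det = J[0, 0] * J[1, 1] - J[0, 1] * J[1, 0]
        if abs(det) < clip:                      # clip the determinant away from zero
            det = clip
        return np.array([[ J[1, 1], -J[0, 1]],
                         [-J[1, 0],  J[0, 0]]]) / det
    raise ValueError("only 1x1 and 2x2 handled here; larger sizes use matinv_/pinv_")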