diff --git a/.devpy/cmds.py b/.devpy/cmds.py index e250281c3..039bc761e 100644 --- a/.devpy/cmds.py +++ b/.devpy/cmds.py @@ -4,6 +4,7 @@ import click from devpy import util +from devpy.cmds.meson import get_site_packages @click.command() @@ -17,7 +18,7 @@ def docs(build_dir, clean=False): print(f"Removing `{doc_dir}`") shutil.rmtree(doc_dir) - site_path = util.get_site_packages(build_dir) + site_path = get_site_packages() if site_path is None: print("No built scikit-tree found; run `./dev.py build` first.") sys.exit(1) diff --git a/docs/references.bib b/docs/references.bib index daddb32f6..989138c7c 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -2,10 +2,10 @@ % Try to keep this list in alphabetical order based on citing name @article{Li2019manifold, - title={Manifold Oblique Random Forests: Towards Closing the Gap on Convolutional Deep Networks}, - author={Li, Adam and Perry, Ronan and Huynh, Chester and Tomita, Tyler M and Mehta, Ronak and Arroyo, Jesus and Patsolic, Jesse and Falk, Benjamin and Vogelstein, Joshua T}, - journal={arXiv preprint arXiv:1909.11799}, - year={2019} + title = {Manifold Oblique Random Forests: Towards Closing the Gap on Convolutional Deep Networks}, + author = {Li, Adam and Perry, Ronan and Huynh, Chester and Tomita, Tyler M and Mehta, Ronak and Arroyo, Jesus and Patsolic, Jesse and Falk, Benjamin and Vogelstein, Joshua T}, + journal = {arXiv preprint arXiv:1909.11799}, + year = {2019} } @article{Meghana2019_geodesicrf, @@ -17,4 +17,20 @@ @article{Meghana2019_geodesicrf publisher = {arXiv}, year = {2019}, copyright = {arXiv.org perpetual, non-exclusive license} -} \ No newline at end of file +} + +% Causal + +@article{Athey2016GeneralizedRF, + title = {Generalized random forests}, + author = {Susan Athey and Julie Tibshirani and Stefan Wager}, + journal = {The Annals of Statistics}, + year = {2016} +} + +@article{chernozhukov2018double, + title = {Double/debiased machine learning for treatment and structural parameters}, + author = {Chernozhukov, Victor and Chetverikov, Denis and Demirer, Mert and Duflo, Esther and Hansen, Christian and Newey, Whitney and Robins, James}, + year = {2018}, + publisher = {Oxford University Press Oxford, UK} +} diff --git a/sktree/causal/__init__.py b/sktree/causal/__init__.py new file mode 100644 index 000000000..a9624b99d --- /dev/null +++ b/sktree/causal/__init__.py @@ -0,0 +1 @@ +from .tree import CausalForest, CausalTree diff --git a/sktree/causal/_grf_criterion.pxd b/sktree/causal/_grf_criterion.pxd new file mode 100644 index 000000000..02b3fec1f --- /dev/null +++ b/sktree/causal/_grf_criterion.pxd @@ -0,0 +1,75 @@ +# Licensed under the MIT License. +# Original Authors: +# - EconML +# - Vasilis Syrgkanis +# +# Modified by: Adam Li + +import numpy as np +cimport numpy as np + +from sklearn.tree._tree cimport DTYPE_t # Type of X +from sklearn.tree._tree cimport DOUBLE_t # Type of y, sample_weight +from sklearn.tree._tree cimport SIZE_t # Type for indices and counters + +from sklearn.tree._criterion cimport Criterion, RegressionCriterion + + +cdef class GeneralizedMomentCriterion(RegressionCriterion): +# The A random vector of the linear moment equation for each sample of size (n_samples, n_outputs) + # these are the "weights" that are pre-computed. 
+ cdef const DOUBLE_t[:, ::1] alpha + + # the approximate Jacobian evaluated at every single sample point + # random vector of the linear moment equation + # size (n_samples, n_outputs, n_outputs) + cdef const DOUBLE_t[:, :, ::1] pointJ + + cdef DOUBLE_t[:, ::1] rho # Proxy heterogeneity label: rho = E[J | X in Node]^{-1} m(J, A; theta(Node)) of shape (`n_samples`, `n_outputs`) + cdef DOUBLE_t[:, ::1] moment # Moment for each sample: m(J, A; theta(Node)) of shape (`n_samples`, `n_outputs`) + cdef DOUBLE_t[:, ::1] J # Node average jacobian: J(Node) = E[J | X in Node] of shape (n_outputs, n_outputs) + cdef DOUBLE_t[:, ::1] invJ # Inverse of node average jacobian: J(Node)^{-1} of shape (n_outputs, n_outputs) + cdef DOUBLE_t[:] parameter # Estimated node parameter: theta(Node) = E[J|X in Node]^{-1} E[A|X in Node] + cdef DOUBLE_t[:] parameter_pre # Preconditioned node parameter: theta_pre(Node) = E[A | X in Node] + + cdef SIZE_t n_relevant_outputs + + cdef int compute_sample_preparameter( + self, + DOUBLE_t[:] parameter_pre, + const DOUBLE_t[:, ::1] alpha, + DOUBLE_t weight, + SIZE_t sample_index, + ) nogil except -1 + cdef int compute_sample_parameter( + self, + DOUBLE_t[:] parameter, + DOUBLE_t[:] parameter_pre, + DOUBLE_t[:, ::1] invJ + ) nogil except -1 + cdef int compute_sample_moment( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] alpha, + DOUBLE_t[:] parameter, + const DOUBLE_t[:, :, ::1] pointJ, + SIZE_t sample_index + ) except -1 nogil + cdef int compute_sample_rho( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] invJ, + SIZE_t sample_index + ) except -1 nogil + cdef int compute_sample_jacobian( + self, + DOUBLE_t[:, ::1] J, + const DOUBLE_t[:, :, ::1] pointJ, + DOUBLE_t weight, + SIZE_t sample_index, + ) except -1 nogil + cdef int compute_node_inv_jacobian( + self, + DOUBLE_t[:, ::1] J, + DOUBLE_t[:, ::1] invJ, + ) except -1 nogil diff --git a/sktree/causal/_grf_criterion.pyx b/sktree/causal/_grf_criterion.pyx new file mode 100644 index 000000000..90872859a --- /dev/null +++ b/sktree/causal/_grf_criterion.pyx @@ -0,0 +1,725 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + +# +# This code contains a refactoring of the EconML GRF code, +# published under the following license and copyright: +# BSD 3-Clause License +# +# All rights reserved. + +from libc.stdlib cimport calloc +from libc.stdlib cimport free +from libc.string cimport memcpy +from libc.string cimport memset +from libc.math cimport fabs, sqrt + +import numpy as np +cimport numpy as np +np.import_array() + +from ._utils cimport matinv_, pinv_ + +cdef double INFINITY = np.inf + +# TODO: OPEN QUESTIONS +# 1. why is alpha = y * T and y * Z in CATE and iV forests? +# 2. why is pointJ = T x T and T x Z? + +cdef class GeneralizedMomentCriterion(RegressionCriterion): + """A generalization of the regression criterion in scikit-learn. + + Generalized criterion with moment equations was introduced in the generalized + random forest paper, which shows how many common forests are trained using this + framework of criterion. + + A criterion class that estimates local parameters defined via moment equations + of the form:: + + E[ m(J, A; theta(x)) | X=x] + + where our specific instance is a linear moment equation:: + + E[ J * theta(x) - A | X=x] = 0 + + where: + + - m(J, A; theta(x)) is the moment equation that is specified per modeling setup. 
+ - J is the Jacobian array of shape (n_outputs, n_outputs) per sample + - A is the alpha weights of shape (n_outputs) per sample + - theta(x) is the parameters we are interested in of shape (n_outputs) per sample + - X is the data matrix of shape (n_samples, n_features) + + We have the following given points: + + - alpha is the weights per sample of shape (n_samples, n_outputs) + - pointJ is the pointwise Jacobian array per sample of shape (n_samples, n_outputs, n_outputs) + + Then we have the following estimating equations: + + J(Node) := E[J[i] | X[i] in Node] = sum_{i in Node} w[i] J[i] + moment[i] := J[i] * theta(Node) - A[i] + rho[i] := - J(Node)^{-1} (J[i] * theta(Node) - A[i]) + theta_pre(node) := E[A[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] A[i] + theta(Node) := J(Node)^{-1} E[A[i] | X[i] in Node] = J(node)^{-1} theta_pre(node) + + Notes + ----- + Calculates impurity based on heterogeneity induced on the estimated parameters, based on + the proxy score defined in :footcite:`Athey2016GeneralizedRF`. + + Specifically, we compute node, proxy and children impurity similarly to a + mean-squared error setting, where these are in general "proxies" for the true + criterion: + + n_{C1} * n_{C2} / n_P^2 (\hat{\\theta}_{C1} - \hat{\\theta}_{C2})^2 + + as specified in Equation 5 of the paper :footcite:`Athey2016GeneralizedRF`. + The proxy is computed with Equation 9 of the paper: + + 1/n_{C1} (\sum_{i \in C1} \\rho_i)^2 + 1/n_{C2} (\sum_{i \in C2} \\rho_i)^2 + + where :math:`\\rho_i` is the pseudo-label for the ith sample. + """ + def __cinit__( + self, + SIZE_t n_outputs, + SIZE_t n_samples, + SIZE_t n_relevant_outputs, + SIZE_t n_y, + ): + """Initialize parameters for this criterion. Parent `__cinit__` is always called before children. + So we only perform extra initializations that were not perfomed by the parent classes. + + Parameters + ---------- + n_outputs : SIZE_t + The number of parameters/values to be estimated + n_samples : SIZE_t + The total number of rows in the 2d matrix y. + The total number of samples to fit on. + n_relevant_outputs : SIZE_t + We only care about the first n_relevant_outputs of these parameters/values + n_y : SIZE_t + The first n_y columns of the 2d matrix y, contain the raw labels y_{ik}, the rest are auxiliary variables + """ + # Most initializations are handled by __cinit__ of RegressionCriterion + # which is always called in cython. We initialize the extras. 
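As an illustrative aside (not part of the patch): a minimal NumPy sketch of the estimating equations described in the class docstring above, assuming a single node with sample weights `w`, alpha vectors `alpha`, and point Jacobians `pointJ`. The variable names mirror the criterion's buffers but the code itself is hypothetical.

```python
import numpy as np

# Toy node: n samples, d outputs (parameters to estimate)
rng = np.random.default_rng(0)
n, d = 50, 2
w = np.ones(n)                         # sample weights
alpha = rng.normal(size=(n, d))        # A[i] vectors
pointJ = np.tile(np.eye(d), (n, 1, 1)) + 0.1 * rng.normal(size=(n, d, d))  # J[i]

# J(node)     = sum_i w[i] J[i]        (un-normalized node Jacobian)
# theta_pre   = sum_i w[i] A[i]        (un-normalized preconditioned parameter)
# theta(node) = J(node)^{-1} theta_pre
J_node = np.einsum("i,ijk->jk", w, pointJ)
theta_pre = w @ alpha
theta = np.linalg.solve(J_node, theta_pre)

# moment[i] = J[i] theta(node) - A[i]
# rho[i]    = -(J(node) / weight(node))^{-1} moment[i]
moment = pointJ @ theta - alpha
rho = -moment @ np.linalg.inv(J_node / w.sum()).T
```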
+        if n_y > 1:
+            raise AttributeError("GeneralizedMomentCriterion currently only supports a scalar y")
+
+        self.proxy_children_impurity = True    # The children_impurity() only returns an approximate proxy
+
+        self.n_outputs = n_outputs
+        self.n_relevant_outputs = n_relevant_outputs
+        self.n_y = n_y
+
+        # Allocate memory for the proxy for y, which is rho in the generalized random forest.
+        # Since rho is node dependent it needs to be re-calculated and stored for each sample
+        # in the node for every node we are investigating
+        self.rho = np.zeros((n_samples, n_outputs), dtype=np.float64)
+        self.moment = np.zeros((n_samples, n_outputs), dtype=np.float64)
+        self.J = np.zeros((n_outputs, n_outputs), dtype=np.float64)
+        self.invJ = np.zeros((n_outputs, n_outputs), dtype=np.float64)
+        self.parameter = np.zeros(n_outputs, dtype=np.float64)
+        self.parameter_pre = np.zeros(n_outputs, dtype=np.float64)
+
+    cdef int init(
+        self,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight,
+        double weighted_n_samples,
+        const SIZE_t[:] sample_indices
+    ) nogil except -1:
+        """Initialize the criterion object with data.
+
+        Parameters
+        ----------
+        y : DOUBLE_t 2D memoryview of shape (n_samples, n_y + n_outputs + n_outputs * n_outputs)
+            The input y array.
+        sample_weight : ndarray, dtype=DOUBLE_t
+            The weight of each sample stored as a Cython memoryview.
+        weighted_n_samples : double
+            The total weight of the samples being considered.
+        sample_indices : ndarray, dtype=SIZE_t
+            A mask on the samples. Indices of the samples in X and y we want to use,
+            where sample_indices[start:end] correspond to the samples in this node.
+
+        Notes
+        -----
+        For the generalized criterion, the y array has columns associated with the
+        actual 'y' in the first `n_y` columns, the next `n_outputs` columns are associated
+        with the alpha vectors and the next `n_outputs * n_outputs` columns are
+        associated with the estimated sample Jacobian vectors.
+
+        `n_relevant_outputs` is a number less than or equal to `n_outputs`, which
+        tracks the relevant outputs.
+        """
+        cdef SIZE_t n_outputs = self.n_outputs
+        cdef SIZE_t n_y = self.n_y
+
+        # Initialize fields
+        self.sample_weight = sample_weight  # Store the sample_weight locally
+        self.sample_indices = sample_indices  # Store the sample index structure used and updated by the splitter
+        self.weighted_n_samples = weighted_n_samples  # Store total weight of all samples computed by splitter
+
+        self.y = y[:, :n_y]  # The first n_y columns of y are the original raw outcome
+        self.alpha = y[:, n_y:(n_y + n_outputs)]  # A[i] part of the moment is the next n_outputs columns
+        # J[i] part of the moment is the next n_outputs * n_outputs columns, stored in Fortran contiguous format
+        self.pointJ = y[:, (n_y + n_outputs):(n_y + n_outputs + n_outputs * n_outputs)]
+
+        return 0
+
+    cdef double node_impurity(
+        self
+    ) noexcept nogil:
+        """Evaluate the impurity of the current node, i.e. the impurity of
+        samples[start:end].
+
+        This sums up the relevant metric over all "relevant outputs" (`n_relevant_outputs`)
+        for samples in the node and then normalizes to get the average impurity of the node.
+        Similarly, `sum_total` stores an (`n_outputs`,) array and should have been computed a priori.
+ + This distinctly generalizes the scikit-learn Regression Criterion, as the `n_outputs` + contains both `n_relevant_outputs` and extra outputs that are nuisance parameters. But + otherwise, the node impurity calculation follows that of a regression. + """ + + cdef double* sum_total = self.sum_total + cdef double impurity + cdef SIZE_t k + + impurity = self.sq_sum_total / self.weighted_n_node_samples + for k in range(self.n_relevant_outputs): + impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + + return impurity / self.n_relevant_outputs + + cdef double proxy_impurity_improvement( + self + ) noexcept nogil: + """Compute a proxy of the impurity reduction. + + This method is used to speed up the search for the best split. It is a proxy quantity such that the + split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + + This sums up the relevant metric over all "relevant outputs" (`n_relevant_outputs`) + for samples in the node and then normalizes to get the average impurity of the node. + Similarly, `sum_left` and `sum_right` stores an (`n_outputs`,) array and should have + been computed apriori. + + This distinctly generalizes the scikit-learn Regression Criterion, as the `n_outputs` + contains both `n_relevant_outputs` and extra outputs that are nuisance parameters. But + otherwise, the node impurity calculation follows that of a regression. + """ + + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + + cdef SIZE_t k + cdef double proxy_impurity_left = 0.0 + cdef double proxy_impurity_right = 0.0 + + for k in range(self.n_relevant_outputs): + proxy_impurity_left += sum_left[k] * sum_left[k] + proxy_impurity_right += sum_right[k] * sum_right[k] + + return (proxy_impurity_left / self.weighted_n_left + + proxy_impurity_right / self.weighted_n_right) + + cdef void children_impurity( + self, + double* impurity_left, + double* impurity_right + ) noexept nogil: + """Evaluate the impurity in children nodes, i.e. the impurity of the + left child (samples[start:pos]) and the impurity the right child + (samples[pos:end]). 
Here we use the proxy child impurity: + impurity_child[k] = sum_{i in child} w[i] rho[i, k]^2 / weight(child) + - (sum_{i in child} w[i] * rho[i, k] / weight(child))^2 + impurity_child = sum_{k in n_relevant_outputs} impurity_child[k] / n_relevant_outputs + """ + cdef SIZE_t pos = self.pos + cdef SIZE_t start = self.start + cdef DOUBLE_t y_ik + + cdef double sq_sum_left = 0.0 + cdef double sq_sum_right + + cdef SIZE_t i, p, k, offset + cdef DOUBLE_t w = 1.0 + + # We calculate: sq_sum_left = sum_{i in child} w[i] rho[i, k]^2 + for p in range(start, pos): + i = self.sample_indices[p] + + if self.sample_weight is not None: + w = self.sample_weight[i] + + for k in range(self.n_relevant_outputs): + y_ik = self.rho[i, k] + sq_sum_left += w * y_ik * y_ik + # We calculate sq_sum_right = sq_sum_total - sq_sum_left + sq_sum_right = self.sq_sum_total - sq_sum_left + + # We normalize each sq_sum_child by the weight of that child + impurity_left[0] = sq_sum_left / self.weighted_n_left + impurity_right[0] = sq_sum_right / self.weighted_n_right + + # We subtract from the impurity_child, the quantity: + # sum_{k in n_relevant_outputs} (sum_{i in child} w[i] * rho[i, k] / weight(child))^2 + # = sum_{k in n_relevant_outputs} (sum_child[k] / weight(child))^2 + for k in range(self.n_relevant_outputs): + impurity_left[0] -= (self.sum_left[k] / self.weighted_n_left) ** 2.0 + impurity_right[0] -= (self.sum_right[k] / self.weighted_n_right) ** 2.0 + + impurity_left[0] /= self.n_relevant_outputs + impurity_right[0] /= self.n_relevant_outputs + + cdef int compute_sample_preparameter( + self, + DOUBLE_t[:] parameter_pre, + const DOUBLE_t[:, ::1] alpha, + DOUBLE_t weight, + SIZE_t sample_index, + ) nogil except -1: + """Calculate the un-normalized pre-conditioned parameter. + + theta_pre(node) := E[A[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] A[i] + theta(node) := J(node)^{-1} theta_pre(node) + + Parameters + ---------- + parameter_pre : DOUBLE_t memoryview of size (n_outputs,) + To be computed node un-normalized pre-conditioned parameter theta_pre(node). + alpha : DOUBLE_t 2D memoryview of size (n_samples, n_outputs) + The memory view that contains the A[i] for each sample i + weight : DOUBLE_t + The weight of the sample. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. + """ + cdef SIZE_t i, j, p + cdef DOUBLE_t w = 1.0 + + # compute parameter_pre per output dimension, j as the + # \sum_{i} w[i] * alpha[i, j] + for j in range(self.n_outputs): + parameter_pre[j] += weight * alpha[sample_index, j] + return 0 + + cdef int compute_sample_parameter( + self, + DOUBLE_t[:] parameter, + DOUBLE_t[:] parameter_pre, + DOUBLE_t[:, ::1] invJ + ) nogil except -1: + """Calculate the parameter. + + theta_pre(node) := E[A[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] A[i] + theta(node) := J(node)^{-1} theta_pre(node) + + Parameters + ---------- + parameter : DOUBLE_t memoryview of size (n_outputs,) + To be computed node parameter theta(node). + parameter_pre : DOUBLE_t memoryview of size (n_outputs,) + Already computed node un-normalized pre-conditioned parameter theta_pre(node) + invJ : DOUBLE_t 2D memoryview of size (n_outputs, n_outputs) + Unnormalized node jacobian inverse J(node)^{-1} in C-contiguous format. 
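As an aside, a minimal NumPy sketch of the proxy split score used by `proxy_impurity_improvement` and `children_impurity` above (Equation 9 of :footcite:`Athey2016GeneralizedRF`), assuming `rho` holds the pseudo-labels of the samples currently in the node and `w` their weights; the helper name is hypothetical.

```python
import numpy as np

def proxy_split_score(rho, w, left_mask):
    """Proxy criterion of Eq. 9: larger values indicate a better split.

    rho : (n_node_samples, n_relevant_outputs) pseudo-labels
    w : (n_node_samples,) sample weights
    left_mask : boolean mask selecting the left child
    """
    score = 0.0
    for mask in (left_mask, ~left_mask):
        s = (w[mask, None] * rho[mask]).sum(axis=0)   # sum_{i in child} w_i rho_i
        score += (s * s).sum() / w[mask].sum()
    return score
```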
+ """ + cdef SIZE_t i, j + + for j in range(self.n_outputs): + for i in range(self.n_outputs): + parameter[i] += invJ[i, j] * parameter_pre[j] + return 0 + + cdef int compute_sample_moment( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] alpha, + DOUBLE_t[:] parameter, + const DOUBLE_t[:, :, ::1] pointJ, + SIZE_t sample_index + ) except -1 nogil: + """Calculate the linear moment and rho for each sample i in the node. + + moment[i] := J[i] * theta(Node) - A[i] + rho[i] := - (J(Node) / weight(node))^{-1} * moment[i] + + Parameters + ---------- + moment : DOUBLE_t 2D memoryview of shape (`n_samples`, `n_outputs`) + Array of moments per sample in node. + alpha : DOUBLE_t[:, ::1] of size (n_samples, n_outputs) + The memory view that contains the A[i] for each sample i + parameter : DOUBLE_t memoryview of shape (`n_outputs`) + Array of computed parameters theta(x) per sample in node. + pointJ : DOUBLE_t[:, :, ::1] 3D memory of size (n_samples, n_outputs, n_outputs) + The memory view that contains the J[i] for each sample i, where J[i] is the + Jacobian array. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. + """ + cdef SIZE_t j, k + + # compute the moment for each output + for j in range(self.n_outputs): + moment[sample_index, j] = - alpha[sample_index, j] + for k in range(self.n_outputs): + moment[sample_index, j] += pointJ[sample_index, j, k] * parameter[k] + return 0 + + cdef int compute_sample_rho( + self, + DOUBLE_t[:, ::1] moment, + DOUBLE_t[:, ::1] invJ, + SIZE_t sample_index + ) except -1 nogil: + """Calculate the rho for each sample i in the node. + + moment[i] := J[i] * theta(Node) - A[i] + rho[i] := - (J(Node) / weight(node))^{-1} * moment[i] + + Parameters + ---------- + moment : DOUBLE_t 2D memoryview of shape (`n_samples`, `n_outputs`) + Array of moments per sample in node. + invJ : DOUBLE_t 2D memoryview of size (n_outputs, n_outputs) + Unnormalized node jacobian inverse J(node)^{-1} in C-contiguous format. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. + """ + cdef SIZE_t j, k + + # compute the moment for each output + for j in range(self.n_outputs): + rho[sample_index, j] = 0.0 + for k in range(self.n_outputs): + rho[sample_index, j] -= ( + invJ[j, k] * \ + moment[sample_index, k] * \ + self.weighted_n_node_samples + ) + return 0 + + cdef int compute_sample_jacobian( + self, + DOUBLE_t[:, ::1] J, + const DOUBLE_t[:, :, ::1] pointJ, + DOUBLE_t weight, + SIZE_t sample_index, + ) except -1 nogil: + """Calculate the un-normalized jacobian for a sample:: + + J(node) := E[J[i] | X[i] in Node] weight(node) = sum_{i in Node} w[i] J[i] + + and its inverse J(node)^{-1} (or revert to pseudo-inverse if the matrix is not invertible). For + dimensions n_outputs={1, 2}, we also add a small constant of 1e-6 on the diagonals of J, to ensure + invertibility. For larger dimensions we revert to pseudo-inverse if the matrix is not invertible. + + Parameters + ---------- + J : DOUBLE_t 2D memoryview (n_outputs, n_outputs) + Un-normalized jacobian J(node) in C-contiguous format + pointJ : DOUBLE_t[:, :, ::1] 3D memory of size (n_samples, n_outputs, n_outputs) + The memory view that contains the J[i] for each sample i, where J[i] is the + Jacobian array. + weight : DOUBLE_t + The weight of the sample. + sample_index : SIZE_t + The index of the sample to be used on computation of criteria of the current node. 
+ """ + cdef SIZE_t j, k + + # Calculate un-normalized empirical jacobian + for j in range(self.n_outputs): + for k in range(self.n_outputs): + J[j, k] += w * pointJ[sample_index, j, k] + + return 0 + + cdef int compute_node_inv_jacobian( + self, + DOUBLE_t[:, ::1] J, + DOUBLE_t[:, ::1] invJ, + ) except -1 nogil: + """Calculate the node inverse un-normalized jacobian:: + + Its inverse J(node)^{-1} (or revert to pseudo-inverse if the matrix is not invertible). For + dimensions n_outputs={1, 2}, we also add a small constant of 1e-6 on the diagonals of J, to ensure + invertibility. For larger dimensions we revert to pseudo-inverse if the matrix is not invertible. + + Parameters + ---------- + J : DOUBLE_t 2D memoryview (n_outputs, n_outputs) + Un-normalized jacobian J(node) in C-contiguous format + invJ : DOUBLE_t 2D memoryview of size (n_outputs, n_outputs) + Unnormalized node jacobian inverse J(node)^{-1} in C-contiguous format. + """ + cdef SIZE_t k + + # Calculae inverse and store it in invJ + if self.n_outputs <=2: + # Fast closed form inverse calculation + _fast_invJ(J, invJ, self.n_outputs, clip=1e-6) + else: + for k in range(self.n_outputs): + J[k, k] += 1e-6 # add small diagonal constant for invertibility + + # Slower matrix inverse via lapack package + if not matinv_(J, invJ, self.n_outputs): # if matrix is invertible use the inverse + pinv_(J, invJ, self.n_outputs, self.n_outputs) # else calculate pseudo-inverse + + for k in range(self.n_outputs): + J[k, k] -= 1e-6 # remove the invertibility constant + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion and update statistics. + + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + self.start = start + self.end = end + + self.n_node_samples = end - start + + self.sq_sum_total = 0.0 + self.weighted_n_node_samples = 0. 
+
+        cdef SIZE_t i
+        cdef SIZE_t p
+        cdef SIZE_t k
+        cdef DOUBLE_t y_ik
+        cdef DOUBLE_t w_y_ik
+        cdef DOUBLE_t w = 1.0
+
+        memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double))
+
+        # Init jacobian matrix to zero
+        self.J[:, :] = 0.0
+
+        # init parameter vectors to zero
+        self.parameter_pre[:] = 0.0
+        self.parameter[:] = 0.0
+
+        # perform the first loop to aggregate
+        for p in range(start, end):
+            i = self.sample_indices[p]
+
+            if self.sample_weight is not None:
+                w = self.sample_weight[i]
+            self.weighted_n_node_samples += w
+
+            # compute the Jacobian for this sample
+            self.compute_sample_jacobian(
+                self.J, self.pointJ, w, i)
+
+            # compute the pre-conditioned parameter
+            self.compute_sample_preparameter(
+                self.parameter_pre,
+                self.alpha,
+                w,
+                i
+            )
+
+        # compute the inverse-Jacobian
+        self.compute_node_inv_jacobian(self.J, self.invJ)
+
+        # compute the node parameter, theta(node) = J(node)^{-1} theta_pre(node)
+        self.compute_sample_parameter(
+            self.parameter,
+            self.parameter_pre,
+            self.invJ
+        )
+
+        for p in range(start, end):
+            i = self.sample_indices[p]
+            if self.sample_weight is not None:
+                w = self.sample_weight[i]
+
+            # next compute the moment for this sample
+            self.compute_sample_moment(
+                self.moment,
+                self.alpha,
+                self.parameter,
+                self.pointJ,
+                i
+            )
+
+            # compute the pseudo-label, rho
+            self.compute_sample_rho(
+                self.moment,
+                self.invJ,
+                i
+            )
+
+            # compute statistics for the current total sum and square sum
+            # of the metrics for criterion, in this case the pseudo-label
+            for k in range(self.n_outputs):
+                y_ik = self.rho[i, k]
+                w_y_ik = w * y_ik
+
+                self.sum_total[k] += w_y_ik
+
+                if k < self.n_relevant_outputs:
+                    self.sq_sum_total += w_y_ik * y_ik
+
+        # Reset to pos=start
+        self.reset()
+
+    cdef int update(
+        self,
+        SIZE_t new_pos
+    ) except -1 nogil:
+        """Update statistics by moving samples[pos:new_pos] to the left."""
+        cdef SIZE_t pos = self.pos
+        cdef SIZE_t end = self.end
+        cdef SIZE_t i, p, k
+        cdef DOUBLE_t w = 1.0
+
+        # Update statistics up to new_pos
+        #
+        # Given that
+        #   sum_left[x] + sum_right[x] = sum_total[x]
+        #   var_left[x] + var_right[x] = var_total[x]
+        # and that sum_total and var_total are known, we are going to update
+        # sum_left and var_left from the direction that requires the least amount
+        # of computations, i.e. from pos to new_pos or from end to new_pos.
+
+        # The invariance of the update is that:
+        #   sum_left[k] = sum_{i in Left} w[i] rho[i, k]
+        #   var_left[k] = sum_{i in Left} w[i] pointJ[i, k, k]
+        # and similarly for the right child. Notably, the second is un-normalized,
+        # so to be used for further calculations it needs to be normalized by the child weight.
+        if (new_pos - pos) <= (end - new_pos):
+            for p in range(pos, new_pos):
+                i = self.sample_indices[p]
+
+                if self.sample_weight is not None:
+                    w = self.sample_weight[i]
+
+                for k in range(self.n_outputs):
+                    # we add w[i] * rho[i, k] to sum_left[k]
+                    self.sum_left[k] += w * self.rho[i, k]
+                    # we add w[i] * J[i, k, k] to var_left[k]
+                    # self.var_left[k] += w * self.pointJ[i, k, k]
+
+                self.weighted_n_left += w
+        else:
+            self.reverse_reset()
+
+            for p in range(end - 1, new_pos - 1, -1):
+                i = self.sample_indices[p]
+
+                if self.sample_weight is not None:
+                    w = self.sample_weight[i]
+
+                for k in range(self.n_outputs):
+                    # we subtract w[i] * rho[i, k] from sum_left[k]
+                    self.sum_left[k] -= w * self.rho[i, k]
+                    # we subtract w[i] * J[i, k, k] from var_left[k]
+                    # self.var_left[k] -= w * self.pointJ[i, k, k]
+
+                self.weighted_n_left -= w
+
+        self.weighted_n_right = (self.weighted_n_node_samples -
+                                 self.weighted_n_left)
+        for k in range(self.n_outputs):
+            self.sum_right[k] = self.sum_total[k] - self.sum_left[k]
+            # self.var_right[k] = self.var_total[k] - self.var_left[k]
+
+        self.pos = new_pos
+        return 0
+
+
+    cdef void node_value(self, double* dest) nogil:
+        """Return the estimated node parameter of samples[start:end] into dest."""
+        memcpy(dest, &self.parameter[0], self.n_outputs * sizeof(double))
+
+    cdef void node_jacobian(self, double* dest) nogil:
+        """Return the node normalized Jacobian of samples[start:end] into dest in a C contiguous format."""
+        cdef SIZE_t i, j
+        # Jacobian is stored in f-contiguous format for fortran. We translate it to c-contiguous for
+        # user interfacing. Moreover, we normalize by weight(node).
+        cdef SIZE_t n_outputs = self.n_outputs
+        for i in range(n_outputs):
+            for j in range(n_outputs):
+                dest[i * n_outputs + j] = self.J[i, j] / self.weighted_n_node_samples
+
+    cdef void node_precond(self, double* dest) nogil:
+        """Return the normalized node preconditioned value of samples[start:end] into dest."""
+        cdef SIZE_t i
+        for i in range(self.n_outputs):
+            dest[i] = self.parameter_pre[i] / self.weighted_n_node_samples
+
+    cdef double min_eig_left(self) nogil:
+        """Calculate a proxy for the minimum eigenvalue of the jacobian of the left child.
+
+        Here we simply use the minimum absolute value of the diagonals of the jacobian.
+        This proxy needs to be super fast as this calculation happens for every candidate split,
+        so we cannot afford anything that is not a very simple calculation. We tried a power
+        iteration approximation algorithm implemented in `_utils.fast_min_eigv_()`, but even
+        such a power iteration algorithm is slow as it requires calculating a pseudo-inverse first.
+        """
+        cdef int i
+        cdef double min, abs
+        min = fabs(self.var_left[0])
+        for i in range(self.n_outputs):
+            abs = fabs(self.var_left[i])
+            if abs < min:
+                min = abs
+        # The `update()` method maintains that var_left[k] = J_left[k, k], where J_left is the
+        # un-normalized jacobian of the left child. Thus we normalize by weight(left)
+        return min / self.weighted_n_left
+
+    cdef double min_eig_right(self) nogil:
+        """Calculate a proxy for the minimum eigenvalue of the jacobian of the right child
+        (see min_eig_left for more details).
+ """ + cdef int i + cdef double min, abs + min = fabs(self.var_right[0]) + for i in range(self.n_outputs): + abs = fabs(self.var_right[i]) + if abs < min: + min = abs + return min / self.weighted_n_right + + +cdef void _fast_invJ(DOUBLE_t* J, DOUBLE_t* invJ, SIZE_t n, double clip) nogil: + """Fast inversion of a 2x2 array.""" + cdef double det + if n == 1: + invJ[0] = 1.0 / J[0] if fabs(J[0]) >= clip else 1/clip # Explicit inverse calculation + elif n == 2: + # Explicit inverse calculation + det = J[0] * J[3] - J[1] * J[2] + if fabs(det) < clip: + det = clip + invJ[0] = J[3] / det + invJ[1] = - J[1] / det + invJ[2] = - J[2] / det + invJ[3] = J[0] / det diff --git a/sktree/causal/_splitter.pxd b/sktree/causal/_splitter.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/sktree/causal/_splitter.pyx b/sktree/causal/_splitter.pyx new file mode 100644 index 000000000..e69de29bb diff --git a/sktree/causal/_utils.pxd b/sktree/causal/_utils.pxd new file mode 100644 index 000000000..55808d76e --- /dev/null +++ b/sktree/causal/_utils.pxd @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import numpy as np +cimport numpy as np + +ctypedef np.npy_float64 DTYPE_t # Type of X +ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight +ctypedef np.npy_intp SIZE_t # Type for indices and counters +ctypedef np.npy_int32 INT32_t # Signed 32 bit integer +ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer + +cpdef bint matinv(DOUBLE_t[::1, :] a, DOUBLE_t[::1, :] inv_a) nogil + +cdef bint matinv_(DOUBLE_t* a, DOUBLE_t* inv_a, int m) nogil + +cpdef void lstsq(DOUBLE_t[::1,:] a, DOUBLE_t[::1,:] b, DOUBLE_t[::1, :] sol, bint copy_b=*) nogil + +cdef void lstsq_(DOUBLE_t* a, DOUBLE_t* b, DOUBLE_t* sol, int m, int n, int ldb, int nrhs, bint copy_b=*) nogil + +cpdef void pinv(DOUBLE_t[::1,:] a, DOUBLE_t[::1, :] sol) nogil + +cdef void pinv_(DOUBLE_t* a, DOUBLE_t* sol, int m, int n) nogil + +cpdef double fast_max_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil + +cdef double fast_max_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil + +cpdef double fast_min_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil + +cdef double fast_min_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil \ No newline at end of file diff --git a/sktree/causal/_utils.pyx b/sktree/causal/_utils.pyx new file mode 100644 index 000000000..ac536fbbc --- /dev/null +++ b/sktree/causal/_utils.pyx @@ -0,0 +1,278 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +from libc.stdlib cimport free +from libc.stdlib cimport malloc +from libc.stdlib cimport calloc +from libc.stdlib cimport realloc +from libc.string cimport memcpy +from libc.math cimport log as ln +from libc.stdlib cimport abort + +from scipy.linalg.cython_lapack cimport dgelsy, dgetrf, dgetri, dgecon, dlacpy, dlange + +import numpy as np +cimport numpy as np +np.import_array() + +from sklearn.tree._utils cimport rand_int + + +rcond_ = np.finfo(np.float64).eps +cdef inline double RCOND = rcond_ + + +# ============================================================================= +# Linear Algebra Functions +# ============================================================================= + + +cpdef bint matinv(DOUBLE_t[::1, :] a, DOUBLE_t[::1, :] inv_a) nogil: + """ Compute matrix inverse and store it in inv_a. 
+ """ + cdef int m, n + m = a.shape[0] + if not (m == a.shape[1]): + raise ValueError("Can only invert square matrices!") + return matinv_(&a[0, 0], &inv_a[0, 0], m) + +cdef bint matinv_(DOUBLE_t* a, DOUBLE_t* inv_a, int m) nogil: + """ Compute matrix inverse of matrix a of size (m, m) and store it in inv_a. + """ + cdef: + int* pivot + DOUBLE_t* work + int lda, INFO, Lwork + bint failed + + lda = m + Lwork = m**2 + pivot = malloc(m * sizeof(int)) + work = malloc(Lwork * sizeof(DOUBLE_t)) + failed = False + if (pivot==NULL or work==NULL): + with gil: + raise MemoryError() + + try: + memcpy(inv_a, a, m * m * sizeof(DOUBLE_t)) + + #Conduct the LU factorization of the array a + dgetrf(&m, &m, inv_a, &lda, pivot, &INFO) + if not (INFO == 0): + failed = True + else: + #Now use the LU factorization and the pivot information to invert + dgetri(&m, inv_a, &lda, pivot, work, &Lwork, &INFO) + if not (INFO == 0): + failed = True + finally: + free(pivot) + free(work) + + return (not failed) + + + +cpdef void lstsq(DOUBLE_t[::1, :] a, DOUBLE_t[::1, :] b, DOUBLE_t[::1, :] sol, bint copy_b=True) nogil: + """ Compute solution to least squares problem min ||b - a sol||_2^2, + where a is a matrix of size (m, n), b is (m, nrhs). Store (n, nrhs) solution in sol. + The memory view b, must have at least max(m, n) rows. If m < n, then pad remainder with zeros. + If copy_b=True, then b is left unaltered on output. Otherwise b is altered by this call. + """ + cdef int m, n, nrhs + m = a.shape[0] + n = a.shape[1] + nrhs = b.shape[1] + ldb = b.shape[0] + if ldb < max(m, n): + with gil: + raise ValueError("Matrix b must have first dimension at least max(a.shape[0], a.shape[1]). " + "Please pad with zeros.") + if (sol.shape[0] != n) or (sol.shape[1] != nrhs): + with gil: + raise ValueError("Matrix sol must have dimensions (a.shape[1], b.shape[1]).") + lstsq_(&a[0, 0], &b[0, 0], &sol[0, 0], m, n, ldb, nrhs, copy_b) + + +cdef void lstsq_(DOUBLE_t* a, DOUBLE_t* b, DOUBLE_t* sol, int m, int n, int ldb, int nrhs, bint copy_b=True) nogil: + """ Compute solution to least squares problem min ||b - a sol||_2^2, + where a is a matrix of size (m, n), b is (m, nrhs). Store (n, nrhs) solution in sol. + The leading (row) dimension b, must be at least max(m, n). If m < n, then pad remainder with zeros. + If copy_b=True, then b is left unaltered on output. Otherwise b is altered by this call. + """ + cdef: + int lda, rank, info, lwork, n_out + double rcond + Py_ssize_t i, j + #array pointers + int* jpvt + double* work + double* b_copy + char* UPLO = 'O' #Any letter other then 'U' or 'L' will copy entire array + lda = m + if ldb < max(m, n): + with gil: + raise ValueError("Matrix b must have dimension at least max(a.shape[0], a.shape[1]). " + "Please pad with zeros.") + rcond = max(m, n) * RCOND + jpvt = calloc(n, sizeof(int)) + lwork = max(min(n, m) + 3 * n + 1, 2 * min(n, m) + nrhs) + work = malloc(lwork * sizeof(DOUBLE_t)) + + # TODO. can we avoid all this malloc and copying in our context? 
+ a_copy = calloc(lda * n, sizeof(DOUBLE_t)) + if copy_b: + b_copy = calloc(ldb * nrhs, sizeof(DOUBLE_t)) + else: + b_copy = b + try: + dlacpy(UPLO, &lda, &n, a, &lda, a_copy, &lda) + if copy_b: + dlacpy(UPLO, &ldb, &nrhs, b, &ldb, b_copy, &ldb) + + dgelsy(&m, &n, &nrhs, a_copy, &lda, b_copy, &ldb, + &jpvt[0], &rcond, &rank, &work[0], &lwork, &info) + + for i in range(n): + for j in range(nrhs): + sol[i + j * n] = b_copy[i + j * ldb] + + finally: + free(jpvt) + free(work) + free(a_copy) + if copy_b: + free(b_copy) + +cpdef void pinv(DOUBLE_t[::1,:] a, DOUBLE_t[::1, :] sol) nogil: + """ Compute pseudo-inverse of (m, n) matrix a and store it in (n, m) matrix sol. + Matrix a is left un-altered by this call. + """ + cdef int m = a.shape[0] + cdef int n = a.shape[1] + pinv_(&a[0, 0], &sol[0, 0], m, n) + +cdef void pinv_(DOUBLE_t* a, DOUBLE_t* sol, int m, int n) nogil: + """ Compute pseudo-inverse of (m, n) matrix a and store it in (n, m) matrix sol. + Matrix a is left un-altered by this call. + """ + # TODO. can we avoid this mallon in our context. Maybe create some fixed memory allocations? + cdef int ldb = max(m, n) + cdef double* b = calloc(ldb * m, sizeof(double)) + cdef Py_ssize_t i + for i in range(m): + b[i + i * ldb] = 1.0 + try: + lstsq_(a, b, sol, m, n, ldb, m, copy_b=False) + + finally: + free(b) + + +cpdef double fast_max_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil: + """ Calculate approximation of maximum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state + """ + return fast_max_eigv_(&A[0, 0], A.shape[0], reps, &random_state) + +cdef double fast_max_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil: + """ Calculate approximation of maximum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state + """ + cdef int t, i, j + cdef double normx, Anormx + cdef double* xnew + cdef double* xold + cdef double* temp + xnew = NULL + xold = NULL + + try: + xnew = calloc(n, sizeof(double)) + xold = calloc(n, sizeof(double)) + + if xnew == NULL or xold == NULL: + with gil: + raise MemoryError() + for i in range(n): + xold[i] = (1 - 2*rand_int(0, 2, random_state)) + for t in range(reps): + for i in range(n): + xnew[i] = 0 + for j in range(n): + xnew[i] += A[i + j * n] * xold[j] + temp = xold + xold = xnew + xnew = temp + normx = 0 + Anormx = 0 + for i in range(n): + normx += xnew[i] * xnew[i] + for j in range(n): + Anormx += xnew[i] * A[i + j * n] * xnew[j] + + return Anormx / normx + finally: + free(xnew) + free(xold) + + +cpdef double fast_min_eigv(DOUBLE_t[::1, :] A, int reps, UINT32_t random_state) nogil: + """ Calculate approximation of minimum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state + """ + return fast_min_eigv_(&A[0, 0], A.shape[0], reps, &random_state) + +cdef double fast_min_eigv_(DOUBLE_t* A, int n, int reps, UINT32_t* random_state) nogil: + """ Calculate approximation of minimum eigenvalue via randomized power iteration algorithm. + See e.g.: http://theory.stanford.edu/~trevisan/expander-online/lecture03.pdf + Use reps repetition and random seed based on random_state. 
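As an aside, a pure-NumPy sketch of the randomized power iteration used by `fast_max_eigv_`; the minimum-eigenvalue variant in `fast_min_eigv_` replaces the matrix-vector product with a least-squares solve against `A`. The function name below is hypothetical.

```python
import numpy as np

def power_iteration_max_eig(A, reps=10, seed=123):
    """Rayleigh-quotient estimate of the largest eigenvalue of A (NumPy analogue)."""
    rng = np.random.default_rng(seed)
    n = A.shape[0]
    x = rng.choice([-1.0, 1.0], size=n)   # random +/-1 start, as in fast_max_eigv_
    for _ in range(reps):
        x = A @ x                         # min-eig variant: x = solve(A, x) instead
    return (x @ A @ x) / (x @ x)
```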
+ """ + cdef int t, i, j + cdef double normx, Anormx + cdef double* xnew + cdef double* xold + cdef double* temp + cdef double* update + xnew = NULL + xold = NULL + + try: + xnew = calloc(n, sizeof(double)) + xold = calloc(n, sizeof(double)) + update = calloc(n, sizeof(double)) + + if xnew == NULL or xold == NULL or update == NULL: + with gil: + raise MemoryError() + for i in range(n): + xold[i] = (1 - 2*rand_int(0, 2, random_state)) + for t in range(reps): + lstsq_(A, xold, update, n, n, n, 1, copy_b=False) + for i in range(n): + xnew[i] = 0 + for j in range(n): + xnew[i] += update[i] + temp = xold + xold = xnew + xnew = temp + normx = 0 + Anormx = 0 + for i in range(n): + normx += xnew[i] * xnew[i] + for j in range(n): + Anormx += xnew[i] * A[i + j * n] * xnew[j] + + return Anormx / normx + finally: + free(xnew) + free(xold) + free(update) diff --git a/sktree/causal/base.py b/sktree/causal/base.py new file mode 100644 index 000000000..821f6d6e0 --- /dev/null +++ b/sktree/causal/base.py @@ -0,0 +1,7 @@ +# XXX: trying to sketch out a base causal API that extends "BaseEstimator" of sklearn +class BaseCausal: + def conditional_effect(self, X=None, *, T0, T1): + pass + + def marginal_effect(self, T, X=None): + pass diff --git a/sktree/causal/meson.build b/sktree/causal/meson.build new file mode 100644 index 000000000..784ca03f3 --- /dev/null +++ b/sktree/causal/meson.build @@ -0,0 +1,29 @@ +extensions = [ + '_grf_criterion', + '_splitter', + '_utils', +] + +foreach ext: extensions + py3.extension_module(ext, + cython_gen_cpp.process(ext + '.pyx'), + c_args: cython_c_args, + include_directories: [incdir_numpy], + install: true, + subdir: 'sktree/causal', + ) +endforeach + +python_sources = [ + '__init__.py', + 'tree.py', + 'meta.py', +] + +py3.install_sources( + python_sources, + subdir: 'sktree/causal' # Folder relative to site-packages to install to +) + +# TODO: comment in if we include tests +subdir('tests') \ No newline at end of file diff --git a/sktree/causal/meta.py b/sktree/causal/meta.py new file mode 100644 index 000000000..300dc5d98 --- /dev/null +++ b/sktree/causal/meta.py @@ -0,0 +1,399 @@ +import numbers +from itertools import product + +import numpy as np +from joblib import Parallel, delayed +from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier +from sklearn.linear_model import LassoCV, LogisticRegressionCV +from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, check_cv +from sklearn.utils import check_random_state +from sklearn.utils.validation import _check_fit_params, check_is_fitted, indexable + + +def _parallel_crossfit_nuisance( + estimator_treatment, + estimator_outcome, + X, + y, + t, + sample_weight, + train, + test, + verbose, + split_progress=None, # (split_idx, n_splits), +): + """Crossfitting nuisance functions in parallel. + + Parameters + ---------- + estimator_treatment : estimator object implementing 'fit' and 'predict' + The object to use to fit the data. + estimator_outcome : estimator object implementing 'fit' and 'predict' + The object to use to fit the data. + X : array-like of shape (n_samples, n_features) + The covariate data to fit. + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The outcome variable. + t : array-like of shape (n_samples,) or (n_samples, n_treatment_dims) or None + The treatment variable. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. 
Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + train : array-like of shape (n_train_samples,) + Indices of training samples. + test : array-like of shape (n_test_samples,) + Indices of test samples. + verbose : int + The verbosity level. + split_progress : {list, tuple} of int, default=None + A list or tuple of format (, ). + + Returns + ------- + result : dict with the following attributes + fit_time : float + Time spent for fitting in seconds. + score_time : float + Time spent for scoring in seconds. + estimator_outcome : estimator object + The fitted estimator. + estimator_treatment : estimator object + The fitted estimator. + fit_error : str or None + Traceback str if the fit failed, None if the fit succeeded. + test_idx : array-like of shape (n_test_samples,) + The indices of the test samples. + y_residuals : array-like of shape (n_test_samples,) + The residuals of ``y_test - estimator_outcome.predict(X_test)``. + t_residuals : array-like of shape (n_test_samples,) + The residuals of ``t_test - estimator_treatment.predict(X_test)``. + """ + progress_msg = "" + if verbose > 2: + if split_progress is not None: + progress_msg = f" {split_progress[0]+1}/{split_progress[1]}" + + if verbose > 1: + params_msg = "" + + if verbose > 9: + start_msg = f"[CV{progress_msg}] START {params_msg}" + print(f"{start_msg}{(80 - len(start_msg)) * '.'}") + + # fit the nuisance functions on the + X_train, X_test = X[train, :], X[test, :] + y_train, y_test = y[train, :], y[test, :] + t_train, t_test = t[train, :], t[test, :] + + y_pred = estimator_outcome.fit(X_train, y_train, sample_weight=sample_weight).predict(X_test) + t_pred = estimator_treatment.fit(X_train, t_train, sample_weight=sample_weight).predict(X_test) + + # compute the resulting residuals + y_residuals = y_test - y_pred + t_residuals = t_test - t_pred + + result = dict() + result["estimator_outcome"] = estimator_outcome + result["estimator_treatment"] = estimator_treatment + result["y_residuals"] = y_residuals + result["t_residuals"] = t_residuals + + return result + + +def check_outcome_estimator(estimator_outcome, y, random_state): + """Check outcome estimator and error-check. + + Parameters + ---------- + estimator_outcome : estimator object + The outcome estimator model. If None, then LassoCV will be used. + y : array-like of shape (n_samples, n_output) + The outcomes array. + random_state : int + Random seed. + + Returns + ------- + estimator_outcome : estimator object + The cloned outcome estimator model. + """ + if estimator_outcome is None: + return LassoCV(random_state=random_state) + else: + if not hasattr(estimator_outcome, "fit") or not hasattr(estimator_outcome, "predict"): + raise RuntimeError( + "All outcome estimator models must have the `fit` and `predict` functions implemented." + ) + + estimator_outcome = clone(estimator_outcome) + # XXX: run some checks on y being compatible with the estimator + if is_classifier(estimator_outcome) and not issubclass(y.type, numbers.Integral): + raise RuntimeError( + "Treatment array is not integers, but the treatment estimator is classification based. " + "If treatment array is discrete, then treatment estimator should be a classifier. " + "If treatment array is continuous, thent treatment estimator should be a regressor." 
+ ) + return estimator_outcome + + +def check_treatment_estimator(estimator_treatment, t, random_state): + """Check treatment estimator and error-check. + + Parameters + ---------- + estimator_treatment : estimator object + The treatment estimator model. If None, then LogisticRegressionCV will be used. + t : array-like of shape (n_samples, n_treatment_dims) + The treatment array. + random_state : int + Random seed. + + Returns + ------- + estimator_treatment : estimator object + The cloned treatment estimator model. + """ + if estimator_treatment is None: + estimator_treatment = LogisticRegressionCV(random_state=random_state) + else: + if not hasattr(estimator_treatment, "fit") or not hasattr(estimator_treatment, "predict"): + raise RuntimeError( + "All treatment estimator models must have the `fit` and `predict` functions implemented." + ) + + estimator_treatment = clone(estimator_treatment) + + # XXX: run some checks on t being compatible with the estimator. i.e. discrete -> classifier, otw regressor + if is_classifier(estimator_treatment) and not issubclass(t.type, numbers.Integral): + raise RuntimeError( + "Treatment array is not integers, but the treatment estimator is classification based. " + "If treatment array is discrete, then treatment estimator should be a classifier. " + "If treatment array is continuous, thent treatment estimator should be a regressor." + ) + + return estimator_treatment + + +class DML(MetaEstimatorMixin, BaseEstimator): + """A meta-estimator for performining double machine-learning. + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Estimator needs to provide a ``fit``, and ``predict_prob`` function. + estimator_outcome : estimator object, optional + The estimator model for the outcome, by default :class:`sklearn.linear_model.LassoCV`. + This is assumed to implement the scikit-learn estimator interface. Estimator needs + to provide a ``fit``, and ``predict_prob`` function. For more details, see Notes. + estimator_treatment : estimator object, optional + The estimator model for the treatment, by default :class:`sklearn.linear_model.LogisticRegressionCV`. + This is assumed to implement the scikit-learn estimator interface. Estimator needs to + provide a ``fit``, and ``predict_prob`` function. For more details, see Notes. + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + n_jobs : int, default=None + Number of jobs to run in parallel. 
Training the estimator and computing + the score are parallelized over the different training and test sets. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + estimator_outcome_ : list of estimator object + Fitted nuisance estimators of the outcome. + estimator_treatment_ : list of estimator object + Fitted nuisance estimators of the treatment. + estimator_final_ : estimator object + Fitted final-stage estimator. + + Notes + ----- + The CATE is modeled using the following equation: + + .. math :: + Y - \\E[Y | X, W] = \\Theta(X) \\cdot (T - \\E[T | X, W]) + \\epsilon + + where ``Y`` are the outcomes, ``X`` are the observed covariates that confound treatment, ``T`` + and outcomes. ``W`` are observed covariates that act as controls. :math:`\\epsilon`` is the noise + term of the data-generating model. The CATE is defined as :math:`Theta(X)`. + + Double machine learning (DML) follows a two stage process, where a set of nuisance functions + are estimated in the first stage in a crossfitting manner and a final stage estimates the CATE + model :footcite:`chernozhukov2018double`. See the User-guide for a description of two-stage learners, specifically on orthogonal + learning and doubly-robust learning. The two-stage procedure of DML follows the two steps: + + **1. Estimate nuisance functions** + There are two nuisance functions that need to be estimated, :math:`q(X, W) = E[Y | X, W]` and + :math:`f(X, W) = E[T | X, W]`, which are the models for the outcome and treatment + respectively. Here, one needs to run a regression (or classification model if treatments/outcomes + are discrete), that predicts outcomes and treatments given the covariates and control features. + + **2. Final stage regression of residuals on residuals** + Once :math:`q(X, W)` and :math:`f(X, W)` are estimated in the first stage, we compute + the residuals of the models with their observed outcome and treatment samples. + + .. math :: + \\tilde{Y} = Y - q(X, W) + \\tilde{T} = T - f(X, W) + + and then we fit a final stage model that takes the covariates, :math:`X` and :math:`\\tilde{T}`. That is + it uses the residuals of the treatment model and the covariates to predict the residuals of the outcome. + The fitted model represents the CATE. + + .. math :: + \\hat{\\theta} = \\arg\\min_{\\Theta}\ + \\E_n\\left[ (\\tilde{Y} - \\Theta(X) \\cdot \\tilde{T})^2 \\right] + + In the DML process, ``T`` and ``Y`` can be either discrete, or continuous and the resulting process will + be valid. For each stage of the DML process, one can use arbitrary sklearn-like estimators. + + **Estimation procedure with cross-fitting and cross-validation** + + In order to prevent overfitting to training data, the nuisance functions are estimated using training data + and the nuisance values (i.e. the residuals) are computed using the testing data. This is done on K-folds of + data, so the final residuals are computed from multiple model instantiations. If there is no cross-fitting, + then the nuisance functions are estimated and evaluated on the entire dataset. If there is at least K-fold + cross-fitting, then the data is split into K distinct groups and for each group, the model is trained on + the rest of the data, and evaluated on the held-out test group. 
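As an illustrative aside, the two-stage residual-on-residual procedure described above can be sketched with plain scikit-learn pieces; this is not the `DML` class itself, and the data-generating process (a constant effect of 2.0 with a continuous treatment) is made up for the example.

```python
import numpy as np
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.model_selection import cross_val_predict

rng = np.random.default_rng(0)
n = 500
X = rng.normal(size=(n, 5))
T = X[:, 0] + rng.normal(size=n)               # continuous treatment confounded by X
theta = 2.0                                    # constant treatment effect
Y = theta * T + X[:, 0] + rng.normal(size=n)

# Stage 1: cross-fitted nuisance estimates q(X) = E[Y | X] and f(X) = E[T | X]
y_res = Y - cross_val_predict(LassoCV(), X, Y, cv=5)
t_res = T - cross_val_predict(LassoCV(), X, T, cv=5)

# Stage 2: regress outcome residuals on treatment residuals to recover the effect
effect = LinearRegression(fit_intercept=False).fit(t_res.reshape(-1, 1), y_res)
print(effect.coef_)   # approximately [2.0]
```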
+ + The final model then takes as input the residuals of outcome, residuals of treatment and covariates to + estimate the CATE. + + References + ---------- + .. footbibliography:: + """ + + def __init__( + self, + estimator, + estimator_outcome=None, + estimator_treatment=None, + cv=None, + random_state=None, + n_jobs=None, + verbose=False, + ) -> None: + super().__init__() + self.estimator = estimator + self.estimator_outcome = estimator_outcome + self.estimator_treatment = estimator_treatment + self.cv = cv + self.n_jobs = n_jobs + self.random_state = random_state + self.verbose = verbose + + def fit(self, X, y, *, t=None, groups=None, **fit_params): + self._validate_params() + random_state = check_random_state(self.random_state) + + # run checks on input data + X, y = self._validate_data(X, y, multi_output=True, accept_sparse=False) + self._validate_data(y=t, multi_output=True, accept_sparse=False) + X, y, t, groups = indexable(X, y, t, groups) + fit_params = _check_fit_params(X, fit_params) + sample_weight = fit_params.get("sample_weight", None) + + # Determine output settings + _, self.n_features_in_ = X.shape + + # get cross-validation object + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) + n_splits = cv.get_n_splits(X, y, groups) + + # XXX: this should not be required. If the user wants, they can pass in + # a CV object + if cv != self.cv and isinstance(cv, (KFold, StratifiedKFold)): + cv.shuffle = True + cv.random_state = random_state + + final_stage_estimator = clone(self.estimator) + + # initialize the models for treatment and outcome + estimator_outcome = check_outcome_estimator( + estimator_outcome=self.estimator_outcome, y=y, random_state=random_state + ) + estimator_treatment = check_treatment_estimator( + estimator_treatment=self.estimator_treatment, t=t, random_state=random_state + ) + + # fit the nuisance functions in parallel + parallel = Parallel(n_jobs=self.n_jobs) + with parallel: + out = parallel( + delayed(_parallel_crossfit_nuisance)( + estimator_treatment, + estimator_outcome, + final_stage_estimator, + X=X, + y=y, + t=t, + sample_weight=sample_weight, + train=train, + test=test, + verbose=self.verbose, + split_progress=(split_idx, n_splits), + ) + for (split_idx, (train, test)) in enumerate(cv.split(X, y, groups)) + ) + + y_residuals = np.zeros(y.shape) + t_residuals = np.zeros(t.shape) + + for result in out: + test = result["test_idx"] + y_residuals[test, :] = result["y_residuals"] + t_residuals[test, :] = result["t_residuals"] + + # now fit the final stage estimator + final_stage_estimator.fit(X=X, y=y_residuals, t=t_residuals, **fit_params) + + def predict(self, X): + """Predict the conditional average treatment effect given covariates X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data. 
+ """ + pass + + def conditional_effect(self, X, t=None): + """Compute conditional effect of + + Parameters + ---------- + X : _type_ + _description_ + t : _type_, optional + _description_, by default None + """ + pass diff --git a/sktree/causal/tests/test_causal.py b/sktree/causal/tests/test_causal.py new file mode 100644 index 000000000..06a83b2f6 --- /dev/null +++ b/sktree/causal/tests/test_causal.py @@ -0,0 +1,20 @@ +import joblib +import numpy as np +import pytest +from numpy.testing import assert_almost_equal, assert_array_almost_equal, assert_array_equal +from sklearn import datasets +from sklearn.utils.estimator_checks import parametrize_with_checks + +from sktree.causal import CausalTree + + +@parametrize_with_checks( + [ + CausalTree(random_state=12), + ] +) +def test_sklearn_compatible_estimator(estimator, check): + # TODO: remove when we implement Regressor classes + if check.func.__name__ in ["check_requires_y_none"]: + pytest.skip() + check(estimator) diff --git a/sktree/causal/tests/test_utils.py b/sktree/causal/tests/test_utils.py new file mode 100644 index 000000000..b886f3bad --- /dev/null +++ b/sktree/causal/tests/test_utils.py @@ -0,0 +1,46 @@ +import numpy as np + +from sktree.causal._utils import matinv, lstsq, pinv, fast_max_eigv, fast_min_eigv + + +def test_fast_eigv(self): + rng = np.random.default_rng(123) + + n = 4 + for _ in range(10): + A = rng.normal(0, 1, size=(n, n)) + A = np.asfortranarray(A @ A.T) + apx = fast_min_eigv(A, 5, 123) + opt = np.min(np.linalg.eig(A)[0]) + np.testing.assert_allclose(apx, opt, atol=.01, rtol=.3) + apx = fast_max_eigv(A, 10, 123) + opt = np.max(np.linalg.eig(A)[0]) + np.testing.assert_allclose(apx, opt, atol=.5, rtol=.2) + + +def test_linalg(): + rng = np.random.default_rng(1234) + + for n, m, nrhs in [(3, 3, 3), (3, 2, 1), (3, 1, 2), (1, 4, 2), (3, 4, 5)]: + for _ in range(100): + A = rng.normal(0, 1, size=(n, m)) + y = rng.normal(0, 1, size=(n, nrhs)) + yf = y + if m > n: + yf = np.zeros((m, nrhs)) + yf[:n] = y + ours = np.asfortranarray(np.zeros((m, nrhs))) + lstsq(np.asfortranarray(A), np.asfortranarray(yf.copy()), ours, copy_b=True) + true = np.linalg.lstsq(A, y, rcond=np.finfo(np.float64).eps * max(n, m))[0] + np.testing.assert_allclose(ours, true, atol=.00001, rtol=.0) + + ours = np.asfortranarray(np.zeros(A.T.shape, dtype=np.float64)) + pinv(np.asfortranarray(A), ours) + true = np.linalg.pinv(A) + np.testing.assert_allclose(ours, true, atol=.00001, rtol=.0) + + if n == m: + ours = np.asfortranarray(np.zeros(A.T.shape, dtype=np.float64)) + matinv(np.asfortranarray(A), ours) + true = np.linalg.inv(A) + np.testing.assert_allclose(ours, true, atol=.00001, rtol=.0) diff --git a/sktree/causal/tree.py b/sktree/causal/tree.py new file mode 100644 index 000000000..f629523e9 --- /dev/null +++ b/sktree/causal/tree.py @@ -0,0 +1,391 @@ +from sklearn.base import BaseEstimator +from sklearn.ensemble._forest import BaseForest, ForestRegressor +from sklearn.tree import BaseDecisionTree, DecisionTreeClassifier, DecisionTreeRegressor + + +class CausalTree(BaseEstimator): + def __init__( + self, + tree=None, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + class_weight=None, + ccp_alpha=0.0, + honest_ratio=0.5, + **tree_params, + ) -> None: + """A causal tree. 
+ + A causal tree is a decision tree model that operates over not only + ``(X, y)`` tuple of arrays, but also includes the treatment groups. + It is an estimator model that has a graphical model of the form + :math:`T \\rightarrow Y` with :math:`T \\leftarrow X \\rightarrow Y`. + + Parameters + ---------- + tree : _type_, optional + _description_, by default None + criterion : str, optional + _description_, by default "gini" + splitter : str, optional + _description_, by default "best" + max_depth : _type_, optional + _description_, by default None + min_samples_split : int, optional + _description_, by default 2 + min_samples_leaf : int, optional + _description_, by default 1 + min_weight_fraction_leaf : float, optional + _description_, by default 0.0 + max_features : _type_, optional + _description_, by default None + random_state : _type_, optional + _description_, by default None + max_leaf_nodes : _type_, optional + _description_, by default None + min_impurity_decrease : float, optional + _description_, by default 0.0 + class_weight : _type_, optional + _description_, by default None + ccp_alpha : float, optional + _description_, by default 0.0 + honest_ratio : float, optional + _description_, by default 0.5 + + Notes + ----- + Compared to a normal decision tree model, which estimates the value :math:`E[Y | X]`, + a causal decision tree splits the data in such a way to estimate the value + :math:`E[Y | do(T), X]`. This is a causal effect, and thus requires causal assumptions. + The core assumption is that of unconfoundedness, where the outcomes, ``Y``, and the + treatments, ``T`` are not confounded by any other non-observed variables. That is ``X`` + captures all possible confounders. This is typically a strong assumption, but is realized + in well-designed observational studies and randomized control trials, where the treatment + assignment is randomized. + """ + self.tree = tree + self.criterion = criterion + self.splitter = splitter + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.class_weight = class_weight + self.ccp_alpha = ccp_alpha + self.honest_ratio = honest_ratio + self.tree_params = tree_params + + def fit(self, X, y, sample_weight=None, T=None): + # initialize the tree estimator + if self.tree is None: + self.estimator_ = DecisionTreeClassifier( + criterion=self.criterion, + splitter=self.splitter, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + random_state=self.random_state, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + class_weight=self.class_weight, + ccp_alpha=self.ccp_alpha, + honest_ratio=1.0, + **self.tree_params, + ) + else: + if not issubclass(self.tree, BaseDecisionTree): + raise RuntimeError( + "The type of tree must be a subclass of sklearn.tree.BaseDecisionTree." 
+                )
+
+            self.estimator_ = self.tree(
+                criterion=self.criterion,
+                splitter=self.splitter,
+                max_depth=self.max_depth,
+                min_samples_split=self.min_samples_split,
+                min_samples_leaf=self.min_samples_leaf,
+                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
+                max_features=self.max_features,
+                random_state=self.random_state,
+                max_leaf_nodes=self.max_leaf_nodes,
+                min_impurity_decrease=self.min_impurity_decrease,
+                class_weight=self.class_weight,
+                ccp_alpha=self.ccp_alpha,
+                honest_ratio=1.0,
+                **self.tree_params,
+            )
+
+        # fit the decision tree using only the treatment groups
+        self.estimator_.fit(X, y=T, sample_weight=sample_weight)
+
+        # re-fit the tree leaves using the outcomes
+        refit_leaves(self.estimator_, X, y)
+
+        return self
+
+    def apply(self, X):
+        return self.estimator_.apply(X)
+
+    def predict(self, X):
+        return self.estimator_.predict(X)
+
+    def predict_proba(self, X):
+        return self.estimator_.predict_proba(X)
+
+    def decision_path(self, X):
+        return self.estimator_.decision_path(X)
+
+    @property
+    def feature_importances_(self):
+        """Return the feature importances.
+
+        The importance of a feature is computed as the (normalized) total
+        reduction of the criterion brought by that feature.
+        It is also known as the Gini importance.
+
+        Warning: impurity-based feature importances can be misleading for
+        high cardinality features (many unique values). See
+        :func:`sklearn.inspection.permutation_importance` as an alternative.
+
+        Returns
+        -------
+        feature_importances_ : ndarray of shape (n_features,)
+            Normalized total reduction of criteria by feature
+            (Gini importance).
+        """
+        return self.estimator_.feature_importances_
+
+
+class ForestCausalRegressor(BaseForest):
+    @abstractmethod
+    def __init__(
+        self,
+        estimator,
+        n_estimators=100,
+        *,
+        estimator_params=tuple(),
+        bootstrap=False,
+        oob_score=False,
+        n_jobs=None,
+        random_state=None,
+        verbose=0,
+        warm_start=False,
+        max_samples=None,
+        base_estimator="deprecated",
+    ):
+        super().__init__(
+            estimator,
+            n_estimators=n_estimators,
+            estimator_params=estimator_params,
+            bootstrap=bootstrap,
+            oob_score=oob_score,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose,
+            warm_start=warm_start,
+            max_samples=max_samples,
+            base_estimator=base_estimator,
+        )
+
+    def predict(self, X):
+        """
+        Predict regression target for X.
+
+        The predicted regression target of an input sample is computed as the
+        mean predicted regression targets of the trees in the forest.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The input samples. Internally, its dtype will be converted to
+            ``dtype=np.float32``. If a sparse matrix is provided, it will be
+            converted into a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
+            The predicted values.
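+
+        Notes
+        -----
+        Per-tree predictions are not stored individually; each tree's
+        prediction is accumulated into a shared output buffer under a lock and
+        the sum is then divided by the number of fitted trees.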
+ """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64) + else: + y_hat = np.zeros((X.shape[0]), dtype=np.float64) + + # Parallel loop + lock = threading.Lock() + Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")( + delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) for e in self.estimators_ + ) + + y_hat /= len(self.estimators_) + + return y_hat + + @staticmethod + def _get_oob_predictions(tree, X): + """Compute the OOB predictions for an individual tree. + + Parameters + ---------- + tree : DecisionTreeRegressor object + A single decision tree regressor. + X : ndarray of shape (n_samples, n_features) + The OOB samples. + + Returns + ------- + y_pred : ndarray of shape (n_samples, 1, n_outputs) + The OOB associated predictions. + """ + y_pred = tree.predict(X, check_input=False) + if y_pred.ndim == 1: + # single output regression + y_pred = y_pred[:, np.newaxis, np.newaxis] + else: + # multioutput regression + y_pred = y_pred[:, np.newaxis, :] + return y_pred + + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + """Compute and set the OOB score and attributes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + scoring_function : callable, default=None + Scoring function for OOB score. Defaults to `r2_score`. + """ + self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze(axis=1) + if self.oob_prediction_.shape[-1] == 1: + # drop the n_outputs axis if there is a single output + self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1) + + if scoring_function is None: + scoring_function = r2_score + + self.oob_score_ = scoring_function(y, self.oob_prediction_) + + def _compute_partial_dependence_recursion(self, grid, target_features): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray of shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_features : ndarray of shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + + Returns + ------- + averaged_predictions : ndarray of shape (n_samples,) + The value of the partial dependence function on each grid point. + """ + grid = np.asarray(grid, dtype=DTYPE, order="C") + averaged_predictions = np.zeros(shape=grid.shape[0], dtype=np.float64, order="C") + + for tree in self.estimators_: + # Note: we don't sum in parallel because the GIL isn't released in + # the fast method. 
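+            # ``compute_partial_dependence`` adds each tree's contribution into
+            # ``averaged_predictions`` in place; the division below turns the
+            # accumulated sum into a forest average.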
+ tree.tree_.compute_partial_dependence(grid, target_features, averaged_predictions) + # Average over the forest + averaged_predictions /= len(self.estimators_) + + return averaged_predictions + + def _more_tags(self): + return {"multilabel": True} + + +# XXX: figure out how to make this either: +# - DML tree +# - DR tree +# - OrthoDML/DR tree +class CausalForest(ForestRegressor): + _parameter_constraints: dict = { + **ForestRegressor._parameter_constraints, + **DecisionTreeRegressor._parameter_constraints, + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + honest_ratio=0.5, + ): + super().__init__( + estimator=CausalTree(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "honest_ratio", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.honest_ratio = honest_ratio diff --git a/sktree/meson.build b/sktree/meson.build index 974e52d1f..de0076761 100644 --- a/sktree/meson.build +++ b/sktree/meson.build @@ -39,16 +39,12 @@ incdir_numpy = run_command(py3, ).stdout().strip() # inc_np = include_directories(incdir_numpy) +# only use non-deprecated Numpy API +numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' +cython_c_args = '-NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' + cc = meson.get_compiler('c') -# Don't use the deprecated NumPy C API. Define this to a fixed version instead of -# NPY_API_VERSION in order not to break compilation for released versions -# when NumPy introduces a new deprecation. Use in a meson.build file:: -# -# py3.extension_module('_name', -# 'source_fname', -# numpy_nodepr_api) -numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_9_API_VERSION' python_sources = [ '__init__.py', @@ -77,6 +73,7 @@ c_undefined_ok = ['-Wno-maybe-uninitialized'] # cython_c_args += ['-Wno-cpp'] # cython_cpp_args = cython_c_args +subdir('causal') subdir('tree') subdir('ensemble') subdir('tests') \ No newline at end of file diff --git a/sktree/tree/meson.build b/sktree/tree/meson.build index 6d2e330e2..4f9228a1c 100644 --- a/sktree/tree/meson.build +++ b/sktree/tree/meson.build @@ -15,13 +15,11 @@ foreach ext: extensions cython_gen_cpp.process(ext + '.pyx'), c_args: cython_c_args, include_directories: [incdir_numpy], - # override_options : ['cython_language=cpp'], install: true, subdir: 'sktree/tree', ) endforeach -# TODO: comment in _classes.py when we have a working Cython unsupervised tree with a Python API python_sources = [ '__init__.py', '_classes.py',