From 73e8173e3bce9cf3ee724fde76039e2ea2604078 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 23 Jul 2024 12:07:23 -0400 Subject: [PATCH 1/7] Initial commit Signed-off-by: Adam Li --- examples/manifold/README.txt | 6 + .../manifold/plot_kernel_decision_tree.py | 106 ++++ treeple/tree/__init__.py | 2 + treeple/tree/_classes.py | 497 ++++++++++++++++++ treeple/tree/manifold/_morf_splitter.pxd | 11 +- treeple/tree/manifold/_morf_splitter.pyx | 109 ++++ 6 files changed, 728 insertions(+), 3 deletions(-) create mode 100644 examples/manifold/README.txt create mode 100644 examples/manifold/plot_kernel_decision_tree.py diff --git a/examples/manifold/README.txt b/examples/manifold/README.txt new file mode 100644 index 000000000..8b1808911 --- /dev/null +++ b/examples/manifold/README.txt @@ -0,0 +1,6 @@ +.. _manifold_examples: + +Manifold learning with Decision-trees +------------------------------------- + +Examples demonstrating manifold learning using random forest variants. diff --git a/examples/manifold/plot_kernel_decision_tree.py b/examples/manifold/plot_kernel_decision_tree.py new file mode 100644 index 000000000..954aef7be --- /dev/null +++ b/examples/manifold/plot_kernel_decision_tree.py @@ -0,0 +1,106 @@ +""" +====================================== +Custom Kernel Decision Tree Classifier +====================================== + +This example shows how to build a manifold oblique decision tree classifier using +a custom set of user-defined kernel/filter library, such as the Gaussian, or Gabor +kernels. + +The example demonstrates superior performance on a 2D dataset with structured images +as samples. The dataset is the downsampled MNIST dataset, where each sample is a +28x28 image. The dataset is downsampled to 14x14, and then flattened to a 196 +dimensional vector. The dataset is then split into a training and testing set. + +See :ref:`sphx_glr_auto_examples_plot_projection_matrices` for more information on +projection matrices and the way they can be sampled. +""" +import matplotlib.pyplot as plt + +# %% +# Importing the necessary modules +import numpy as np +from sklearn.datasets import fetch_openml +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split + +from treeple.tree import KernelDecisionTreeClassifier + +# %% +# Load the Dataset +# ---------------- +# We need to load the dataset and split it into training and testing sets. + +# Load the dataset +X, y = fetch_openml("mnist_784", version=1, return_X_y=True) + +# Downsample the dataset +X = X.reshape((-1, 28, 28)) +X = X[:, ::2, ::2] +X = X.reshape((-1, 196)) + +# Split the dataset into training and testing sets +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +# %% +# Setting up the Custom Kernel Decision Tree Model +# ------------------------------------------------- +# To set up the custom kernel decision tree model, we need to define typical hyperparameters +# for the decision tree classifier, such as the maximum depth of the tree and the minimum +# number of samples required to split an internal node. For the Kernel Decision tree model, +# we also need to define the kernel function and its parameters. + +max_depth = 10 +min_samples_split = 2 + +# Next, we define the hyperparameters for the custom kernels that we will use. 
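+# Before picking these values, it can help to see what one of these kernels actually
+# looks like. The snippet below is a minimal, self-contained sketch that mirrors the
+# ``gaussian_kernel`` helper added elsewhere in this PR; ``make_gaussian_kernel`` is a
+# local name used only for illustration here and is not part of the treeple API.
+import numpy as np
+
+
+def make_gaussian_kernel(shape, sigma=1.0, mu=0.0):
+    """Evaluate an isotropic Gaussian on a [-1, 1] grid and normalize it to sum to one."""
+    grid = np.meshgrid(*[np.linspace(-1, 1, s) for s in shape])
+    dist = np.sqrt(np.sum([g * g for g in grid], axis=0))
+    kernel = np.exp(-((dist - mu) ** 2) / (2.0 * sigma**2))
+    return kernel / np.sum(kernel)
+
+
+# a 3x3 Gaussian kernel with sigma=1.0, matching the parameters used below
+print(make_gaussian_kernel((3, 3), sigma=1.0).round(3))
+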
+# For example, if we want to use a Gaussian kernel with a sigma of 1.0 and a size of 3x3:
+kernel_function = "gaussian"
+kernel_params = {"sigma": 1.0, "size": (3, 3)}
+
+# We can then fit the custom kernel decision tree model to the training set:
+clf = KernelDecisionTreeClassifier(
+    max_depth=max_depth,
+    min_samples_split=min_samples_split,
+    data_dims=(14, 14),
+    min_patch_dims=(1, 1),
+    max_patch_dims=(14, 14),
+    dim_contiguous=(True, True),
+    boundary=None,
+    # note: the class labels are inferred from ``y`` during ``fit``
+    kernel=kernel_function,
+    n_kernels=500,
+    store_kernel_library=True,
+)
+
+# Fit the decision tree classifier using the custom kernel
+clf.fit(X_train, y_train)
+
+# %%
+# Evaluating the Custom Kernel Decision Tree Model
+# ------------------------------------------------
+# To evaluate the custom kernel decision tree model, we can use the testing set.
+# We can also inspect the important kernels that the tree selected.
+
+# Predict the labels for the testing set
+y_pred = clf.predict(X_test)
+
+# Compute the accuracy score
+accuracy = accuracy_score(y_test, y_pred)
+
+print(f"Kernel decision tree model obtained an accuracy of {accuracy:.3f} on MNIST.")
+
+# Get the important kernels from the decision tree classifier
+important_kernels = clf.kernel_arr_
+kernel_dims = clf.kernel_dims_
+kernel_params = clf.kernel_params_
+kernel_library = clf.kernel_library_
+
+# Plot the important kernels
+fig, axes = plt.subplots(
+    nrows=len(important_kernels), ncols=1, figsize=(6, 4 * len(important_kernels))
+)
+for i, kernel in enumerate(important_kernels):
+    axes[i].imshow(kernel, cmap="gray")
+    axes[i].set_title("Kernel {}".format(i + 1))
+plt.show()
\ No newline at end of file
diff --git a/treeple/tree/__init__.py b/treeple/tree/__init__.py
index 797338ac3..d6a4beab7 100644
--- a/treeple/tree/__init__.py
+++ b/treeple/tree/__init__.py
@@ -7,6 +7,7 @@
 from ._classes import (
     ExtraObliqueDecisionTreeClassifier,
     ExtraObliqueDecisionTreeRegressor,
+    KernelDecisionTreeClassifier,
     ObliqueDecisionTreeClassifier,
     ObliqueDecisionTreeRegressor,
     PatchObliqueDecisionTreeClassifier,
@@ -34,4 +35,5 @@
     "ExtraTreeClassifier",
     "ExtraTreeRegressor",
     "MultiViewDecisionTreeClassifier",
+    "KernelDecisionTreeClassifier",
 ]
diff --git a/treeple/tree/_classes.py b/treeple/tree/_classes.py
index 16eb6ea52..76a9052a2 100644
--- a/treeple/tree/_classes.py
+++ b/treeple/tree/_classes.py
@@ -23,6 +23,7 @@
 from ._neighbors import SimMatrixMixin
 from ._oblique_splitter import ObliqueSplitter
 from ._oblique_tree import ObliqueTree
+from .kernels import gaussian_kernel
 from .manifold import _morf_splitter
 from .manifold._morf_splitter import PatchSplitter
 from .unsupervised import _unsup_criterion, _unsup_oblique_splitter, _unsup_splitter
@@ -3237,3 +3238,499 @@ def _build_tree(
         builder.build(self.tree_, X, y, sample_weight)
 
         return self
+
+
+class KernelDecisionTreeClassifier(PatchObliqueDecisionTreeClassifier):
+    """Oblique decision tree classifier over data patches combined with Gaussian kernels.
+    Parameters
+    ----------
+    criterion : {"gini", "entropy"}, default="gini"
+        The function to measure the quality of a split. Supported criteria are
+        "gini" for the Gini impurity and "entropy" for the information gain.
+    splitter : {"best", "random"}, default="best"
+        The strategy used to choose the split at each node. Supported
+        strategies are "best" to choose the best split and "random" to choose
+        the best random split.
+    max_depth : int, default=None
+        The maximum depth of the tree. 
If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + max_features : int, float or {"auto", "sqrt", "log2"}, default=None + The number of features to consider when looking for the best split: + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `int(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + Note: Compared to axis-aligned Random Forests, one can set + max_features to a number greater then ``n_features``. + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. The features are always + randomly permuted at each split, even if ``splitter`` is set to + ``"best"``. When ``max_features < n_features``, the algorithm will + select ``max_features`` at random at each split before finding the best + split among them. But the best found split may vary across different + runs, even if ``max_features=n_features``. That is the case, if the + improvement of the criterion is identical for several splits and one + split has to be selected at random. To obtain a deterministic behaviour + during fitting, ``random_state`` has to be fixed to an integer. + See :term:`Glossary ` for details. + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + The weighted impurity decrease equation is the following:: + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. 
+ ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + class_weight : dict, list of dict or "balanced", default=None + Weights associated with classes in the form ``{class_label: weight}``. + If None, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + For multi-output, the weights of each column of y will be multiplied. + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + min_patch_dim : array-like, optional + The minimum dimensions of a patch, by default 1 along all dimensions. + max_patch_dim : array-like, optional + The maximum dimensions of a patch, by default 1 along all dimensions. + dim_contiguous : array-like of bool, optional + Whether or not each patch is sampled contiguously along this dimension. + data_dims : array-like, optional + The presumed dimensions of the un-vectorized feature vector, by default + will be a 1D vector with (1, n_features) shape. + boundary : optional, str {'wrap'} + The boundary condition to use when sampling patches, by default None. + 'wrap' corresponds to the boundary condition as is in numpy and scipy. + feature_weight : array-like of shape (n_samples,n_features,), default=None + Feature weights. If None, then features are equally weighted as is. + If provided, then the feature weights are used to weight the + patches that are generated. The feature weights are used + as follows: for every patch that is sampled, the feature weights over + the entire patch is summed and normalizes the patch. + kernel : str {'gaussian', 'uniform'}, default='gaussian' + The kernel to use. + n_kernels : int, optional + The number of different kernels to generate. This number should be very high + as this generates kernels + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) or list of ndarray + The classes labels (single output problem), + or a list of arrays of class labels (multi-output problem). + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance [4]_. + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + max_features_ : int + The inferred value of max_features. + n_classes_ : int or list of int + The number of classes (for single output problems), + or a list containing the number of classes for each + output (for multi-output problems). + n_features_in_ : int + Number of features seen during :term:`fit`. + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. 
Defined only when `X`
+        has feature names that are all strings.
+    n_outputs_ : int
+        The number of outputs when ``fit`` is performed.
+    tree_ : Tree instance
+        The underlying Tree object. Please refer to
+        ``help(sklearn.tree._tree.Tree)`` for
+        attributes of Tree object.
+    min_patch_dims_ : array-like
+        The minimum dimensions of a patch.
+    max_patch_dims_ : array-like
+        The maximum dimensions of a patch.
+    data_dims_ : array-like
+        The presumed dimensions of the un-vectorized feature vector.
+    kernel_arr_ : list of length (n_nodes,) of array-like of shape (patch_dims,)
+        The kernel array that is applied to the patches for this tree.
+        The order is in the same order in which the tree traverses the nodes.
+    kernel_params_ : list of length (n_nodes,)
+        The parameters of the kernel that is applied to the patches for this tree.
+        The order is in the same order in which the tree traverses the nodes.
+    kernel_library_ : array-like of shape (n_kernels,), optional
+        The library of kernels that was chosen from the patches for this tree.
+        Only stored if ``store_kernel_library`` is set to True.
+    kernel_library_params_ : list of length (n_nodes,), optional
+        The parameters of the kernels that were chosen from the patches for this tree.
+        The order is in the same order in which the tree traverses the nodes.
+        Only stored if ``store_kernel_library`` is set to True.
+    References
+    ----------
+    .. footbibliography::
+    """
+
+    _parameter_constraints = {
+        **DecisionTreeClassifier._parameter_constraints,
+        "min_patch_dims": ["array-like", None],
+        "max_patch_dims": ["array-like", None],
+        "data_dims": ["array-like", None],
+        "dim_contiguous": ["array-like", None],
+        "boundary": [str, None],
+        "feature_weight": ["array-like", None],
+        "kernel": ["str"],
+        "n_kernels": [int, None],
+        "store_kernel_library": [bool],
+    }
+
+    def __init__(
+        self,
+        *,
+        criterion="gini",
+        splitter="best",
+        max_depth=None,
+        min_samples_split=2,
+        min_samples_leaf=1,
+        min_weight_fraction_leaf=0.0,
+        max_features=None,
+        random_state=None,
+        max_leaf_nodes=None,
+        min_impurity_decrease=0.0,
+        class_weight=None,
+        min_patch_dims=None,
+        max_patch_dims=None,
+        dim_contiguous=None,
+        data_dims=None,
+        boundary=None,
+        feature_weight=None,
+        kernel="gaussian",
+        n_kernels=None,
+        store_kernel_library=False,
+    ):
+        super().__init__(
+            criterion=criterion,
+            splitter=splitter,
+            max_depth=max_depth,
+            min_samples_split=min_samples_split,
+            min_samples_leaf=min_samples_leaf,
+            min_weight_fraction_leaf=min_weight_fraction_leaf,
+            max_features=max_features,
+            max_leaf_nodes=max_leaf_nodes,
+            class_weight=class_weight,
+            random_state=random_state,
+            min_impurity_decrease=min_impurity_decrease,
+        )
+
+        self.min_patch_dims = min_patch_dims
+        self.max_patch_dims = max_patch_dims
+        self.dim_contiguous = dim_contiguous
+        self.data_dims = data_dims
+        self.boundary = boundary
+        self.feature_weight = feature_weight
+
+        self.kernel = kernel
+        self.n_kernels = n_kernels
+        self.store_kernel_library = store_kernel_library
+
+    def _build_tree(
+        self,
+        X,
+        y,
+        sample_weight,
+        min_samples_leaf,
+        min_weight_leaf,
+        max_leaf_nodes,
+        min_samples_split,
+        max_depth,
+        random_state,
+    ):
+        """Build the actual tree.
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The training input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csc_matrix``.
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            The target values (class labels) as integers or strings.
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights. If None, then samples are equally weighted. Splits
+            that would create child nodes with net zero or negative weight are
+            ignored while searching for a split in each node. Splits are also
+            ignored if they would result in any single class carrying a
+            negative weight in either child node.
+        min_samples_leaf : int or float
+            The minimum number of samples required to be at a leaf node.
+        min_weight_leaf : float, default=0.0
+            The minimum weighted fraction of the sum total of weights.
+        max_leaf_nodes : int, default=None
+            Grow a tree with ``max_leaf_nodes`` in best-first fashion.
+        min_samples_split : int or float, default=2
+            The minimum number of samples required to split an internal node.
+        max_depth : int, default=None
+            The maximum depth of the tree. If None, then nodes are expanded until
+            all leaves are pure or until all leaves contain less than
+            min_samples_split samples.
+        random_state : int, RandomState instance or None, default=None
+            Controls the randomness of the estimator.
+        """
+        # Build tree
+        criterion = self.criterion
+        if not isinstance(criterion, BaseCriterion):
+            criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_)
+        else:
+            # Make a deepcopy in case the criterion has mutable attributes that
+            # might be shared and modified concurrently during parallel fitting
+            criterion = copy.deepcopy(criterion)
+
+        splitter = self.splitter
+        if issparse(X):
+            raise ValueError(
+                "Sparse input is not supported for oblique trees. "
+                "Please convert your data to a dense array."
+            )
+        else:
+            PATCH_SPLITTERS = PATCH_DENSE_SPLITTERS
+
+        # compute the user-defined kernel library before building the tree
+        kernel_library, kernel_dims, kernel_params = self._sample_kernel_library(X, y, sample_weight)
+
+        # Note: users cannot define a splitter themselves
+        splitter = PATCH_SPLITTERS[self.splitter](
+            criterion,
+            self.max_features_,
+            min_samples_leaf,
+            min_weight_leaf,
+            random_state,
+            self.min_patch_dims_,
+            self.max_patch_dims_,
+            self.dim_contiguous_,
+            self.data_dims_,
+            self.boundary,
+            self.feature_weight,
+            kernel_library,
+        )
+
+        self.tree_ = ObliqueTree(self.n_features_in_, self.n_classes_, self.n_outputs_)
+
+        # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
+        if max_leaf_nodes < 0:
+            builder = DepthFirstTreeBuilder(
+                splitter,
+                min_samples_split,
+                min_samples_leaf,
+                min_weight_leaf,
+                max_depth,
+                self.min_impurity_decrease,
+            )
+        else:
+            builder = BestFirstTreeBuilder(
+                splitter,
+                min_samples_split,
+                min_samples_leaf,
+                min_weight_leaf,
+                max_depth,
+                max_leaf_nodes,
+                self.min_impurity_decrease,
+            )
+
+        builder.build(self.tree_, X, y, sample_weight)
+
+        # Set some attributes based on what kernel indices were used in each tree
+        # - construct a tree-nodes like array and set it as a Python attribute, so it is
+        # exposed to the Python interface
+        # - use the Cython tree.feature array to store the index of the dictionary library
+        # that was used to split at each node
+        kernel_idx_chosen = self.tree_.feature
+        kernel_lib_chosen = kernel_library[kernel_idx_chosen]
+        kernel_params_chosen = kernel_params[kernel_idx_chosen]
+        self.kernel_arr_ = kernel_lib_chosen
+        self.kernel_dims_ = kernel_dims[kernel_idx_chosen]
+        self.kernel_params_ = kernel_params_chosen
+
+        if self.store_kernel_library:
+            self.kernel_library_ = kernel_library
+            self.kernel_idx_chosen_ = 
kernel_idx_chosen + + if self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + + def _sample_kernel_library(self, X, y, sample_weight): + """Samples the dictionary library that is sampled from in the random forest. + A kernel can either be applied with the boundaries of the image in mind, such that + the patch can be uniformly applied across all indices of rows of ``X_structured``. This + is equivalent to passing in ``boundary = 'wrap'``. Alternatively, a kernel can be sampled, + such that the patch is always contained within the boundary of the image. This is equivalent + to passing in ``boundary = None``. + Parameters + ---------- + X : _type_ + _description_ + y : _type_ + _description_ + sample_weight : _type_ + _description_ + Returns + ------- + kernel_library : array-like of shape (n_kernels, n_features) + The dictionary that will be sampled from the random forest. Non-zero entries indicate + where the kernel is applied. This is a n-dimensional kernel matrix that is vectorized + and placed into the original ``X`` data dimensions of (n_dims,). + kernel_dims : list of (n_kernels,) length + The dimensions of each kernel that is sampled. For example, (n_dims_x, n_dims_y) in + the first element indicates that the first kernel has a shape of (n_dims_x, n_dims_y). + This can be used to then place where the top-left seed of each kernel is applied. + kernel_params : list of (n_kernels,) length + A list of dictionaries representing the parameters of each kernel. This is used to + keep track of what kernels and parameterizations were valuable when used in the random + forest. + """ + raise NotImplementedError("This method should be implemented in a child class.") + + +class GaussianKernelDecisionTreeClassifier(KernelDecisionTreeClassifier): + _parameter_constraints = { + **KernelDecisionTreeClassifier._parameter_constraints, + # "mu_bounds": ['array-like'], + # "sigma_bounds": ['array-like'], + } + + def __init__( + self, + *, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0, + class_weight=None, + min_patch_dims=None, + max_patch_dims=None, + dim_contiguous=None, + data_dims=None, + boundary=None, + feature_weight=None, + n_kernels=None, + store_kernel_library=False, + mu_bounds=(0, 1), + sigma_bounds=(0, 1), + ): + super().__init__( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + random_state=random_state, + max_leaf_nodes=max_leaf_nodes, + min_impurity_decrease=min_impurity_decrease, + class_weight=class_weight, + min_patch_dims=min_patch_dims, + max_patch_dims=max_patch_dims, + dim_contiguous=dim_contiguous, + data_dims=data_dims, + boundary=boundary, + feature_weight=feature_weight, + kernel="gaussian", + n_kernels=n_kernels, + store_kernel_library=store_kernel_library, + ) + self.mu_bounds = mu_bounds + self.sigma_bounds = sigma_bounds + + def _sample_kernel_library(self, X, y, sample_weight): + """Samples a gaussian kernel library.""" + rng = np.random.default_rng(self.random_state) + kernel_library = [] + kernel_params = [] + kernel_dims = [] + + # Sample the kernel library + ndim = len(self.data_dims_) + for _ in range(self.n_kernels): + patch_shape = [] + for idx in range(ndim): + # Note: By 
constraining the max patch height/width to be at least the min
+                # patch height/width, we ensure that the minimum value of
+                # patch_height and patch_width is 1
+                patch_dim = rng.integers(self.min_patch_dims_[idx], self.max_patch_dims_[idx] + 1)
+
+                # sample the possible patch shape given input parameters
+                if self.boundary == "wrap":
+                    # add circular boundary conditions
+                    delta_patch_dim = self.data_dims_[idx] + 2 * (patch_dim - 1)
+
+                    # sample the top left index for this dimension
+                    top_left_patch_seed = rng.integers(0, delta_patch_dim)
+
+                    # compute the offset of the top-left index within the padded dimension
+                    dim = top_left_patch_seed % delta_patch_dim
+
+                    # resample the patch dimension due to padding
+                    patch_dim = min(
+                        patch_dim, min(dim + 1, self.data_dims_[idx] + patch_dim - dim - 1)
+                    )
+
+                patch_shape.append(patch_dim)
+
+            patch_shape = np.array(patch_shape)
+
+            # sample the sigma and mu parameters
+            sigma = rng.uniform(low=self.sigma_bounds[0], high=self.sigma_bounds[1])
+            mu = rng.uniform(low=self.mu_bounds[0], high=self.mu_bounds[1])
+
+            kernel = gaussian_kernel(shape=patch_shape, sigma=sigma, mu=mu)
+
+            kernel_dims.append(kernel.shape)
+            kernel_library.append(kernel)
+            kernel_params.append({"shape": patch_shape, "sigma": sigma, "mu": mu})
+
+        return kernel_library, kernel_dims, kernel_params
diff --git a/treeple/tree/manifold/_morf_splitter.pxd b/treeple/tree/manifold/_morf_splitter.pxd
index a0a61a4de..ae5135980 100644
--- a/treeple/tree/manifold/_morf_splitter.pxd
+++ b/treeple/tree/manifold/_morf_splitter.pxd
@@ -73,9 +73,14 @@ cdef class PatchSplitter(BestObliqueSplitter):
     ) noexcept nogil
 
 
-# cdef class UserKernelSplitter(PatchSplitter):
-#     """A class to hold user-specified kernels."""
-#     cdef vector[float32_t[:, ::1]] kernel_dictionary  # A list of C-contiguous 2D kernels
+ctypedef float32_t* float32_t_ptr
+ctypedef intp_t* inpt_t_ptr
+
+cdef class UserKernelSplitter(PatchSplitter):
+    """A class to hold user-specified kernels."""
+    # cdef vector[float32_t[:, ::1]] kernel_dictionary  # A list of C-contiguous 2D kernels
+    cdef vector[float32_t*] kernel_dictionary  # A list of C-contiguous 2D kernels
+    cdef vector[intp_t*] kernel_dims  # A list of arrays storing the dimensions of each kernel in `kernel_dictionary`
 
 
 cdef class GaussianKernelSplitter(PatchSplitter):
diff --git a/treeple/tree/manifold/_morf_splitter.pyx b/treeple/tree/manifold/_morf_splitter.pyx
index 414062a5e..2cd3f5c4c 100644
--- a/treeple/tree/manifold/_morf_splitter.pyx
+++ b/treeple/tree/manifold/_morf_splitter.pyx
@@ -521,3 +521,112 @@ cdef class BestPatchSplitterTester(BestPatchSplitter):
             Whether or not a feature has missing values. 
""" self.init(X, y, sample_weight, missing_values_in_feature_mask) + + +cdef class UserKernelSplitter(PatchSplitter): + def __cinit__( + self, + criterion: Criterion, + max_features: inpt_t, + min_samples_leaf: inpt_t, + min_weight_leaf: double, + random_state: object, + min_patch_dims: inpt_t, + max_patch_dims: inpt_t, + dim_contiguous: cnp.uint8_t, + data_dims: inpt_t, + boundary: str, + feature_weight: float32_t, + kernel_dictionary: object, + kernel_dims: object, + *argv + ): + # initialize the kernel dictionary into a vector to allow Cython to see it + # see: https://stackoverflow.com/questions/46240207/passing-list-of-numpy-arrays-to-c-using-cython + cdef int n_arrays = len(kernel_dictionary) + self.kernel_dictionary = vector[float32_t_ptr](n_arrays) # A list of C-contiguous 2D kernels + self.kernel_dims = vector[inpt_t_ptr](n_arrays) # A list of arrays storing the dimensions of each kernel in `kernel_dictionary` + + # buffers to point to each element in the list + cdef float32_t[:] kernel + cdef inpt_t[:] kernel_dim + + cdef int i + for i in range(n_arrays): + kernel = kernel_dictionary[i] + kernel_dim = kernel_dims[i] + + # store a pointer to the data + self.kernel_dictionary.push_back(&kernel[0]) + self.kernel_dims.push_back(&kernel_dim[0]) + + cdef void sample_proj_mat( + self, + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[inpt_t]]& proj_mat_indices + ) noexcept nogil: + """Sample projection matrix using a contiguous patch. + Randomly sample patches with weight of 1. + """ + cdef inpt_t max_features = self.max_features + cdef int proj_i + + # define parameters for vectorized points in the original data shape + # and top-left seed + cdef inpt_t top_left_patch_seed + + # size of the sampled patch, which is just the size of the n-dim patch + # (\prod_i self.patch_dims_buff[i]) + cdef inpt_t patch_size + + cdef float32_t[:] kernel + cdef inpt_t[:] kernel_dim + + for proj_i in range(0, max_features): + # now get the top-left seed that is used to then determine the top-left + # position in patch + # compute top-left seed for the multi-dimensional patch + top_left_patch_seed, patch_size = self.sample_top_left_seed() + + # sample a random index in the kernel library + # kernel_idx = + + # get that kernel and add it to the projection vector indices and weights + kernel = self.kernel_dictionary[kernel_idx] + kernel_dim = self.kernel_dims[kernel_idx] + + # convert top-left-patch-seed to unraveled indices + # get the top-left index in the original data + top_left_idx = self.unravel_index(top_left_patch_seed, self.data_dims_buff, self.ndim) + + # loop over the kernel and add the weights and indices to the projection + for idim in range(self.ndim): + # get the dimension of the kernel + kernel_dim = self.kernel_dims[kernel_idx][idim] + + # get the top-left index in the kernel + top_left_kernel_idx = self.unravel_index(top_left_patch_seed, kernel_dim, self.ndim) + + # loop over the kernel and add the weights and indices to the projection + # for i in range(kernel_dim): + # # get the index in the original data + # idx = self.ravel_multi_index(top_left_idx, self.data_dims_buff, self.ndim) + + # # get the index in the kernel + # kernel_idx = self.ravel_multi_index(top_left_kernel_idx, kernel_dim, self.ndim) + + # # add the weight and index to the projection matrix + # proj_mat_weights[proj_i].push_back(kernel[kernel_idx]) + # proj_mat_indices[proj_i].push_back(idx) + + # # increment the top-left index in the original data + # top_left_idx[idim] += 1 + + # # increment the top-left 
index in the kernel + # top_left_kernel_idx[idim] += 1 + + # # increment the top-left index in the original data + # top_left_idx[idim] += self.patch_dims_buff[idim] - kernel_dim + + # # increment the top-left index in the kernel + # top_left_kernel_idx[idim] += self.patch_dims_buff[idim] - kernel_dim From a365c2923d67c6b36b037da34cc0e240cb22790d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:08:46 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/manifold/plot_kernel_decision_tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/manifold/plot_kernel_decision_tree.py b/examples/manifold/plot_kernel_decision_tree.py index 954aef7be..35abb9740 100644 --- a/examples/manifold/plot_kernel_decision_tree.py +++ b/examples/manifold/plot_kernel_decision_tree.py @@ -15,11 +15,11 @@ See :ref:`sphx_glr_auto_examples_plot_projection_matrices` for more information on projection matrices and the way they can be sampled. """ + import matplotlib.pyplot as plt # %% # Importing the necessary modules -import numpy as np from sklearn.datasets import fetch_openml from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split @@ -103,4 +103,4 @@ for i, kernel in enumerate(important_kernels): axes[i].imshow(kernel, cmap="gray") axes[i].set_title("Kernel {}".format(i + 1)) -plt.show() \ No newline at end of file +plt.show() From ea97670f7417a84e23554e3d4d3b282466bb632b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 24 Jul 2024 10:05:33 -0400 Subject: [PATCH 3/7] Adding example Signed-off-by: Adam Li --- examples/manifold/plot_kernel_decision_tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/manifold/plot_kernel_decision_tree.py b/examples/manifold/plot_kernel_decision_tree.py index 954aef7be..35abb9740 100644 --- a/examples/manifold/plot_kernel_decision_tree.py +++ b/examples/manifold/plot_kernel_decision_tree.py @@ -15,11 +15,11 @@ See :ref:`sphx_glr_auto_examples_plot_projection_matrices` for more information on projection matrices and the way they can be sampled. 
""" + import matplotlib.pyplot as plt # %% # Importing the necessary modules -import numpy as np from sklearn.datasets import fetch_openml from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split @@ -103,4 +103,4 @@ for i, kernel in enumerate(important_kernels): axes[i].imshow(kernel, cmap="gray") axes[i].set_title("Kernel {}".format(i + 1)) -plt.show() \ No newline at end of file +plt.show() From 7d74b36bb046e8fedd5342edf98f3b0270dd3c42 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 9 Aug 2024 12:56:42 -0400 Subject: [PATCH 4/7] Refactoring to make it easier to understand Signed-off-by: Adam Li --- pyproject.toml | 1 + treeple/tree/__init__.py | 2 +- treeple/tree/_classes.py | 503 +------------------------------------- treeple/tree/_kernel.py | 513 +++++++++++++++++++++++++++++++++++++++ treeple/tree/kernels.py | 12 - treeple/tree/meson.build | 1 + 6 files changed, 519 insertions(+), 513 deletions(-) create mode 100644 treeple/tree/_kernel.py delete mode 100644 treeple/tree/kernels.py diff --git a/pyproject.toml b/pyproject.toml index 596d2408b..fe7ba4181 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -195,6 +195,7 @@ ignore_missing_imports = true no_site_packages = true exclude = [ 'treeple/_lib/', + 'treeple/_lib/*', 'benchmarks_nonasv/' ] diff --git a/treeple/tree/__init__.py b/treeple/tree/__init__.py index d6a4beab7..a7425c213 100644 --- a/treeple/tree/__init__.py +++ b/treeple/tree/__init__.py @@ -7,7 +7,6 @@ from ._classes import ( ExtraObliqueDecisionTreeClassifier, ExtraObliqueDecisionTreeRegressor, - KernelDecisionTreeClassifier, ObliqueDecisionTreeClassifier, ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeClassifier, @@ -16,6 +15,7 @@ UnsupervisedObliqueDecisionTree, ) from ._honest_tree import HonestTreeClassifier +from ._kernel import KernelDecisionTreeClassifier from ._multiview import MultiViewDecisionTreeClassifier from ._neighbors import compute_forest_similarity_matrix diff --git a/treeple/tree/_classes.py b/treeple/tree/_classes.py index 76a9052a2..1ad36a8ee 100644 --- a/treeple/tree/_classes.py +++ b/treeple/tree/_classes.py @@ -23,7 +23,6 @@ from ._neighbors import SimMatrixMixin from ._oblique_splitter import ObliqueSplitter from ._oblique_tree import ObliqueTree -from .kernels import gaussian_kernel from .manifold import _morf_splitter from .manifold._morf_splitter import PatchSplitter from .unsupervised import _unsup_criterion, _unsup_oblique_splitter, _unsup_splitter @@ -1284,7 +1283,7 @@ class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): tree_type = "oblique" - _parameter_constraints = { + _parameter_constraints: dict = { **DecisionTreeRegressor._parameter_constraints, "feature_combinations": [ Interval(Real, 1.0, None, closed="left"), @@ -1685,7 +1684,7 @@ class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier) """ tree_type = "oblique" - _parameter_constraints = { + _parameter_constraints: dict = { **DecisionTreeClassifier._parameter_constraints, "min_patch_dims": ["array-like", None], "max_patch_dims": ["array-like", None], @@ -2167,7 +2166,7 @@ class PatchObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """ tree_type = "oblique" - _parameter_constraints = { + _parameter_constraints: dict = { **DecisionTreeRegressor._parameter_constraints, "min_patch_dims": ["array-like", None], "max_patch_dims": ["array-like", None], @@ -3238,499 +3237,3 @@ def _build_tree( builder.build(self.tree_, X, y, sample_weight) return self - - -class 
KernelDecisionTreeClassifier(PatchObliqueDecisionTreeClassifier): - """Oblique decision tree classifier over data patches combined with Gaussian kernels. - Parameters - ---------- - criterion : {"gini", "entropy"}, default="gini" - The function to measure the quality of a split. Supported criteria are - "gini" for the Gini impurity and "entropy" for the information gain. - splitter : {"best", "random"}, default="best" - The strategy used to choose the split at each node. Supported - strategies are "best" to choose the best split and "random" to choose - the best random split. - max_depth : int, default=None - The maximum depth of the tree. If None, then nodes are expanded until - all leaves are pure or until all leaves contain less than - min_samples_split samples. - min_samples_split : int or float, default=2 - The minimum number of samples required to split an internal node: - - If int, then consider `min_samples_split` as the minimum number. - - If float, then `min_samples_split` is a fraction and - `ceil(min_samples_split * n_samples)` are the minimum - number of samples for each split. - min_samples_leaf : int or float, default=1 - The minimum number of samples required to be at a leaf node. - A split point at any depth will only be considered if it leaves at - least ``min_samples_leaf`` training samples in each of the left and - right branches. This may have the effect of smoothing the model, - especially in regression. - - If int, then consider `min_samples_leaf` as the minimum number. - - If float, then `min_samples_leaf` is a fraction and - `ceil(min_samples_leaf * n_samples)` are the minimum - number of samples for each node. - min_weight_fraction_leaf : float, default=0.0 - The minimum weighted fraction of the sum total of weights (of all - the input samples) required to be at a leaf node. Samples have - equal weight when sample_weight is not provided. - max_features : int, float or {"auto", "sqrt", "log2"}, default=None - The number of features to consider when looking for the best split: - - If int, then consider `max_features` features at each split. - - If float, then `max_features` is a fraction and - `int(max_features * n_features)` features are considered at each - split. - - If "auto", then `max_features=sqrt(n_features)`. - - If "sqrt", then `max_features=sqrt(n_features)`. - - If "log2", then `max_features=log2(n_features)`. - - If None, then `max_features=n_features`. - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires to - effectively inspect more than ``max_features`` features. - Note: Compared to axis-aligned Random Forests, one can set - max_features to a number greater then ``n_features``. - random_state : int, RandomState instance or None, default=None - Controls the randomness of the estimator. The features are always - randomly permuted at each split, even if ``splitter`` is set to - ``"best"``. When ``max_features < n_features``, the algorithm will - select ``max_features`` at random at each split before finding the best - split among them. But the best found split may vary across different - runs, even if ``max_features=n_features``. That is the case, if the - improvement of the criterion is identical for several splits and one - split has to be selected at random. To obtain a deterministic behaviour - during fitting, ``random_state`` has to be fixed to an integer. - See :term:`Glossary ` for details. 
- max_leaf_nodes : int, default=None - Grow a tree with ``max_leaf_nodes`` in best-first fashion. - Best nodes are defined as relative reduction in impurity. - If None then unlimited number of leaf nodes. - min_impurity_decrease : float, default=0.0 - A node will be split if this split induces a decrease of the impurity - greater than or equal to this value. - The weighted impurity decrease equation is the following:: - N_t / N * (impurity - N_t_R / N_t * right_impurity - - N_t_L / N_t * left_impurity) - where ``N`` is the total number of samples, ``N_t`` is the number of - samples at the current node, ``N_t_L`` is the number of samples in the - left child, and ``N_t_R`` is the number of samples in the right child. - ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, - if ``sample_weight`` is passed. - class_weight : dict, list of dict or "balanced", default=None - Weights associated with classes in the form ``{class_label: weight}``. - If None, all classes are supposed to have weight one. For - multi-output problems, a list of dicts can be provided in the same - order as the columns of y. - Note that for multioutput (including multilabel) weights should be - defined for each class of every column in its own dict. For example, - for four-class multilabel classification weights should be - [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of - [{1:1}, {2:5}, {3:1}, {4:1}]. - The "balanced" mode uses the values of y to automatically adjust - weights inversely proportional to class frequencies in the input data - as ``n_samples / (n_classes * np.bincount(y))`` - For multi-output, the weights of each column of y will be multiplied. - Note that these weights will be multiplied with sample_weight (passed - through the fit method) if sample_weight is specified. - min_patch_dim : array-like, optional - The minimum dimensions of a patch, by default 1 along all dimensions. - max_patch_dim : array-like, optional - The maximum dimensions of a patch, by default 1 along all dimensions. - dim_contiguous : array-like of bool, optional - Whether or not each patch is sampled contiguously along this dimension. - data_dims : array-like, optional - The presumed dimensions of the un-vectorized feature vector, by default - will be a 1D vector with (1, n_features) shape. - boundary : optional, str {'wrap'} - The boundary condition to use when sampling patches, by default None. - 'wrap' corresponds to the boundary condition as is in numpy and scipy. - feature_weight : array-like of shape (n_samples,n_features,), default=None - Feature weights. If None, then features are equally weighted as is. - If provided, then the feature weights are used to weight the - patches that are generated. The feature weights are used - as follows: for every patch that is sampled, the feature weights over - the entire patch is summed and normalizes the patch. - kernel : str {'gaussian', 'uniform'}, default='gaussian' - The kernel to use. - n_kernels : int, optional - The number of different kernels to generate. This number should be very high - as this generates kernels - Attributes - ---------- - classes_ : ndarray of shape (n_classes,) or list of ndarray - The classes labels (single output problem), - or a list of arrays of class labels (multi-output problem). - feature_importances_ : ndarray of shape (n_features,) - The impurity-based feature importances. - The higher, the more important the feature. 
- The importance of a feature is computed as the (normalized) - total reduction of the criterion brought by that feature. It is also - known as the Gini importance [4]_. - Warning: impurity-based feature importances can be misleading for - high cardinality features (many unique values). See - :func:`sklearn.inspection.permutation_importance` as an alternative. - max_features_ : int - The inferred value of max_features. - n_classes_ : int or list of int - The number of classes (for single output problems), - or a list containing the number of classes for each - output (for multi-output problems). - n_features_in_ : int - Number of features seen during :term:`fit`. - feature_names_in_ : ndarray of shape (`n_features_in_`,) - Names of features seen during :term:`fit`. Defined only when `X` - has feature names that are all strings. - n_outputs_ : int - The number of outputs when ``fit`` is performed. - tree_ : Tree instance - The underlying Tree object. Please refer to - ``help(sklearn.tree._tree.Tree)`` for - attributes of Tree object. - min_patch_dims_ : array-like - The minimum dimensions of a patch. - max_patch_dims_ : array-like - The maximum dimensions of a patch. - data_dims_ : array-like - The presumed dimensions of the un-vectorized feature vector. - kernel_arr_ : list of length (n_nodes,) of array-like of shape (patch_dims,) - The kernel array that is applied to the patches for this tree. - The order is in the same order in which the tree traverses the nodes. - kernel_params_ : list of length (n_nodes,) - The parameters of the kernel that is applied to the patches for this tree. - The order is in the same order in which the tree traverses the nodes. - kernel_library_ : array-like of shape (n_kernels,), optional - The library of kernels that was chosen from the patches for this tree. - Only stored if ``store_kernel_library`` is set to True. - kernel_library_params_ : list of length (n_nodes,), optional - The parameters of the kernels that was chosen from the patches for this tree. - The order is in the same order in which the tree traverses the nodes. - Only stored if ``store_kernel_library`` is set to True. - References - ---------- - .. 
footbibliography:: - """ - - _parameter_constraints = { - **DecisionTreeClassifier._parameter_constraints, - "min_patch_dims": ["array-like", None], - "max_patch_dims": ["array-like", None], - "data_dims": ["array-like", None], - "dim_contiguous": ["array-like", None], - "boundary": [str, None], - "feature_weight": ["array-like", None], - "kernel": ["str"], - "n_kernels": [int, None], - "store_kernel_library": [bool], - } - - def __init__( - self, - *, - criterion="gini", - splitter="best", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0.0, - max_features=None, - random_state=None, - max_leaf_nodes=None, - min_impurity_decrease=0.0, - class_weight=None, - min_patch_dims=None, - max_patch_dims=None, - dim_contiguous=None, - data_dims=None, - boundary=None, - feature_weight=None, - kernel="gaussian", - n_kernels=None, - store_kernel_library=False, - ): - super().__init__( - criterion=criterion, - splitter=splitter, - max_depth=max_depth, - min_samples_split=min_samples_split, - min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, - max_features=max_features, - max_leaf_nodes=max_leaf_nodes, - class_weight=class_weight, - random_state=random_state, - min_impurity_decrease=min_impurity_decrease, - ) - - self.min_patch_dims = min_patch_dims - self.max_patch_dims = max_patch_dims - self.dim_contiguous = dim_contiguous - self.data_dims = data_dims - self.boundary = boundary - self.feature_weight = feature_weight - - self.kernel = kernel - self.n_kernel = n_kernels - self.store_kernel_library = store_kernel_library - - def _build_tree( - self, - X, - y, - sample_weight, - min_samples_leaf, - min_weight_leaf, - max_leaf_nodes, - min_samples_split, - max_depth, - random_state, - ): - """Build the actual tree. - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The training input samples. Internally, it will be converted to - ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csc_matrix``. - y : array-like of shape (n_samples,) or (n_samples, n_outputs) - The target values (class labels) as integers or strings. - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. If None, then samples are equally weighted. Splits - that would create child nodes with net zero or negative weight are - ignored while searching for a split in each node. Splits are also - ignored if they would result in any single class carrying a - negative weight in either child node. - min_samples_leaf : int or float - The minimum number of samples required to be at a leaf node. - min_weight_leaf : float, default=0.0 - The minimum weighted fraction of the sum total of weights. - max_leaf_nodes : int, default=None - Grow a tree with ``max_leaf_nodes`` in best-first fashion. - min_samples_split : int or float, default=2 - The minimum number of samples required to split an internal node: - max_depth : int, default=None - The maximum depth of the tree. If None, then nodes are expanded until - all leaves are pure or until all leaves contain less than - min_samples_split samples. - random_state : int, RandomState instance or None, default=None - Controls the randomness of the estimator. 
- """ - # Build tree - criterion = self.criterion - if not isinstance(criterion, BaseCriterion): - criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) - else: - # Make a deepcopy in case the criterion has mutable attributes that - # might be shared and modified concurrently during parallel fitting - criterion = copy.deepcopy(criterion) - - splitter = self.splitter - if issparse(X): - raise ValueError( - "Sparse input is not supported for oblique trees. " - "Please convert your data to a dense array." - ) - else: - PATCH_SPLITTERS = PATCH_DENSE_SPLITTERS - - # compute user-defined Kernel library before - kernel_library, kernel_dims, kernel_params = self._sample_kernel_library(X, y) - - # Note: users cannot define a splitter themselves - splitter = PATCH_SPLITTERS[self.splitter]( - criterion, - self.max_features_, - min_samples_leaf, - min_weight_leaf, - random_state, - self.min_patch_dims_, - self.max_patch_dims_, - self.dim_contiguous_, - self.data_dims_, - self.boundary, - self.feature_weight, - kernel_library, - ) - - self.tree_ = ObliqueTree(self.n_features_in_, self.n_classes_, self.n_outputs_) - - # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise - if max_leaf_nodes < 0: - builder = DepthFirstTreeBuilder( - splitter, - min_samples_split, - min_samples_leaf, - min_weight_leaf, - max_depth, - self.min_impurity_decrease, - ) - else: - builder = BestFirstTreeBuilder( - splitter, - min_samples_split, - min_samples_leaf, - min_weight_leaf, - max_depth, - max_leaf_nodes, - self.min_impurity_decrease, - ) - - builder.build(self.tree_, X, y, sample_weight) - - # Set some attributes based on what kernel indices were used in each tree - # - construct a tree-nodes like array and set it as a Python attribute, so it is - # exposed to the Python interface - # - use the Cython tree.feature array to store the index of the dictionary library - # that was used to split at each node - kernel_idx_chosen = self.tree_.feature - kernel_lib_chosen = kernel_library[kernel_idx_chosen] - kernel_params_chosen = kernel_params[kernel_idx_chosen] - self.kernel_arr_ = kernel_lib_chosen - self.kernel_dims_ = kernel_dims[kernel_idx_chosen] - self.kernel_params_ = kernel_params_chosen - - if self.store_kernel_library: - self.kernel_library_ = kernel_library - self.kernel_idx_chosen_ = kernel_idx_chosen - - if self.n_outputs_ == 1: - self.n_classes_ = self.n_classes_[0] - self.classes_ = self.classes_[0] - - def _sample_kernel_library(self, X, y, sample_weight): - """Samples the dictionary library that is sampled from in the random forest. - A kernel can either be applied with the boundaries of the image in mind, such that - the patch can be uniformly applied across all indices of rows of ``X_structured``. This - is equivalent to passing in ``boundary = 'wrap'``. Alternatively, a kernel can be sampled, - such that the patch is always contained within the boundary of the image. This is equivalent - to passing in ``boundary = None``. - Parameters - ---------- - X : _type_ - _description_ - y : _type_ - _description_ - sample_weight : _type_ - _description_ - Returns - ------- - kernel_library : array-like of shape (n_kernels, n_features) - The dictionary that will be sampled from the random forest. Non-zero entries indicate - where the kernel is applied. This is a n-dimensional kernel matrix that is vectorized - and placed into the original ``X`` data dimensions of (n_dims,). - kernel_dims : list of (n_kernels,) length - The dimensions of each kernel that is sampled. 
For example, (n_dims_x, n_dims_y) in - the first element indicates that the first kernel has a shape of (n_dims_x, n_dims_y). - This can be used to then place where the top-left seed of each kernel is applied. - kernel_params : list of (n_kernels,) length - A list of dictionaries representing the parameters of each kernel. This is used to - keep track of what kernels and parameterizations were valuable when used in the random - forest. - """ - raise NotImplementedError("This method should be implemented in a child class.") - - -class GaussianKernelDecisionTreeClassifier(KernelDecisionTreeClassifier): - _parameter_constraints = { - **KernelDecisionTreeClassifier._parameter_constraints, - # "mu_bounds": ['array-like'], - # "sigma_bounds": ['array-like'], - } - - def __init__( - self, - *, - criterion="gini", - splitter="best", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0, - max_features=None, - random_state=None, - max_leaf_nodes=None, - min_impurity_decrease=0, - class_weight=None, - min_patch_dims=None, - max_patch_dims=None, - dim_contiguous=None, - data_dims=None, - boundary=None, - feature_weight=None, - n_kernels=None, - store_kernel_library=False, - mu_bounds=(0, 1), - sigma_bounds=(0, 1), - ): - super().__init__( - criterion=criterion, - splitter=splitter, - max_depth=max_depth, - min_samples_split=min_samples_split, - min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, - max_features=max_features, - random_state=random_state, - max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, - class_weight=class_weight, - min_patch_dims=min_patch_dims, - max_patch_dims=max_patch_dims, - dim_contiguous=dim_contiguous, - data_dims=data_dims, - boundary=boundary, - feature_weight=feature_weight, - kernel="gaussian", - n_kernels=n_kernels, - store_kernel_library=store_kernel_library, - ) - self.mu_bounds = mu_bounds - self.sigma_bounds = sigma_bounds - - def _sample_kernel_library(self, X, y, sample_weight): - """Samples a gaussian kernel library.""" - rng = np.random.default_rng(self.random_state) - kernel_library = [] - kernel_params = [] - kernel_dims = [] - - # Sample the kernel library - ndim = len(self.data_dims_) - for _ in range(self.n_kernels): - patch_shape = [] - for idx in range(ndim): - # Note: By constraining max patch height/width to be at least the min - # patch height/width this ensures that the minimum value of - # patch_height and patch_width is 1 - patch_dim = rng.randint(self.min_patch_dims_[idx], self.max_patch_dims_[idx] + 1) - - # sample the possible patch shape given input parameters - if self.boundary == "wrap": - # add circular boundary conditions - delta_patch_dim = self.data_dims_[idx] + 2 * (patch_dim - 1) - - # sample the top left index for this dimension - top_left_patch_seed = rng.randint(0, delta_patch_dim) - - # resample the patch dimension due to padding - dim = top_left_patch_seed % delta_patch_dim - - # resample the patch dimension due to padding - patch_dim = min( - patch_dim, min(dim + 1, self.data_dims_[idx] + patch_dim - dim - 1) - ) - - patch_shape.append(patch_dim) - - patch_shape = np.array(patch_shape) - - # sample the sigma and mu parameters - sigma = rng.uniform(low=self.mu_bounds[0], high=self.mu_bounds[1]) - mu = rng.uniform(low=self.sigma_bounds[0], high=self.sigma_bounds[1]) - - kernel = gaussian_kernel(shape=patch_shape, sigma=sigma, mu=mu) - - kernel_dims.append(kernel.shape) - kernel_library.append(kernel) - kernel_params.append({"shape": 
patch_shape, "sigma": sigma, "mu": mu}) - - return kernel_library, kernel_dims, kernel_params diff --git a/treeple/tree/_kernel.py b/treeple/tree/_kernel.py new file mode 100644 index 000000000..d46448f70 --- /dev/null +++ b/treeple/tree/_kernel.py @@ -0,0 +1,513 @@ +import copy + +import numpy as np +from scipy.sparse import issparse + +from .._lib.sklearn.tree._criterion import BaseCriterion +from .._lib.sklearn.tree._tree import BestFirstTreeBuilder, DepthFirstTreeBuilder +from ._classes import CRITERIA_CLF, PATCH_DENSE_SPLITTERS, PatchObliqueDecisionTreeClassifier +from ._oblique_tree import ObliqueTree + + +def gaussian_kernel(shape, sigma=1.0, mu=0.0): + """N-dimensional gaussian kernel for the given shape. + + See: https://gist.github.com/liob/e784775e882b83749cb3bbcef480576e + """ + m = np.meshgrid(*[np.linspace(-1, 1, s) for s in shape]) + d = np.sqrt(np.sum([x * x for x in m], axis=0)) + g = np.exp(-((d - mu) ** 2 / (2.0 * sigma**2))) + return g / np.sum(g) + + +class KernelDecisionTreeClassifier(PatchObliqueDecisionTreeClassifier): + """Oblique decision tree classifier over data patches combined with Gaussian kernels. + + Parameters + ---------- + criterion : {"gini", "entropy"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "entropy" for the information gain. + splitter : {"best", "random"}, default="best" + The strategy used to choose the split at each node. Supported + strategies are "best" to choose the best split and "random" to choose + the best random split. + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + max_features : int, float or {"auto", "sqrt", "log2"}, default=None + The number of features to consider when looking for the best split: + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `int(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. 
+ Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + Note: Compared to axis-aligned Random Forests, one can set + max_features to a number greater then ``n_features``. + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. The features are always + randomly permuted at each split, even if ``splitter`` is set to + ``"best"``. When ``max_features < n_features``, the algorithm will + select ``max_features`` at random at each split before finding the best + split among them. But the best found split may vary across different + runs, even if ``max_features=n_features``. That is the case, if the + improvement of the criterion is identical for several splits and one + split has to be selected at random. To obtain a deterministic behaviour + during fitting, ``random_state`` has to be fixed to an integer. + See :term:`Glossary ` for details. + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + The weighted impurity decrease equation is the following:: + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + class_weight : dict, list of dict or "balanced", default=None + Weights associated with classes in the form ``{class_label: weight}``. + If None, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + For multi-output, the weights of each column of y will be multiplied. + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + min_patch_dim : array-like, optional + The minimum dimensions of a patch, by default 1 along all dimensions. + max_patch_dim : array-like, optional + The maximum dimensions of a patch, by default 1 along all dimensions. + dim_contiguous : array-like of bool, optional + Whether or not each patch is sampled contiguously along this dimension. + data_dims : array-like, optional + The presumed dimensions of the un-vectorized feature vector, by default + will be a 1D vector with (1, n_features) shape. + boundary : optional, str {'wrap'} + The boundary condition to use when sampling patches, by default None. + 'wrap' corresponds to the boundary condition as is in numpy and scipy. 
+    feature_weight : array-like of shape (n_samples,n_features,), default=None
+        Feature weights. If None, then features are equally weighted.
+        If provided, then the feature weights are used to weight the
+        patches that are generated. The feature weights are used
+        as follows: for every patch that is sampled, the feature weights over
+        the entire patch are summed and used to normalize the patch.
+    kernel : str {'gaussian', 'uniform'}, default='gaussian'
+        The kernel to use.
+    n_kernels : int, optional
+        The number of different kernels to generate. This number should be large,
+        since it determines the size of the kernel library that each split
+        samples from.
+    store_kernel_library : bool, default=False
+        Whether to store the sampled kernel library and the index of the kernel
+        chosen at each node on the fitted estimator.
+    Attributes
+    ----------
+    classes_ : ndarray of shape (n_classes,) or list of ndarray
+        The classes labels (single output problem),
+        or a list of arrays of class labels (multi-output problem).
+    feature_importances_ : ndarray of shape (n_features,)
+        The impurity-based feature importances.
+        The higher, the more important the feature.
+        The importance of a feature is computed as the (normalized)
+        total reduction of the criterion brought by that feature. It is also
+        known as the Gini importance [4]_.
+        Warning: impurity-based feature importances can be misleading for
+        high cardinality features (many unique values). See
+        :func:`sklearn.inspection.permutation_importance` as an alternative.
+    max_features_ : int
+        The inferred value of max_features.
+    n_classes_ : int or list of int
+        The number of classes (for single output problems),
+        or a list containing the number of classes for each
+        output (for multi-output problems).
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+    n_outputs_ : int
+        The number of outputs when ``fit`` is performed.
+    tree_ : Tree instance
+        The underlying Tree object. Please refer to
+        ``help(sklearn.tree._tree.Tree)`` for
+        attributes of Tree object.
+    min_patch_dims_ : array-like
+        The minimum dimensions of a patch.
+    max_patch_dims_ : array-like
+        The maximum dimensions of a patch.
+    data_dims_ : array-like
+        The presumed dimensions of the un-vectorized feature vector.
+    kernel_arr_ : list of length (n_nodes,) of array-like of shape (patch_dims,)
+        The kernel array that is applied to the patches for this tree.
+        The order is in the same order in which the tree traverses the nodes.
+    kernel_params_ : list of length (n_nodes,)
+        The parameters of the kernel that is applied to the patches for this tree.
+        The order is in the same order in which the tree traverses the nodes.
+    kernel_library_ : array-like of shape (n_kernels,), optional
+        The library of kernels that was sampled for this tree.
+        Only stored if ``store_kernel_library`` is set to True.
+    kernel_library_params_ : list of length (n_nodes,), optional
+        The parameters of the kernels that were chosen from the patches for this tree.
+        The order is in the same order in which the tree traverses the nodes.
+        Only stored if ``store_kernel_library`` is set to True.
+    References
+    ----------
+    ..
footbibliography:: + """ + + _parameter_constraints: dict = {**PatchObliqueDecisionTreeClassifier._parameter_constraints} + _parameter_constraints.update( + { + "kernel": ["str"], + "n_kernels": [int, None], + "store_kernel_library": [bool], + } + ) + + def __init__( + self, + *, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + class_weight=None, + min_patch_dims=None, + max_patch_dims=None, + dim_contiguous=None, + data_dims=None, + boundary=None, + feature_weight=None, + kernel="gaussian", + n_kernels=None, + store_kernel_library=False, + ): + super().__init__( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + max_leaf_nodes=max_leaf_nodes, + class_weight=class_weight, + random_state=random_state, + min_impurity_decrease=min_impurity_decrease, + ) + + self.min_patch_dims = min_patch_dims + self.max_patch_dims = max_patch_dims + self.dim_contiguous = dim_contiguous + self.data_dims = data_dims + self.boundary = boundary + self.feature_weight = feature_weight + + self.kernel = kernel + self.n_kernel = n_kernels + self.store_kernel_library = store_kernel_library + + def _build_tree( + self, + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + min_samples_leaf : int or float + The minimum number of samples required to be at a leaf node. + min_weight_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights. + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. 
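+
+        Notes
+        -----
+        Before the splitter is constructed, a kernel library is sampled via
+        ``self._sample_kernel_library`` and the index of the kernel chosen at
+        each node is stored in ``tree_.feature``. A rough sketch of recovering
+        the chosen kernels after fitting, assuming ``store_kernel_library=True``
+        and that leaf nodes carry a negative feature index as in scikit-learn
+        trees::
+
+            chosen_idx = [i for i in clf.tree_.feature if i >= 0]
+            chosen_kernels = [clf.kernel_library_[i] for i in chosen_idx]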
+ """ + # Build tree + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + splitter = self.splitter + if issparse(X): + raise ValueError( + "Sparse input is not supported for oblique trees. " + "Please convert your data to a dense array." + ) + else: + PATCH_SPLITTERS = PATCH_DENSE_SPLITTERS + + # compute user-defined Kernel library before + kernel_library, kernel_dims, kernel_params = self._sample_kernel_library(X, y) + + # Note: users cannot define a splitter themselves + splitter = PATCH_SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + self.min_patch_dims_, + self.max_patch_dims_, + self.dim_contiguous_, + self.data_dims_, + self.boundary, + self.feature_weight, + kernel_library, + ) + + self.tree_ = ObliqueTree(self.n_features_in_, self.n_classes_, self.n_outputs_) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + ) + else: + builder = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + ) + + builder.build(self.tree_, X, y, sample_weight) + + # Set some attributes based on what kernel indices were used in each tree + # - construct a tree-nodes like array and set it as a Python attribute, so it is + # exposed to the Python interface + # - use the Cython tree.feature array to store the index of the dictionary library + # that was used to split at each node + kernel_idx_chosen = self.tree_.feature + kernel_lib_chosen = kernel_library[kernel_idx_chosen] + kernel_params_chosen = kernel_params[kernel_idx_chosen] + self.kernel_arr_ = kernel_lib_chosen + self.kernel_dims_ = kernel_dims[kernel_idx_chosen] + self.kernel_params_ = kernel_params_chosen + + if self.store_kernel_library: + self.kernel_library_ = kernel_library + self.kernel_idx_chosen_ = kernel_idx_chosen + + if self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + + def _sample_kernel_library(self, X, y, sample_weight): + """Samples the dictionary library that is sampled from in the random forest. + A kernel can either be applied with the boundaries of the image in mind, such that + the patch can be uniformly applied across all indices of rows of ``X_structured``. This + is equivalent to passing in ``boundary = 'wrap'``. Alternatively, a kernel can be sampled, + such that the patch is always contained within the boundary of the image. This is equivalent + to passing in ``boundary = None``. + Parameters + ---------- + X : _type_ + _description_ + y : _type_ + _description_ + sample_weight : _type_ + _description_ + Returns + ------- + kernel_library : array-like of shape (n_kernels, n_features) + The dictionary that will be sampled from the random forest. Non-zero entries indicate + where the kernel is applied. This is a n-dimensional kernel matrix that is vectorized + and placed into the original ``X`` data dimensions of (n_dims,). + kernel_dims : list of (n_kernels,) length + The dimensions of each kernel that is sampled. 
For example, (n_dims_x, n_dims_y) in + the first element indicates that the first kernel has a shape of (n_dims_x, n_dims_y). + This can be used to then place where the top-left seed of each kernel is applied. + kernel_params : list of (n_kernels,) length + A list of dictionaries representing the parameters of each kernel. This is used to + keep track of what kernels and parameterizations were valuable when used in the random + forest. + """ + raise NotImplementedError("This method should be implemented in a child class.") + + +class GaussianKernelDecisionTreeClassifier(KernelDecisionTreeClassifier): + _parameter_constraints = { + **KernelDecisionTreeClassifier._parameter_constraints, + # "mu_bounds": ['array-like'], + # "sigma_bounds": ['array-like'], + } + + def __init__( + self, + *, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0, + class_weight=None, + min_patch_dims=None, + max_patch_dims=None, + dim_contiguous=None, + data_dims=None, + boundary=None, + feature_weight=None, + n_kernels=None, + store_kernel_library=False, + mu_bounds=(0, 1), + sigma_bounds=(0, 1), + ): + super().__init__( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + random_state=random_state, + max_leaf_nodes=max_leaf_nodes, + min_impurity_decrease=min_impurity_decrease, + class_weight=class_weight, + min_patch_dims=min_patch_dims, + max_patch_dims=max_patch_dims, + dim_contiguous=dim_contiguous, + data_dims=data_dims, + boundary=boundary, + feature_weight=feature_weight, + kernel="gaussian", + n_kernels=n_kernels, + store_kernel_library=store_kernel_library, + ) + self.mu_bounds = mu_bounds + self.sigma_bounds = sigma_bounds + + def _sample_kernel_library(self, X, y, sample_weight): + """Samples a gaussian kernel library.""" + rng = np.random.default_rng(self.random_state) + kernel_library = [] + kernel_params = [] + kernel_dims = [] + + # Sample the kernel library + ndim = len(self.data_dims_) + for _ in range(self.n_kernels): + patch_shape = [] + for idx in range(ndim): + # Note: By constraining max patch height/width to be at least the min + # patch height/width this ensures that the minimum value of + # patch_height and patch_width is 1 + patch_dim = rng.randint(self.min_patch_dims_[idx], self.max_patch_dims_[idx] + 1) + + # sample the possible patch shape given input parameters + if self.boundary == "wrap": + # add circular boundary conditions + delta_patch_dim = self.data_dims_[idx] + 2 * (patch_dim - 1) + + # sample the top left index for this dimension + top_left_patch_seed = rng.randint(0, delta_patch_dim) + + # resample the patch dimension due to padding + dim = top_left_patch_seed % delta_patch_dim + + # resample the patch dimension due to padding + patch_dim = min( + patch_dim, min(dim + 1, self.data_dims_[idx] + patch_dim - dim - 1) + ) + + patch_shape.append(patch_dim) + + patch_shape = np.array(patch_shape) + + # sample the sigma and mu parameters + sigma = rng.uniform(low=self.mu_bounds[0], high=self.mu_bounds[1]) + mu = rng.uniform(low=self.sigma_bounds[0], high=self.sigma_bounds[1]) + + kernel = gaussian_kernel(shape=patch_shape, sigma=sigma, mu=mu) + + kernel_dims.append(kernel.shape) + kernel_library.append(kernel) + kernel_params.append({"shape": 
patch_shape, "sigma": sigma, "mu": mu}) + + return kernel_library, kernel_dims, kernel_params diff --git a/treeple/tree/kernels.py b/treeple/tree/kernels.py deleted file mode 100644 index 6aab826a9..000000000 --- a/treeple/tree/kernels.py +++ /dev/null @@ -1,12 +0,0 @@ -import numpy as np - - -def gaussian_kernel(shape, sigma=1.0, mu=0.0): - """N-dimensional gaussian kernel for the given shape. - - See: https://gist.github.com/liob/e784775e882b83749cb3bbcef480576e - """ - m = np.meshgrid(*[np.linspace(-1, 1, s) for s in shape]) - d = np.sqrt(np.sum([x * x for x in m], axis=0)) - g = np.exp(-((d - mu) ** 2 / (2.0 * sigma**2))) - return g / np.sum(g) diff --git a/treeple/tree/meson.build b/treeple/tree/meson.build index 2febbbc6a..c77266a4c 100644 --- a/treeple/tree/meson.build +++ b/treeple/tree/meson.build @@ -36,6 +36,7 @@ python_sources = [ '_neighbors.py', '_honest_tree.py', '_marginalize.py', + '_kernel.py', ] py.install_sources( From a201ba1b20c8538fd7e204ae1efc8309817abcad Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 28 Aug 2024 11:40:33 -0400 Subject: [PATCH 5/7] new wip Signed-off-by: Adam Li --- examples/splitters/plot_kernel_splitters.py | 77 +++++ treeple/tree/_kernel.py | 84 ++++- treeple/tree/_oblique_splitter.pyx | 2 +- treeple/tree/_utils.pxd | 16 +- treeple/tree/_utils.pyx | 40 ++- treeple/tree/manifold/_kernel_splitter.pxd | 37 +++ treeple/tree/manifold/_kernel_splitter.pyx | 351 ++++++++++++++++++++ treeple/tree/manifold/_morf_splitter.pxd | 25 -- treeple/tree/manifold/_morf_splitter.pyx | 122 +------ treeple/tree/manifold/meson.build | 3 + treeple/tree/tests/meson.build | 1 + treeple/tree/tests/test_kernel.py | 52 +++ 12 files changed, 662 insertions(+), 148 deletions(-) create mode 100644 examples/splitters/plot_kernel_splitters.py create mode 100644 treeple/tree/manifold/_kernel_splitter.pxd create mode 100644 treeple/tree/manifold/_kernel_splitter.pyx create mode 100644 treeple/tree/tests/test_kernel.py diff --git a/examples/splitters/plot_kernel_splitters.py b/examples/splitters/plot_kernel_splitters.py new file mode 100644 index 000000000..d95a388e6 --- /dev/null +++ b/examples/splitters/plot_kernel_splitters.py @@ -0,0 +1,77 @@ +""" +==================== +Random Kernel Splits +==================== + +This example shows how to build a manifold oblique decision tree classifier using +a custom set of user-defined kernel/filter library, such as the Gaussian, or Gabor +kernels. + +The example demonstrates superior performance on a 2D dataset with structured images +as samples. The dataset is the downsampled MNIST dataset, where each sample is a +28x28 image. The dataset is downsampled to 14x14, and then flattened to a 196 +dimensional vector. The dataset is then split into a training and testing set. + +See :ref:`sphx_glr_auto_examples_plot_projection_matrices` for more information on +projection matrices and the way they can be sampled. 
+""" + +import matplotlib.pyplot as plt +import numpy as np +from scipy.sparse import csr_matrix + +from treeple.tree.manifold._kernel_splitter import Kernel2D + +# %% +# Create a synthetic image +image_height, image_width = 50, 50 +image = np.random.rand(image_height, image_width).astype(np.float32) + +# Generate a Gaussian kernel (example) +kernel_size = 7 +x = np.linspace(-2, 2, kernel_size) +y = np.linspace(-2, 2, kernel_size) +x, y = np.meshgrid(x, y) +kernel = np.exp(-(x**2 + y**2)) +kernel = kernel / kernel.sum() # Normalize the kernel + +# Vectorize and create a sparse CSR matrix +kernel_vector = kernel.flatten().astype(np.float32) +kernel_indices = np.arange(kernel_vector.size) +kernel_indptr = np.array([0, kernel_vector.size]) +kernel_csr = csr_matrix( + (kernel_vector, kernel_indices, kernel_indptr), shape=(1, kernel_vector.size) +) + +# %% +# Initialize the Kernel2D class +kernel_sizes = np.array([kernel_size], dtype=np.intp) +random_state = np.random.RandomState(42) +print(kernel_csr.dtype, kernel_sizes.dtype, np.intp) +kernel_2d = Kernel2D(kernel_csr, kernel_sizes, random_state) + +# Apply the kernel to the image +result_value = kernel_2d.apply_kernel_py(image, 0) + +# %% +# Plot the original image, kernel, and result +fig, axs = plt.subplots(1, 3, figsize=(15, 5)) + +axs[0].imshow(image, cmap="gray") +axs[0].set_title("Original Image") + +axs[1].imshow(kernel, cmap="viridis") +axs[1].set_title("Gaussian Kernel") + +# Highlight the region where the kernel was applied +start_x, start_y = random_state.randint(0, image_width - kernel_size + 1), random_state.randint( + 0, image_height - kernel_size + 1 +) +image_with_kernel = image.copy() +image_with_kernel[start_y : start_y + kernel_size, start_x : start_x + kernel_size] *= kernel + +axs[2].imshow(image_with_kernel, cmap="gray") +axs[2].set_title(f"Result: {result_value:.4f}") + +plt.tight_layout() +plt.show() diff --git a/treeple/tree/_kernel.py b/treeple/tree/_kernel.py index d46448f70..7b7347a3b 100644 --- a/treeple/tree/_kernel.py +++ b/treeple/tree/_kernel.py @@ -1,7 +1,7 @@ import copy import numpy as np -from scipy.sparse import issparse +from scipy.sparse import csr_matrix, issparse from .._lib.sklearn.tree._criterion import BaseCriterion from .._lib.sklearn.tree._tree import BestFirstTreeBuilder, DepthFirstTreeBuilder @@ -20,6 +20,88 @@ def gaussian_kernel(shape, sigma=1.0, mu=0.0): return g / np.sum(g) +def sample_gaussian_kernels(n_kernels, min_size, max_size, mu=0.0, sigma=1.0): + """ + Sample a set of Gaussian kernels and arrange them into a sparse kernel matrix. + + Parameters: + ----------- + n_kernels : int + The number of Gaussian kernels to sample. + min_size : int + The minimum size of the kernel (inclusive). + max_size : int + The maximum size of the kernel (inclusive). + mu : float or tuple of floats + The mean(s) of the Gaussian distribution. If a tuple, random mean values are sampled. + sigma : float or tuple of floats + The standard deviation(s) of the Gaussian distribution. If a tuple, random sigma values are sampled. + + Returns: + -------- + kernel_matrix : csr_matrix + The sparse matrix containing vectorized kernels. + kernel_params : dict of arrays + The parameters of the kernels that were sampled with keys: + - 'size': the size of each kernel; since the kernels are 2D square matrices + this is the side length of the kernel. 
+ - 'mu': the mean of each kernel + - 'sigma': the standard deviation + """ + data = [] + indices = [] + indptr = [0] + kernel_params = { + "size": np.zeros(n_kernels, dtype=np.intp), + "mu": np.zeros(n_kernels), + "sigma": np.zeros(n_kernels), + } + + for i in range(n_kernels): + # Sample the size of the kernel + size = np.random.randint(min_size, max_size + 1) + + # Sample mu and sigma if they are tuples + if isinstance(mu, tuple): + mu_sample = np.random.uniform(mu[0], mu[1]) + else: + mu_sample = mu + + if isinstance(sigma, tuple): + sigma_sample = np.random.uniform(sigma[0], sigma[1]) + else: + sigma_sample = sigma + + # Create a meshgrid for the kernel + x = np.linspace(-1, 1, size) + y = np.linspace(-1, 1, size) + X, Y = np.meshgrid(x, y) + + # Create the Gaussian kernel + kernel = np.exp(-((X - mu_sample) ** 2 + (Y - mu_sample) ** 2) / (2 * sigma_sample**2)) + + # Vectorize the kernel and store it in the sparse matrix format + kernel_vectorized = kernel.flatten() + data.extend(kernel_vectorized) + indices.extend(range(size * size)) + indptr.append(len(data)) + + # Store the kernel parameters + kernel_params["size"][i] = size + kernel_params["mu"][i] = mu_sample + kernel_params["sigma"][i] = sigma_sample + + # Convert lists to the appropriate format + data = np.array(data) + indices = np.array(indices) + indptr = np.array(indptr) + + # Create a sparse matrix + kernel_matrix = csr_matrix((data, indices, indptr), shape=(n_kernels, max_size * max_size)) + + return kernel_matrix, kernel_params + + class KernelDecisionTreeClassifier(PatchObliqueDecisionTreeClassifier): """Oblique decision tree classifier over data patches combined with Gaussian kernels. diff --git a/treeple/tree/_oblique_splitter.pyx b/treeple/tree/_oblique_splitter.pyx index ca77a30ac..cb221643f 100644 --- a/treeple/tree/_oblique_splitter.pyx +++ b/treeple/tree/_oblique_splitter.pyx @@ -309,7 +309,7 @@ cdef class BestObliqueSplitter(ObliqueSplitter): cdef intp_t end = self.end # pointer array to store feature values to split on - cdef float32_t[::1] feature_values = self.feature_values + cdef float32_t[::1] feature_values = self.feature_values cdef intp_t max_features = self.max_features cdef intp_t min_samples_leaf = self.min_samples_leaf diff --git a/treeple/tree/_utils.pxd b/treeple/tree/_utils.pxd index c814cc166..a436c0599 100644 --- a/treeple/tree/_utils.pxd +++ b/treeple/tree/_utils.pxd @@ -8,7 +8,10 @@ from .._lib.sklearn.tree._splitter cimport SplitRecord from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t -cdef int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil +cdef intp_t rand_weighted_binary( + float64_t p0, + uint32_t* random_state +) noexcept nogil cpdef unravel_index( intp_t index, cnp.ndarray[intp_t, ndim=1] shape @@ -16,6 +19,13 @@ cpdef unravel_index( cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape) -cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil +cdef void unravel_index_cython( + intp_t index, + const intp_t[:] shape, + const intp_t[:] coords +) noexcept nogil -cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil +cdef intp_t ravel_multi_index_cython( + const intp_t[:] coords, + const intp_t[:] shape +) noexcept nogil diff --git a/treeple/tree/_utils.pyx b/treeple/tree/_utils.pyx index 197b82ecf..28c038675 100644 --- a/treeple/tree/_utils.pyx +++ b/treeple/tree/_utils.pyx @@ -5,6 +5,8 @@ # cython: wraparound=False # cython: 
initializedcheck=False +from libcpp.vector cimport vector + import numpy as np cimport numpy as cnp @@ -14,7 +16,10 @@ cnp.import_array() from .._lib.sklearn.tree._utils cimport rand_uniform -cdef inline int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil: +cdef inline intp_t rand_weighted_binary( + float64_t p0, + uint32_t* random_state +) noexcept nogil: """Sample from integers 0 and 1 with different probabilities. Parameters @@ -83,7 +88,11 @@ cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape): return ravel_multi_index_cython(coords, shape) -cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil: +cdef inline void unravel_index_cython( + intp_t index, + const intp_t[:] shape, + const intp_t[:] coords +) noexcept nogil: """Converts a flat index into a tuple of coordinate arrays. Parameters @@ -109,8 +118,11 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co index //= size -cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil: - """Converts a tuple of coordinate arrays into a flat index. +cdef inline intp_t ravel_multi_index_cython( + const intp_t[:] coords, + const intp_t[:] shape +) noexcept nogil: + """Converts a tuple of coordinate arrays into a flat index in the vectorized dimension. Parameters ---------- @@ -145,3 +157,23 @@ cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) no flat_index *= shape[i + 1] return flat_index + + +cdef vector[vector[intp_t]] cartesian_cython( + vector[vector[intp_t]] sequences +) noexcept nogil: + cdef vector[vector[intp_t]] results = vector[vector[intp_t]](1) + cdef vector[vector[intp_t]] next_results + for new_values in sequences: + for result in results: + for value in new_values: + result_copy = result + result_copy.push_back(value) + next_results.push_back(result_copy) + results = next_results + next_results.clear() + return results + + +cpdef cartesian_python(vector[vector[intp_t]]& sequences): + return cartesian_cython(sequences) \ No newline at end of file diff --git a/treeple/tree/manifold/_kernel_splitter.pxd b/treeple/tree/manifold/_kernel_splitter.pxd new file mode 100644 index 000000000..d0005d13c --- /dev/null +++ b/treeple/tree/manifold/_kernel_splitter.pxd @@ -0,0 +1,37 @@ +import numpy as np + +from libcpp.vector cimport vector + +from ..._lib.sklearn.tree._splitter cimport SplitRecord +from ..._lib.sklearn.utils._typedefs cimport ( + float32_t, + float64_t, + int32_t, + intp_t, + uint8_t, + uint32_t, +) +from .._oblique_splitter cimport BestObliqueSplitter, ObliqueSplitRecord +from ._morf_splitter cimport PatchSplitter + + +cdef class UserKernelSplitter(PatchSplitter): + """A class to hold user-specified kernels.""" + # cdef vector[float32_t[:, ::1]] kernel_dictionary # A list of C-contiguous 2D kernels + cdef vector[float32_t*] kernel_dictionary # A list of C-contiguous 2D kernels + cdef vector[intp_t*] kernel_dims # A list of arrays storing the dimensions of each kernel in `kernel_dictionary` + + +cdef class GaussianKernelSplitter(PatchSplitter): + """A class to hold Gaussian kernels. + + Overrides the weights that are generated to be sampled from a Gaussian distribution. 
+ See: https://www.tutorialspoint.com/gaussian-filter-generation-in-cplusplus + See: https://gist.github.com/thomasaarholt/267ec4fff40ca9dff1106490ea3b7567 + """ + + cdef void sample_proj_mat( + self, + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices + ) noexcept nogil diff --git a/treeple/tree/manifold/_kernel_splitter.pyx b/treeple/tree/manifold/_kernel_splitter.pyx new file mode 100644 index 000000000..ee8daf801 --- /dev/null +++ b/treeple/tree/manifold/_kernel_splitter.pyx @@ -0,0 +1,351 @@ +import numpy as np + +cimport numpy as cnp +from cython.operator cimport dereference as deref +from libcpp.vector cimport vector + +from ..._lib.sklearn.tree._criterion cimport Criterion +from ..._lib.sklearn.tree._utils cimport RAND_R_MAX, rand_int +from ..._lib.sklearn.utils._typedefs cimport float32_t, int32_t +from .._utils cimport ravel_multi_index_cython, unravel_index_cython + +ctypedef float32_t* float32_t_ptr +ctypedef intp_t* intp_t_ptr + + +cdef class Kernel2D: + cdef const intp_t[:] kernels_size + cdef const float32_t[:] kernel_data + cdef const int32_t[:] kernel_indices + cdef const int32_t[:] kernel_indptr + cdef uint32_t rand_r_state # sklearn_rand_r random number state + cdef object random_state # Random state + + cdef intp_t n_dims_largest + cdef intp_t n_kernels + + def __cinit__( + self, + object kernel_matrix, + cnp.ndarray kernel_sizes, + object random_state, + *argv + ): + """ + Initialize the class with a CSR sparse matrix and a 1D array of kernel sizes. + """ + self.kernels_size = kernel_sizes + self.kernel_data = kernel_matrix.data + self.kernel_indices = kernel_matrix.indices + self.kernel_indptr = kernel_matrix.indptr + self.n_kernels, self.n_dims_largest = kernel_matrix.shape + self.random_state = random_state + self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) + + cdef inline float32_t apply_kernel( + self, + float32_t[:, ::1] image, + intp_t kernel_idx + ) noexcept nogil: + """ + Apply a kernel to a random point on a 2D image and return the resulting value. + + Parameters: + ----------- + image : double[:, :] + The 2D image array (image_height, image_width). + kernel_idx : intp_t + The index of the kernel to apply. + + Returns: + -------- + result : double + The resulting value after applying the kernel. 
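+
+        Notes
+        -----
+        A rough sketch of the intended computation, assuming the kernel is
+        stored as a single CSR row of a vectorized ``size x size`` kernel::
+
+            for idx in range(indptr[kernel_idx], indptr[kernel_idx + 1]):
+                row = indices[idx] // size
+                col = indices[idx] % size
+                if start_y + row <= max_y and start_x + col <= max_x:
+                    result += image[start_y + row, start_x + col] * data[idx]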
+        """
+        cdef uint32_t* random_state = &self.rand_r_state
+        cdef intp_t img_height = image.shape[0]
+        cdef intp_t img_width = image.shape[1]
+        cdef intp_t size = self.kernels_size[kernel_idx]
+
+        # Largest valid row/column indices of the image
+        cdef intp_t max_x = img_width - 1
+        cdef intp_t max_y = img_height - 1
+
+        # Sample a random top-left point at which to apply the kernel; kernel
+        # entries that fall outside the image are skipped below
+        cdef intp_t start_x = rand_int(0, max_x + 1, random_state)
+        cdef intp_t start_y = rand_int(0, max_y + 1, random_state)
+
+        # Compute the result
+        cdef float32_t result = 0.0
+        cdef intp_t idx, row, col
+        cdef float32_t image_value, kernel_value
+
+        cdef intp_t start = self.kernel_indptr[kernel_idx]
+        cdef intp_t end = self.kernel_indptr[kernel_idx + 1]
+
+        for idx in range(start, end):
+            row = self.kernel_indices[idx] // size
+            col = self.kernel_indices[idx] % size
+
+            # Only accumulate kernel entries that lie within the image bounds
+            if (start_y + row <= max_y) and (start_x + col <= max_x):
+                image_value = image[start_y + row, start_x + col]
+                kernel_value = self.kernel_data[idx]
+                result += image_value * kernel_value
+
+        return result
+
+    def apply_kernel_py(self, float32_t[:, ::1] image, intp_t kernel_idx):
+        """
+        Python-accessible wrapper for ``apply_kernel``.
+        """
+        return self.apply_kernel(image, kernel_idx)
+
+
+
+
+# cdef class UserKernelSplitter(PatchSplitter):
+#     def __cinit__(
+#         self,
+#         criterion: Criterion,
+#         max_features: intp_t,
+#         min_samples_leaf: intp_t,
+#         min_weight_leaf: double,
+#         random_state: object,
+#         min_patch_dims: intp_t,
+#         max_patch_dims: intp_t,
+#         dim_contiguous: cnp.uint8_t,
+#         data_dims: intp_t,
+#         boundary: str,
+#         feature_weight: float32_t,
+#         kernel_dictionary: Kernel2D,
+#         *argv
+#     ):
+#         # initialize the kernel dictionary into a vector to allow Cython to see it
+#         # see: https://stackoverflow.com/questions/46240207/passing-list-of-numpy-arrays-to-c-using-cython
+#         cdef intp_t n_arrays = len(kernel_dictionary)
+#         self.kernel_dictionary = vector[float32_t_ptr](n_arrays)  # A list of C-contiguous 2D kernels
+#         self.kernel_dims = vector[intp_t_ptr](n_arrays)  # A list of arrays storing the dimensions of each kernel in `kernel_dictionary`
+
+#         # buffers to point to each element in the list
+#         cdef float32_t[:] kernel
+#         cdef intp_t[:] kernel_dim
+
+#         cdef intp_t i
+#         for i in range(n_arrays):
+#             kernel = kernel_dictionary[i]
+#             kernel_dim = kernel_dims[i]
+
+#             # store a pointer to the data
+#             self.kernel_dictionary.push_back(&kernel[0])
+#             self.kernel_dims.push_back(&kernel_dim[0])
+
+#     cdef inline void compute_features_over_samples(
+#         self,
+#         intp_t start,
+#         intp_t end,
+#         const intp_t[:] samples,
+#         float32_t[:] feature_values,
+#         vector[float32_t]* proj_vec_weights,  # weights of the vector (max_features,)
+#         vector[intp_t]* proj_vec_indices  # indices of the features (max_features,)
+#     ) noexcept nogil:
+#         """Compute the feature values for the samples[start:end] range.
+
+#         Returns -1 in case of failure to allocate memory (and raise MemoryError)
+#         or 0 otherwise.
+#         """
+#         cdef intp_t idx, jdx
+#         cdef intp_t col_idx
+#         cdef float32_t col_weight
+
+#         # Compute linear combination of features and then
+#         # sort samples according to the feature values.
+# for jdx in range(0, proj_vec_indices.size()): +# col_idx = deref(proj_vec_indices)[jdx] +# col_weight = deref(proj_vec_weights)[jdx] + +# for idx in range(start, end): +# # initialize the feature value to 0 +# if jdx == 0: +# feature_values[idx] = 0.0 +# feature_values[idx] += self.X[samples[idx], col_idx] * col_weight + + +# cdef void sample_proj_mat( +# self, +# vector[vector[float32_t]]& proj_mat_weights, +# vector[vector[intp_t]]& proj_mat_indices +# ) noexcept nogil: +# """Sample projection matrix using a contiguous patch. +# Randomly sample patches with weight of 1. +# """ +# cdef intp_t max_features = self.max_features +# cdef intp_t proj_i + +# # define parameters for vectorized points in the original data shape +# # and top-left seed +# cdef intp_t top_left_patch_seed + +# # size of the sampled patch, which is just the size of the n-dim patch +# # (\prod_i self.patch_dims_buff[i]) +# cdef intp_t patch_size + +# cdef float32_t[:] kernel +# cdef intp_t[:] kernel_dim + +# for proj_i in range(0, max_features): +# # now get the top-left seed that is used to then determine the top-left +# # position in patch +# # compute top-left seed for the multi-dimensional patch +# top_left_patch_seed, patch_size = self.sample_top_left_seed() + +# # sample a random index in the kernel library +# kernel_idx = rand_int(0, self.kernel_dictionary.size(), &self.rand_r_state) + +# # get that kernel and add it to the projection vector indices and weights +# kernel = self.kernel_dictionary[kernel_idx] +# kernel_dim = self.kernel_dims[kernel_idx] + +# # convert top-left-patch-seed to unraveled indices +# # get the top-left index in the original data +# top_left_idx = self.unravel_index(top_left_patch_seed, self.data_dims_buff, self.ndim) + +# # loop over the kernel and add the weights and indices to the projection +# for idim in range(self.ndim): +# # get the dimension of the kernel +# kernel_dim = self.kernel_dims[kernel_idx][idim] + +# # get the top-left index in the kernel +# top_left_kernel_idx = self.unravel_index(top_left_patch_seed, kernel_dim, self.ndim) + + # loop over the kernel and add the weights and indices to the projection + # for i in range(kernel_dim): + # # get the index in the original data + # idx = self.ravel_multi_index(top_left_idx, self.data_dims_buff, self.ndim) + + # # get the index in the kernel + # kernel_idx = self.ravel_multi_index(top_left_kernel_idx, kernel_dim, self.ndim) + + # # add the weight and index to the projection matrix + # proj_mat_weights[proj_i].push_back(kernel[kernel_idx]) + # proj_mat_indices[proj_i].push_back(idx) + + # # increment the top-left index in the original data + # top_left_idx[idim] += 1 + + # # increment the top-left index in the kernel + # top_left_kernel_idx[idim] += 1 + + # # increment the top-left index in the original data + # top_left_idx[idim] += self.patch_dims_buff[idim] - kernel_dim + + # # increment the top-left index in the kernel + # top_left_kernel_idx[idim] += self.patch_dims_buff[idim] - kernel_dim + + +# cdef class UserKernelSplitter(PatchSplitter): +# def __cinit__( +# self, +# criterion: Criterion, +# max_features: intp_t, +# min_samples_leaf: intp_t, +# min_weight_leaf: double, +# random_state: object, +# min_patch_dims: intp_t, +# max_patch_dims: intp_t, +# dim_contiguous: cnp.uint8_t, +# data_dims: intp_t, +# boundary: str, +# feature_weight: float32_t, +# kernel_dictionary: Kernel2D, +# *argv +# ): +# # initialize the kernel dictionary into a vector to allow Cython to see it +# # see: 
https://stackoverflow.com/questions/46240207/passing-list-of-numpy-arrays-to-c-using-cython +# cdef intp_t n_arrays = len(kernel_dictionary) +# self.kernel_dictionary = vector[float32_t_ptr](n_arrays) # A list of C-contiguous 2D kernels +# self.kernel_dims = vector[intp_t_ptr](n_arrays) # A list of arrays storing the dimensions of each kernel in `kernel_dictionary` + +# # buffers to point to each element in the list +# cdef float32_t[:] kernel +# cdef intp_t[:] kernel_dim + +# cdef intp_t i +# for i in range(n_arrays): +# kernel = kernel_dictionary[i] +# kernel_dim = kernel_dims[i] + +# # store a pointer to the data +# self.kernel_dictionary.push_back(&kernel[0]) +# self.kernel_dims.push_back(&kernel_dim[0]) + +# cdef void sample_proj_mat( +# self, +# vector[vector[float32_t]]& proj_mat_weights, +# vector[vector[intp_t]]& proj_mat_indices +# ) noexcept nogil: +# """Sample projection matrix using a contiguous patch. +# Randomly sample patches with weight of 1. +# """ +# cdef intp_t max_features = self.max_features +# cdef intp_t proj_i + +# # define parameters for vectorized points in the original data shape +# # and top-left seed +# cdef intp_t top_left_patch_seed + +# # size of the sampled patch, which is just the size of the n-dim patch +# # (\prod_i self.patch_dims_buff[i]) +# cdef intp_t patch_size + +# cdef float32_t[:] kernel +# cdef intp_t[:] kernel_dim + +# for proj_i in range(0, max_features): +# # now get the top-left seed that is used to then determine the top-left +# # position in patch +# # compute top-left seed for the multi-dimensional patch +# top_left_patch_seed, patch_size = self.sample_top_left_seed() + +# # sample a random index in the kernel library +# # kernel_idx = + +# # get that kernel and add it to the projection vector indices and weights +# kernel = self.kernel_dictionary[kernel_idx] +# kernel_dim = self.kernel_dims[kernel_idx] + +# # convert top-left-patch-seed to unraveled indices +# # get the top-left index in the original data +# top_left_idx = self.unravel_index(top_left_patch_seed, self.data_dims_buff, self.ndim) + +# # loop over the kernel and add the weights and indices to the projection +# for idim in range(self.ndim): +# # get the dimension of the kernel +# kernel_dim = self.kernel_dims[kernel_idx][idim] + +# # get the top-left index in the kernel +# top_left_kernel_idx = self.unravel_index(top_left_patch_seed, kernel_dim, self.ndim) + +# # loop over the kernel and add the weights and indices to the projection +# # for i in range(kernel_dim): +# # # get the index in the original data +# # idx = self.ravel_multi_index(top_left_idx, self.data_dims_buff, self.ndim) + +# # # get the index in the kernel +# # kernel_idx = self.ravel_multi_index(top_left_kernel_idx, kernel_dim, self.ndim) + +# # # add the weight and index to the projection matrix +# # proj_mat_weights[proj_i].push_back(kernel[kernel_idx]) +# # proj_mat_indices[proj_i].push_back(idx) + +# # # increment the top-left index in the original data +# # top_left_idx[idim] += 1 + +# # # increment the top-left index in the kernel +# # top_left_kernel_idx[idim] += 1 + +# # # increment the top-left index in the original data +# # top_left_idx[idim] += self.patch_dims_buff[idim] - kernel_dim + +# # # increment the top-left index in the kernel +# # top_left_kernel_idx[idim] += self.patch_dims_buff[idim] - kernel_dim diff --git a/treeple/tree/manifold/_morf_splitter.pxd b/treeple/tree/manifold/_morf_splitter.pxd index ae5135980..a9618dfe5 100644 --- a/treeple/tree/manifold/_morf_splitter.pxd +++ 
b/treeple/tree/manifold/_morf_splitter.pxd @@ -71,28 +71,3 @@ cdef class PatchSplitter(BestObliqueSplitter): vector[vector[float32_t]]& proj_mat_weights, vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil - - -ctypedef float32_t* float32_t_ptr -ctypedef intp_t* inpt_t_ptr - -cdef class UserKernelSplitter(PatchSplitter): - """A class to hold user-specified kernels.""" - # cdef vector[float32_t[:, ::1]] kernel_dictionary # A list of C-contiguous 2D kernels - cdef vector[float32_t*] kernel_dictionary # A list of C-contiguous 2D kernels - cdef vector[intp_t*] kernel_dims # A list of arrays storing the dimensions of each kernel in `kernel_dictionary` - - -cdef class GaussianKernelSplitter(PatchSplitter): - """A class to hold Gaussian kernels. - - Overrides the weights that are generated to be sampled from a Gaussian distribution. - See: https://www.tutorialspoint.com/gaussian-filter-generation-in-cplusplus - See: https://gist.github.com/thomasaarholt/267ec4fff40ca9dff1106490ea3b7567 - """ - - cdef void sample_proj_mat( - self, - vector[vector[float32_t]]& proj_mat_weights, - vector[vector[intp_t]]& proj_mat_indices - ) noexcept nogil diff --git a/treeple/tree/manifold/_morf_splitter.pyx b/treeple/tree/manifold/_morf_splitter.pyx index a238f46e1..1e91ad8f7 100644 --- a/treeple/tree/manifold/_morf_splitter.pyx +++ b/treeple/tree/manifold/_morf_splitter.pyx @@ -192,7 +192,9 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.feature_weight.base if self.feature_weight is not None else None, ), self.__getstate__()) - cdef (intp_t, intp_t) sample_top_left_seed(self) noexcept nogil: + cdef inline (intp_t, intp_t) sample_top_left_seed( + self + ) noexcept nogil: """Sample the top-left seed for the n-dim patch. Returns @@ -213,10 +215,11 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # define parameters for the random patch cdef intp_t patch_dim cdef intp_t delta_patch_dim - cdef intp_t dim - cdef intp_t idx + + # create a vector to store the unraveled patch top left seed point + cdef vector[intp_t] unraveled_patch_point = vector[intp_t](self.ndim) for idx in range(self.ndim): # compute random patch width and height @@ -258,10 +261,10 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # Convert the top-left-seed value to it's appropriate index in the full image. top_left_patch_seed = max(0, dim - patch_dim + 1) - self.unraveled_patch_point[idx] = top_left_patch_seed + unraveled_patch_point[idx] = top_left_patch_seed top_left_patch_seed = ravel_multi_index_cython( - self.unraveled_patch_point, + unraveled_patch_point, self.data_dims ) return top_left_patch_seed, patch_size @@ -521,112 +524,3 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): Whether or not a feature has missing values. 
""" self.init(X, y, sample_weight, missing_values_in_feature_mask) - - -cdef class UserKernelSplitter(PatchSplitter): - def __cinit__( - self, - criterion: Criterion, - max_features: inpt_t, - min_samples_leaf: inpt_t, - min_weight_leaf: double, - random_state: object, - min_patch_dims: inpt_t, - max_patch_dims: inpt_t, - dim_contiguous: cnp.uint8_t, - data_dims: inpt_t, - boundary: str, - feature_weight: float32_t, - kernel_dictionary: object, - kernel_dims: object, - *argv - ): - # initialize the kernel dictionary into a vector to allow Cython to see it - # see: https://stackoverflow.com/questions/46240207/passing-list-of-numpy-arrays-to-c-using-cython - cdef int n_arrays = len(kernel_dictionary) - self.kernel_dictionary = vector[float32_t_ptr](n_arrays) # A list of C-contiguous 2D kernels - self.kernel_dims = vector[inpt_t_ptr](n_arrays) # A list of arrays storing the dimensions of each kernel in `kernel_dictionary` - - # buffers to point to each element in the list - cdef float32_t[:] kernel - cdef inpt_t[:] kernel_dim - - cdef int i - for i in range(n_arrays): - kernel = kernel_dictionary[i] - kernel_dim = kernel_dims[i] - - # store a pointer to the data - self.kernel_dictionary.push_back(&kernel[0]) - self.kernel_dims.push_back(&kernel_dim[0]) - - cdef void sample_proj_mat( - self, - vector[vector[float32_t]]& proj_mat_weights, - vector[vector[inpt_t]]& proj_mat_indices - ) noexcept nogil: - """Sample projection matrix using a contiguous patch. - Randomly sample patches with weight of 1. - """ - cdef inpt_t max_features = self.max_features - cdef int proj_i - - # define parameters for vectorized points in the original data shape - # and top-left seed - cdef inpt_t top_left_patch_seed - - # size of the sampled patch, which is just the size of the n-dim patch - # (\prod_i self.patch_dims_buff[i]) - cdef inpt_t patch_size - - cdef float32_t[:] kernel - cdef inpt_t[:] kernel_dim - - for proj_i in range(0, max_features): - # now get the top-left seed that is used to then determine the top-left - # position in patch - # compute top-left seed for the multi-dimensional patch - top_left_patch_seed, patch_size = self.sample_top_left_seed() - - # sample a random index in the kernel library - # kernel_idx = - - # get that kernel and add it to the projection vector indices and weights - kernel = self.kernel_dictionary[kernel_idx] - kernel_dim = self.kernel_dims[kernel_idx] - - # convert top-left-patch-seed to unraveled indices - # get the top-left index in the original data - top_left_idx = self.unravel_index(top_left_patch_seed, self.data_dims_buff, self.ndim) - - # loop over the kernel and add the weights and indices to the projection - for idim in range(self.ndim): - # get the dimension of the kernel - kernel_dim = self.kernel_dims[kernel_idx][idim] - - # get the top-left index in the kernel - top_left_kernel_idx = self.unravel_index(top_left_patch_seed, kernel_dim, self.ndim) - - # loop over the kernel and add the weights and indices to the projection - # for i in range(kernel_dim): - # # get the index in the original data - # idx = self.ravel_multi_index(top_left_idx, self.data_dims_buff, self.ndim) - - # # get the index in the kernel - # kernel_idx = self.ravel_multi_index(top_left_kernel_idx, kernel_dim, self.ndim) - - # # add the weight and index to the projection matrix - # proj_mat_weights[proj_i].push_back(kernel[kernel_idx]) - # proj_mat_indices[proj_i].push_back(idx) - - # # increment the top-left index in the original data - # top_left_idx[idim] += 1 - - # # increment the top-left 
index in the kernel - # top_left_kernel_idx[idim] += 1 - - # # increment the top-left index in the original data - # top_left_idx[idim] += self.patch_dims_buff[idim] - kernel_dim - - # # increment the top-left index in the kernel - # top_left_kernel_idx[idim] += self.patch_dims_buff[idim] - kernel_dim diff --git a/treeple/tree/manifold/meson.build b/treeple/tree/manifold/meson.build index 867da8eaa..6b900aec6 100644 --- a/treeple/tree/manifold/meson.build +++ b/treeple/tree/manifold/meson.build @@ -2,6 +2,9 @@ tree_extension_metadata = { '_morf_splitter': {'sources': ['_morf_splitter.pyx'], 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_kernel_splitter': + {'sources': ['_kernel_splitter.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, } foreach ext_name, ext_dict : tree_extension_metadata diff --git a/treeple/tree/tests/meson.build b/treeple/tree/tests/meson.build index 552eb7356..b0ccb9b5e 100644 --- a/treeple/tree/tests/meson.build +++ b/treeple/tree/tests/meson.build @@ -7,6 +7,7 @@ python_sources = [ 'test_all_trees.py', 'test_unsupervised_tree.py', 'test_multiview.py', + 'test_kernel.py', ] py.install_sources( diff --git a/treeple/tree/tests/test_kernel.py b/treeple/tree/tests/test_kernel.py new file mode 100644 index 000000000..6e18b276c --- /dev/null +++ b/treeple/tree/tests/test_kernel.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +from treeple.tree._kernel import sample_gaussian_kernels + + +def gaussian_2d(x, y, mu, sigma): + """Generate the 2D Gaussian value at point (x, y). + + Used to generate the expected Gaussian kernel for testing. + """ + return np.exp(-((x - mu) ** 2 + (y - mu) ** 2) / (2 * sigma**2)) + + +@pytest.mark.parametrize( + "n_kernels, min_size, max_size, mu, sigma", + [ + (1, 3, 3, 0.0, 1.0), # Single fixed kernel + (2, 3, 5, 0.5, 1.1), # Two fixed-size kernels with varying sigma + (3, 3, 5, (0.0, 0.5), (0.1, 1.0)), # Varying sizes, fixed sigma + ], +) +def test_sample_gaussian_kernels(n_kernels, min_size, max_size, mu, sigma): + kernel_matrix, kernel_params = sample_gaussian_kernels(n_kernels, min_size, max_size, mu, sigma) + + # Ensure kernel_matrix and kernel_sizes have correct lengths + assert kernel_matrix.shape[0] == n_kernels + assert kernel_params["size"].shape[0] == n_kernels + + for i in range(n_kernels): + size = kernel_params["size"][i] + mu_sample = kernel_params["mu"][i] + sigma_sample = kernel_params["sigma"][i] + + # Extract and reshape the kernel + start_idx = kernel_matrix.indptr[i] + end_idx = kernel_matrix.indptr[i + 1] + kernel_vector = kernel_matrix.data[start_idx:end_idx] + kernel = kernel_vector.reshape(size, size) + + # Generate the expected Gaussian kernel + x = np.linspace(-1, 1, size) + y = np.linspace(-1, 1, size) + X, Y = np.meshgrid(x, y) + expected_kernel = gaussian_2d(X, Y, mu_sample, sigma_sample) + + # Normalize the expected kernel for comparison + expected_kernel /= expected_kernel.sum() + kernel /= kernel.sum() + + # Check that the kernel matches the expected Gaussian distribution + np.testing.assert_allclose(kernel, expected_kernel, rtol=1e-2, atol=1e-2) From 6fe71a5f0e64466c9022358b91c79fdeb4c297ed Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Aug 2024 16:25:55 -0700 Subject: [PATCH 6/7] Updated morf and utils code --- treeple/tree/_oblique_splitter.pxd | 6 - treeple/tree/_oblique_splitter.pyx | 80 ++-- treeple/tree/_utils.pxd | 30 +- treeple/tree/_utils.pyx | 224 ++++++--- treeple/tree/manifold/_morf_splitter.pxd | 22 +- 
treeple/tree/manifold/_morf_splitter.pyx | 585 +++++++++++++++++++---- treeple/tree/tests/meson.build | 1 + treeple/tree/tests/test_manifold.py | 89 ++++ 8 files changed, 829 insertions(+), 208 deletions(-) create mode 100644 treeple/tree/tests/test_manifold.py diff --git a/treeple/tree/_oblique_splitter.pxd b/treeple/tree/_oblique_splitter.pxd index 124a66dd6..65ca16e14 100644 --- a/treeple/tree/_oblique_splitter.pxd +++ b/treeple/tree/_oblique_splitter.pxd @@ -83,12 +83,6 @@ cdef class BaseObliqueSplitter(Splitter): SplitRecord* split, ) except -1 nogil - cdef inline void fisher_yates_shuffle_memview( - self, - intp_t[::1] indices_to_sample, - intp_t grid_size, - uint32_t* random_state - ) noexcept nogil cdef class ObliqueSplitter(BaseObliqueSplitter): # The splitter searches in the input space for a linear combination of features and a threshold diff --git a/treeple/tree/_oblique_splitter.pyx b/treeple/tree/_oblique_splitter.pyx index cb221643f..78451b74d 100644 --- a/treeple/tree/_oblique_splitter.pyx +++ b/treeple/tree/_oblique_splitter.pyx @@ -11,6 +11,7 @@ from libcpp.vector cimport vector from .._lib.sklearn.tree._criterion cimport Criterion from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform +from ._utils cimport fisher_yates_shuffle cdef float64_t INFINITY = np.inf @@ -46,8 +47,12 @@ cdef class BaseObliqueSplitter(Splitter): def __setstate__(self, d): pass - cdef int node_reset(self, intp_t start, intp_t end, - float64_t* weighted_n_node_samples) except -1 nogil: + cdef int node_reset( + self, + intp_t start, + intp_t end, + float64_t* weighted_n_node_samples + ) except -1 nogil: """Reset splitter on node samples[start:end]. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -62,17 +67,7 @@ cdef class BaseObliqueSplitter(Splitter): weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ - - self.start = start - self.end = end - - self.criterion.init(self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples) - self.criterion.set_sample_pointers(start, end) - - weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples + Splitter.node_reset(self, start, end, weighted_n_node_samples) # Clear all projection vectors for i in range(self.max_features): @@ -102,8 +97,8 @@ cdef class BaseObliqueSplitter(Splitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]* proj_vec_weights, # weights of the vector for this projection (n_non_zeros',) + vector[intp_t]* proj_vec_indices # indices of the features for this projection (n_non_zeros',) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. @@ -126,20 +121,6 @@ cdef class BaseObliqueSplitter(Splitter): feature_values[idx] = 0.0 feature_values[idx] += self.X[samples[idx], col_idx] * col_weight - cdef inline void fisher_yates_shuffle_memview( - self, - intp_t[::1] indices_to_sample, - intp_t grid_size, - uint32_t* random_state, - ) noexcept nogil: - cdef intp_t i, j - - # XXX: should this be `i` or `i+1`? for valid Fisher-Yates? 
- for i in range(0, grid_size - 1): - j = rand_int(i, grid_size, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] - cdef class ObliqueSplitter(BaseObliqueSplitter): def __cinit__( self, @@ -220,6 +201,43 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): # self.feature_weights = np.ones((self.n_features,), dtype=float32_t) / self.n_features return 0 + cdef void sample_proj_vec( + self, + vector[float32_t]& proj_vec_weights, + vector[intp_t]& proj_vec_indices + ) noexcept nogil: + cdef intp_t n_features = self.n_features + cdef intp_t n_non_zeros = self.n_non_zeros + cdef uint32_t* random_state = &self.rand_r_state + + cdef intp_t i, feat_i, proj_i, rand_vec_index + cdef float32_t weight + + # construct an array to sample from mTry x n_features set of indices + cdef intp_t[::1] indices_to_sample = self.indices_to_sample + cdef intp_t grid_size = self.max_features * self.n_features + + # shuffle indices over the 2D grid to sample using Fisher-Yates + fisher_yates_shuffle(indices_to_sample, grid_size, random_state) + + # sample 'n_non_zeros' in a mtry X n_features projection matrix + # which consists of +/- 1's chosen at a 1/2s rate + for i in range(0, n_non_zeros): + # get the next index from the shuffled index array + rand_vec_index = indices_to_sample[i] + + # get the projection index (i.e. row of the projection matrix) and + # feature index (i.e. column of the projection matrix) + proj_i = rand_vec_index // n_features + feat_i = rand_vec_index % n_features + + # sample a random weight + weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 + + proj_vec_indices[proj_i].push_back(feat_i) # Store index of nonzero + proj_vec_weights[proj_i].push_back(weight) # Store weight of nonzero + + cdef void sample_proj_mat( self, vector[vector[float32_t]]& proj_mat_weights, @@ -257,7 +275,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): cdef intp_t grid_size = self.max_features * self.n_features # shuffle indices over the 2D grid to sample using Fisher-Yates - self.fisher_yates_shuffle_memview(indices_to_sample, grid_size, random_state) + fisher_yates_shuffle(indices_to_sample, grid_size, random_state) # sample 'n_non_zeros' in a mtry X n_features projection matrix # which consists of +/- 1's chosen at a 1/2s rate @@ -340,6 +358,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # XXX: 'feature' is not actually used in oblique split records # Just indicates which split was sampled current_split.feature = feat_i + + # sample the projection vector current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] diff --git a/treeple/tree/_utils.pxd b/treeple/tree/_utils.pxd index a436c0599..8a5957518 100644 --- a/treeple/tree/_utils.pxd +++ b/treeple/tree/_utils.pxd @@ -4,9 +4,23 @@ cimport numpy as cnp cnp.import_array() +from libcpp.vector cimport vector + from .._lib.sklearn.tree._splitter cimport SplitRecord from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t +ctypedef fused vector_or_memview: + vector[intp_t] + intp_t[::1] + intp_t[:] + + +cdef inline void fisher_yates_shuffle( + vector_or_memview indices_to_sample, + intp_t grid_size, + uint32_t* random_state, +) noexcept nogil + cdef intp_t rand_weighted_binary( float64_t p0, @@ -22,10 +36,22 @@ cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape) cdef void unravel_index_cython( intp_t index, const intp_t[:] shape, - const intp_t[:] coords + vector_or_memview 
coords ) noexcept nogil cdef intp_t ravel_multi_index_cython( - const intp_t[:] coords, + vector_or_memview coords, const intp_t[:] shape ) noexcept nogil + +cdef void compute_vectorized_indices_from_cartesian( + intp_t base_index, + vector[vector[intp_t]]& index_arrays, + const intp_t[:] data_dims, + vector[intp_t]& result +) noexcept nogil + +cdef memoryview[float32_t, ndim=3] init_2dmemoryview( + cnp.ndarray array, + const intp_t[:] data_dims +) diff --git a/treeple/tree/_utils.pyx b/treeple/tree/_utils.pyx index 28c038675..786ff70a3 100644 --- a/treeple/tree/_utils.pyx +++ b/treeple/tree/_utils.pyx @@ -13,7 +13,35 @@ cimport numpy as cnp cnp.import_array() -from .._lib.sklearn.tree._utils cimport rand_uniform +from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform + + +cdef inline void fisher_yates_shuffle( + vector_or_memview indices_to_sample, + intp_t grid_size, + uint32_t* random_state, +) noexcept nogil: + """Shuffle the indices in place using the Fisher-Yates algorithm. + + Parameters + ---------- + indices_to_sample : A C++ vector or 1D memoryview + The indices to shuffle. + grid_size : intp_t + The size of the grid to shuffle. This is explicitly passed in + to support the templated `vector_or_memview` type, which allows + for both C++ vectors and Cython memoryviews. Getitng the length + of both types uses different API. + random_state : uint32_t* + The random state. + """ + cdef intp_t i, j + + # XXX: should this be `i` or `i+1`? for valid Fisher-Yates? + for i in range(0, grid_size - 1): + j = rand_int(i, grid_size, random_state) + indices_to_sample[j], indices_to_sample[i] = \ + indices_to_sample[i], indices_to_sample[j] cdef inline intp_t rand_weighted_binary( @@ -36,62 +64,11 @@ cdef inline intp_t rand_weighted_binary( else: return 1 -cpdef unravel_index( - intp_t index, - cnp.ndarray[intp_t, ndim=1] shape -): - """Converts a flat index or array of flat indices into a tuple of coordinate arrays. - - Purely used for testing purposes. - - Parameters - ---------- - index : intp_t - A flat index. - shape : numpy.ndarray[intp_t, ndim=1] - The shape of the array into which the flat indices should be converted. - - Returns - ------- - numpy.ndarray[intp_t, ndim=1] - A coordinate array having the same shape as the input `shape`. - """ - index = np.intp(index) - shape = np.array(shape) - coords = np.empty(shape.shape[0], dtype=np.intp) - unravel_index_cython(index, shape, coords) - return coords - - -cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape): - """Converts a tuple of coordinate arrays into a flat index. - - Purely used for testing purposes. - - Parameters - ---------- - coords : numpy.ndarray[intp_t, ndim=1] - An array of coordinate arrays to be converted. - shape : numpy.ndarray[intp_t, ndim=1] - The shape of the array into which the coordinates should be converted. - - Returns - ------- - intp_t - The resulting flat index. - - Raises - ------ - ValueError - If the input `coords` have invalid indices. - """ - return ravel_multi_index_cython(coords, shape) - cdef inline void unravel_index_cython( intp_t index, const intp_t[:] shape, - const intp_t[:] coords + vector_or_memview coords ) noexcept nogil: """Converts a flat index into a tuple of coordinate arrays. @@ -101,13 +78,9 @@ cdef inline void unravel_index_cython( The flat index to be converted. shape : numpy.ndarray[intp_t, ndim=1] The shape of the array into which the flat index should be converted. 
- coords : numpy.ndarray[intp_t, ndim=1] - A preinitialized memoryview array of coordinate arrays to be converted. - - Returns - ------- - numpy.ndarray[intp_t, ndim=1] - An array of coordinate arrays, with each coordinate array having the same shape as the input `shape`. + coords : intp_t[:] or vector[intp_t] + A preinitialized array of coordinates to store the result of the + unraveled `index`. """ cdef intp_t ndim = shape.shape[0] cdef intp_t j, size @@ -119,15 +92,15 @@ cdef inline void unravel_index_cython( cdef inline intp_t ravel_multi_index_cython( - const intp_t[:] coords, + vector_or_memview coords, const intp_t[:] shape ) noexcept nogil: """Converts a tuple of coordinate arrays into a flat index in the vectorized dimension. Parameters ---------- - coords : numpy.ndarray[intp_t, ndim=1] - An array of coordinate arrays to be converted. + coords : intp_t[:] or vector[intp_t] + An array of coordinates to be converted and vectorized into a sinlg shape : numpy.ndarray[intp_t, ndim=1] The shape of the array into which the coordinates should be converted. @@ -159,6 +132,55 @@ cdef inline intp_t ravel_multi_index_cython( return flat_index +cdef inline void compute_vectorized_indices_from_cartesian( + intp_t base_index, + vector[vector[intp_t]]& index_arrays, + const intp_t[:] data_dims, + vector[intp_t]& result +) noexcept nogil: + """ + Compute the vectorized indices for all index sets from the Cartesian product of index arrays. + + :param base_index: The initial vectorized index (top-left point in the patch). + :param index_arrays: A list of lists, where each sublist contains the indices for that dimension. + :param data_dims: The dimensions of the data. + :param result: C++ vector to store the vectorized indices. + """ + cdef int n_dims = len(index_arrays) + cdef vector[intp_t] indices = vector[intp_t](n_dims) + cdef intp_t vectorized_index + cdef intp_t i + cdef intp_t n_products = 1 + + # Calculate the total number of Cartesian products + for i in range(n_dims): + n_products *= len(index_arrays[i]) + + # Initialize the current indices for each dimension + cdef vector[intp_t] current_indices = vector[intp_t](n_dims) + for i in range(n_dims): + current_indices[i] = 0 + + # Iterate over Cartesian products and compute vectorized indices + for _ in range(n_products): + # Compute the vectorized index for the current combination + for i in range(n_dims): + indices[i] = index_arrays[i][current_indices[i]] + vectorized_index = ravel_multi_index_cython( + indices, data_dims + ) + result.push_back(vectorized_index) + + # Move to the next combination + for i in range(n_dims - 1, -1, -1): + current_indices[i] += 1 + if current_indices[i] < len(index_arrays[i]): + break + current_indices[i] = 0 + if i == 0: + return # Finished all combinations + + cdef vector[vector[intp_t]] cartesian_cython( vector[vector[intp_t]] sequences ) noexcept nogil: @@ -176,4 +198,78 @@ cdef vector[vector[intp_t]] cartesian_cython( cpdef cartesian_python(vector[vector[intp_t]]& sequences): - return cartesian_cython(sequences) \ No newline at end of file + return cartesian_cython(sequences) + + +cdef memoryview[float32_t, ndim=3] init_2dmemoryview( + cnp.ndarray array, + const intp_t[:] data_dims +): + cdef intp_t n_samples = array.shape[0] + cdef intp_t ndim = data_dims.shape[0] + assert ndim == 2, f"Currently only 2D images are accepted." 
+ + # Reshape the array into (n_samples, *data_dims) + cdef cnp.ndarray reshaped_array = array.reshape((n_samples, data_dims[0], data_dims[1])) + + # Create a memoryview with the appropriate number of dimensions + cdef memoryview[float32_t, ndim=3] view = reshaped_array + return view + +#################################################################################################### +# CPDEF functions for testing purposes +#################################################################################################### + +cpdef unravel_index( + intp_t index, + cnp.ndarray[intp_t, ndim=1] shape +): + """Converts a flat index or array of flat indices into a tuple of coordinate arrays. + + Purely used for testing purposes. + + Parameters + ---------- + index : intp_t + A flat index. + shape : numpy.ndarray[intp_t, ndim=1] + The shape of the array into which the flat indices should be converted. + + Returns + ------- + numpy.ndarray[intp_t, ndim=1] + A coordinate array having the same shape as the input `shape`. + """ + index = np.intp(index) + shape = np.array(shape, dtype=np.intp) + coords = np.empty(shape.shape[0], dtype=np.intp) + + cdef const intp_t[:] shape_memview = shape + cdef intp_t[:] coords_memview = coords + unravel_index_cython(index, shape_memview, coords_memview) + return coords + + +cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape): + """Converts a tuple of coordinate arrays into a flat index. + + Purely used for testing purposes. + + Parameters + ---------- + coords : numpy.ndarray[intp_t, ndim=1] + An array of coordinate arrays to be converted. + shape : numpy.ndarray[intp_t, ndim=1] + The shape of the array into which the coordinates should be converted. + + Returns + ------- + intp_t + The resulting flat index. + + Raises + ------ + ValueError + If the input `coords` have invalid indices. + """ + return ravel_multi_index_cython(coords, shape) diff --git a/treeple/tree/manifold/_morf_splitter.pxd b/treeple/tree/manifold/_morf_splitter.pxd index a9618dfe5..164b9c6f9 100644 --- a/treeple/tree/manifold/_morf_splitter.pxd +++ b/treeple/tree/manifold/_morf_splitter.pxd @@ -33,19 +33,15 @@ cdef class PatchSplitter(BestObliqueSplitter): # `data_width` are used to determine the vectorized indices corresponding to # (x,y) coordinates in the original un-vectorized data. 
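As a concrete illustration of the comment above: in a row-major image of shape (height, width), the vectorized index of a (row, col) coordinate is row * width + col, which is what `ravel_multi_index_cython` / `unravel_index_cython` compute, and `compute_vectorized_indices_from_cartesian` applies the same raveling to every point in a patch. A small numpy/itertools sketch with illustrative values (not the Cython internals):

    import itertools
    import numpy as np

    data_dims = (8, 8)     # (height, width) of the image
    top_left = (2, 3)      # top-left corner of a sampled patch
    patch_dims = (3, 2)    # sampled patch size per dimension

    # flat index of the top-left point in the vectorized data
    top_left_flat = np.ravel_multi_index(top_left, data_dims)  # 2 * 8 + 3 == 19

    # flat indices of every point in the patch, i.e. the Cartesian product of the
    # per-dimension index ranges, raveled back into the vectorized data
    per_dim = [range(t, t + p) for t, p in zip(top_left, patch_dims)]
    patch_flat = [np.ravel_multi_index(pt, data_dims) for pt in itertools.product(*per_dim)]

    assert np.unravel_index(top_left_flat, data_dims) == top_left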
- cdef public intp_t max_patch_height # Maximum height of the patch to sample - cdef public intp_t max_patch_width # Maximum width of the patch to sample - cdef public intp_t min_patch_height # Minimum height of the patch to sample - cdef public intp_t min_patch_width # Minimum width of the patch to sample - cdef public intp_t data_height # Height of the input data - cdef public intp_t data_width # Width of the input data - cdef public intp_t ndim # The number of dimensions of the input data + cdef const intp_t[:] data_dims # The dimensions of the input data + cdef const intp_t[:] min_patch_dims # The minimum size of the patch to sample in each dimension + cdef const intp_t[:] max_patch_dims # The maximum size of the patch to sample in each dimension + cdef const uint8_t[:] dim_contiguous # A boolean array indicating whether each dimension is contiguous - cdef const intp_t[:] data_dims # The dimensions of the input data - cdef const intp_t[:] min_patch_dims # The minimum size of the patch to sample in each dimension - cdef const intp_t[:] max_patch_dims # The maximum size of the patch to sample in each dimension - cdef const uint8_t[:] dim_contiguous # A boolean array indicating whether each dimension is contiguous + # TODO: assumes all oblique splitters only work with dense data + cdef public memoryview X_reshaped + cdef memoryview patch_nd_indices # TODO: check if this works and is necessary for discontiguous data # cdef intp_t[:] stride_offsets # The stride offsets for each dimension @@ -56,13 +52,13 @@ cdef class PatchSplitter(BestObliqueSplitter): cdef intp_t[::1] _index_data_buffer cdef intp_t[::1] _index_patch_buffer - cdef intp_t[:] patch_dims_buff # A buffer to store the dimensions of the sampled patch + cdef intp_t[:] patch_sampled_size # A buffer to store the dimensions of the sampled patch cdef intp_t[:] unraveled_patch_point # A buffer to store the unraveled patch point # All oblique splitters (i.e. non-axis aligned splitters) require a # function to sample a projection matrix that is applied to the feature matrix # to quickly obtain the sampled projections for candidate splits. - cdef (intp_t, intp_t) sample_top_left_seed( + cdef intp_t sample_top_left_seed( self ) noexcept nogil diff --git a/treeple/tree/manifold/_morf_splitter.pyx b/treeple/tree/manifold/_morf_splitter.pyx index 1e91ad8f7..f335999c4 100644 --- a/treeple/tree/manifold/_morf_splitter.pyx +++ b/treeple/tree/manifold/_morf_splitter.pyx @@ -5,6 +5,8 @@ # cython: wraparound=False # cython: initializedcheck=False +cimport numpy as cnp + import numpy as np from cython.operator cimport dereference as deref @@ -12,7 +14,24 @@ from libcpp.vector cimport vector from ..._lib.sklearn.tree._criterion cimport Criterion from ..._lib.sklearn.tree._utils cimport rand_int -from .._utils cimport ravel_multi_index_cython, unravel_index_cython +from ..._lib.sklearn.tree._tree cimport ParentInfo +from .._utils cimport init_2dmemoryview, ravel_multi_index_cython, unravel_index_cython, fisher_yates_shuffle, compute_vectorized_indices_from_cartesian +from .._sklearn_splitter cimport sort + +cdef float64_t INFINITY = np.inf + +# Mitigate precision differences between 32 bit and 64 bit +cdef float32_t FEATURE_THRESHOLD = 1e-7 + +cdef inline void _init_split(ObliqueSplitRecord* self, intp_t start_pos) noexcept nogil: + self.impurity_left = INFINITY + self.impurity_right = INFINITY + self.pos = start_pos + self.feature = 0 + self.threshold = 0. 
+ self.improvement = -INFINITY + self.missing_go_to_left = False + self.n_missing = 0 cdef class PatchSplitter(BestObliqueSplitter): @@ -33,46 +52,11 @@ cdef class PatchSplitter(BestObliqueSplitter): const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, ) except -1: + # store a view to the (n_samples, n_features) dataset BestObliqueSplitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) - return 0 - - cdef int node_reset( - self, - intp_t start, - intp_t end, - float64_t* weighted_n_node_samples - ) except -1 nogil: - """Reset splitter on node samples[start:end]. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - start : intp_t - The index of the first sample to consider - end : intp_t - The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=float64_t pointer - The total weight of those samples - """ - - self.start = start - self.end = end - - self.criterion.init(self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples) - self.criterion.set_sample_pointers(start, end) - - weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples - - # Clear all projection vectors - for i in range(self.max_features): - self.proj_mat_weights[i].clear() - self.proj_mat_indices[i].clear() + # store a reshaped view of (n_samples, height, width) dataset + self.X_reshaped = init_2dmemoryview(X, self.data_dims) return 0 cdef void sample_proj_mat( @@ -87,32 +71,11 @@ cdef class PatchSplitter(BestObliqueSplitter): """ pass - cdef (intp_t, intp_t) sample_top_left_seed(self) noexcept nogil: + cdef intp_t sample_top_left_seed(self) noexcept nogil: pass -cdef class BaseDensePatchSplitter(PatchSplitter): - cdef int init( - self, - object X, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - const uint8_t[::1] missing_values_in_feature_mask, - # const int32_t[:] n_categories - ) except -1: - """Initialize the splitter - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. 
- """ - # Call parent init - PatchSplitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) - - self.X = X - return 0 - - -cdef class BestPatchSplitter(BaseDensePatchSplitter): +cdef class BestPatchSplitter(PatchSplitter): def __cinit__( self, Criterion criterion, @@ -151,7 +114,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.data_dims = data_dims # create a buffer for storing the patch dimensions sampled per projection matrix - self.patch_dims_buff = np.zeros(data_dims.shape[0], dtype=np.intp) + self.patch_sampled_size = np.zeros(data_dims.shape[0], dtype=np.intp) self.unraveled_patch_point = np.zeros(data_dims.shape[0], dtype=np.intp) # store the min and max patch dimension constraints @@ -159,6 +122,19 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.max_patch_dims = max_patch_dims self.dim_contiguous = dim_contiguous + # create a memoryview to store the n-dimensional indices + self.patch_nd_indices = np.zeros(self.max_patch_dims[:], dtype=np.intp) + + # store random indices for discontiguous sampling if necessary + # these are initialized here to allow fisher-yates shuffling without having + # to re-allocate memory + self.random_indices = vector[vector[intp_t]](self.ndim) + for idx in range(self.ndim): + if not self.dim_contiguous[idx]: + self.random_indices[idx].reserve(self.max_patch_dims[idx]) + for jdx in range(self.max_patch_dims[idx]): + self.random_indices[idx].push_back(jdx) + # initialize a buffer to allow for Fisher-Yates self._index_patch_buffer = np.zeros(np.max(self.max_patch_dims), dtype=np.intp) self._index_data_buffer = np.zeros(np.max(self.data_dims), dtype=np.intp) @@ -192,23 +168,167 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.feature_weight.base if self.feature_weight is not None else None, ), self.__getstate__()) - cdef inline (intp_t, intp_t) sample_top_left_seed( + cdef int node_split( + self, + ParentInfo* parent_record, + SplitRecord* split, + ) except -1 nogil: + """Find the best_split split on node samples[start:end] + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
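The search below follows the usual dense oblique splitter pattern: project the node's samples onto each candidate vector, sort the projected values, scan the admissible split positions, and keep the split with the best proxy impurity improvement, using the midpoint of neighbouring values as the threshold. A simplified Python sketch of that scan, with a hypothetical `score(left_indices)` callable standing in for the Criterion machinery:

    import numpy as np

    def best_threshold(values, score, min_samples_leaf=1):
        # values: projected feature values for the node's samples
        # score: hypothetical callable returning a proxy improvement for a left split
        order = np.argsort(values)
        v = values[order]
        best_gain, best_thresh = -np.inf, None
        for p in range(min_samples_leaf, len(v) - min_samples_leaf + 1):
            if v[p] <= v[p - 1] + 1e-7:  # skip (near-)ties, cf. FEATURE_THRESHOLD
                continue
            gain = score(order[:p])
            if gain > best_gain:
                # midpoint threshold, computed as a sum of halves as in the code below
                best_gain, best_thresh = gain, v[p - 1] / 2.0 + v[p] / 2.0
        return best_gain, best_thresh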
+ """ + # typecast the pointer to an ObliqueSplitRecord + cdef ObliqueSplitRecord* oblique_split = (split) + + # Draw random splits and pick the best_split + cdef intp_t[::1] samples = self.samples + cdef intp_t start = self.start + cdef intp_t end = self.end + + # pointer array to store feature values to split on + cdef float32_t[::1] feature_values = self.feature_values + cdef intp_t max_features = self.max_features + cdef intp_t min_samples_leaf = self.min_samples_leaf + + # keep track of split record for current_split node and the best_split split + # found among the sampled projection vectors + cdef ObliqueSplitRecord best_split, current_split + cdef float64_t current_proxy_improvement = -INFINITY + cdef float64_t best_proxy_improvement = -INFINITY + + cdef float64_t impurity = parent_record.impurity + + cdef intp_t feat_i, p # index over computed features and start/end + cdef intp_t partition_end + cdef float32_t temp_d # to compute a projection feature value + + # instantiate the split records + _init_split(&best_split, end) + + # For every vector in the projection matrix + for feat_i in range(max_features): + # Projection vector has no nonzeros + if self.proj_mat_weights[feat_i].empty(): + continue + + # XXX: 'feature' is not actually used in oblique split records + # Just indicates which split was sampled + current_split.feature = feat_i + + # sample the projection vector + self.sample_proj_vec(self.proj_mat_weights[feat_i], self.proj_mat_indices[feat_i]) + current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] + current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] + + # Compute linear combination of features and then + # sort samples according to the feature values. + self.compute_features_over_samples( + start, + end, + samples, + feature_values, + &self.proj_mat_weights[feat_i], + &self.proj_mat_indices[feat_i] + ) + + # Sort the samples + sort(&feature_values[start], &samples[start], end - start) + + # Evaluate all splits + self.criterion.reset() + p = start + while p < end: + while (p + 1 < end and feature_values[p + 1] <= feature_values[p] + FEATURE_THRESHOLD): + p += 1 + + p += 1 + + if p < end: + current_split.pos = p + + # Reject if min_samples_leaf is not guaranteed + if (((current_split.pos - start) < min_samples_leaf) or + ((end - current_split.pos) < min_samples_leaf)): + continue + + self.criterion.update(current_split.pos) + # Reject if min_weight_leaf is not satisfied + if self.check_postsplit_conditions() == 1: + continue + + current_proxy_improvement = \ + self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + # sum of halves is used to avoid infinite value + current_split.threshold = feature_values[p - 1] / 2.0 + feature_values[p] / 2.0 + + if ( + (current_split.threshold == feature_values[p]) or + (current_split.threshold == INFINITY) or + (current_split.threshold == -INFINITY) + ): + current_split.threshold = feature_values[p - 1] + + best_split = current_split # copy + + # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] + if best_split.pos < end: + partition_end = end + p = start + + while p < partition_end: + # Account for projection vector + temp_d = 0.0 + for j in range(best_split.proj_vec_indices.size()): + temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ + deref(best_split.proj_vec_weights)[j] + + if temp_d <= best_split.threshold: + p += 1 + + else: + partition_end -= 1 + samples[p], 
samples[partition_end] = \ + samples[partition_end], samples[p] + + self.criterion.reset() + self.criterion.update(best_split.pos) + self.criterion.children_impurity(&best_split.impurity_left, + &best_split.impurity_right) + best_split.improvement = self.criterion.impurity_improvement( + impurity, best_split.impurity_left, best_split.impurity_right) + + # Return values + deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices + deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights + deref(oblique_split).feature = best_split.feature + deref(oblique_split).pos = best_split.pos + deref(oblique_split).threshold = best_split.threshold + deref(oblique_split).improvement = best_split.improvement + deref(oblique_split).impurity_left = best_split.impurity_left + deref(oblique_split).impurity_right = best_split.impurity_right + + # XXX: Fix when we can track constants + parent_record.n_constant_features = 0 + return 0 + + cdef inline intp_t sample_top_left_seed( self ) noexcept nogil: - """Sample the top-left seed for the n-dim patch. + """Sample the top-left seed, and patch size for the n-dim patch. Returns ------- top_left_seed : intp_t The top-left seed vectorized (i.e. raveled) for the n-dim patch. - patch_size : intp_t - The total size of the n-dim patch (i.e. the volume). """ # now get the top-left seed that is used to then determine the top-left # position in patch # compute top-left seed for the multi-dimensional patch cdef intp_t top_left_patch_seed - cdef intp_t patch_size = 1 cdef uint32_t* random_state = &self.rand_r_state @@ -240,8 +360,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): top_left_patch_seed = rand_int(0, delta_patch_dim, random_state) # write to buffer - self.patch_dims_buff[idx] = patch_dim - patch_size *= patch_dim + self.patch_sampled_size[idx] = patch_dim elif self.boundary == "wrap": # add circular boundary conditions delta_patch_dim = self.data_dims[idx] + 2 * (patch_dim - 1) @@ -254,20 +373,94 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # resample the patch dimension due to padding patch_dim = min(patch_dim, min(dim+1, self.data_dims[idx] + patch_dim - dim - 1)) - self.patch_dims_buff[idx] = patch_dim - patch_size *= patch_dim + self.patch_sampled_size[idx] = patch_dim # TODO: make this work # Convert the top-left-seed value to it's appropriate index in the full image. top_left_patch_seed = max(0, dim - patch_dim + 1) - unraveled_patch_point[idx] = top_left_patch_seed + # unraveled_patch_point[idx] = top_left_patch_seed + self.unraveled_patch_point[idx] = top_left_patch_seed + + # top_left_patch_seed = ravel_multi_index_cython( + # unraveled_patch_point, + # self.data_dims + # ) + top_left_patch_seed = ravel_multi_index_cython( + self.unraveled_patch_point, + self.data_dims + ) + return top_left_patch_seed + + + cdef inline intp_t sample_top_left_seed_v2( + self + ) noexcept nogil: + """Sample the top-left seed, and patch size for the n-dim patch. + + Returns + ------- + top_left_seed : intp_t + The top-left seed vectorized (i.e. raveled) for the n-dim patch. 
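For intuition, with boundary=None the per-dimension sampling in this function amounts to the following rough numpy sketch (illustrative values, not the Cython implementation): pick a patch size between the min and max constraints, pick a top-left offset so the patch stays inside the image, then ravel the top-left point into the vectorized data.

    import numpy as np

    rng = np.random.default_rng(0)
    data_dims = np.array([8, 8])
    min_patch_dims = np.array([1, 1])
    max_patch_dims = np.array([3, 2])

    patch_dims = rng.integers(min_patch_dims, max_patch_dims + 1)  # per-dim patch size
    top_left = rng.integers(0, data_dims - patch_dims + 1)         # patch stays in bounds
    top_left_seed = np.ravel_multi_index(tuple(top_left), tuple(data_dims))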
+ """ + # the top-left position in the patch for each dimension + cdef intp_t top_left_patch_seed + + cdef uint32_t* random_state = &self.rand_r_state + + # define parameters for the random patch + cdef intp_t patch_dim + cdef intp_t delta_patch_dim + cdef intp_t dim + + # pre-allocated buffer to store the unraveled patch top left seed point + cdef intp_t[:] unraveled_patch_point = self.unraveled_patch_point + + for dim in range(self.ndim): + # compute random patch width and height + # Note: By constraining max patch height/width to be at least the min + # patch height/width this ensures that the minimum value of + # patch_height and patch_width is 1 + patch_dim = rand_int( + self.min_patch_dims[dim], + self.max_patch_dims[dim] + 1, + random_state + ) + + if not self.dim_contiguous[dim]: + # fisher-yates shuffle the discontiguous dimension, so we can randomly sample + # without replacement, the indices of the patch in that dimension + fisher_yates_shuffle(self.random_indices[dim], self.random_indices[dim].size(), random_state) + + # sample the top-left index and patch size for this dimension based on boundary effects + if self.boundary is None: + # compute the difference between the image dimensions and the current + # random patch dimensions for sampling + delta_patch_dim = (self.data_dims[dim] - patch_dim) + 1 + top_left_patch_seed = rand_int(0, delta_patch_dim, random_state) + + # write to buffer + self.patch_sampled_size[dim] = patch_dim + elif self.boundary == "wrap": + pass + + # now add the relevant indices for each element of the patch + for jdx in range(patch_dim): + if self.dim_contiguous[dim]: + self.patch_nd_indices[dim][jdx] = top_left_patch_seed + jdx + else: + # if discontiguous, we will perform random sampling + self.patch_nd_indices[dim][jdx] = self.random_indices[dim][jdx] + + unraveled_patch_point[dim] = top_left_patch_seed + + # get the vectorized index of the top-left seed top_left_patch_seed = ravel_multi_index_cython( unraveled_patch_point, self.data_dims ) - return top_left_patch_seed, patch_size + return top_left_patch_seed cdef void sample_proj_mat( self, @@ -285,24 +478,19 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # and top-left seed cdef intp_t top_left_patch_seed - # size of the sampled patch, which is just the size of the n-dim patch - # (\prod_i self.patch_dims_buff[i]) - cdef intp_t patch_size - for proj_i in range(0, max_features): # now get the top-left seed that is used to then determine the top-left # position in patch # compute top-left seed for the multi-dimensional patch - top_left_patch_seed, patch_size = self.sample_top_left_seed() + top_left_patch_seed = self.sample_top_left_seed() # sample a projection vector given the top-left seed point in n-dimensional space self.sample_proj_vec( proj_mat_weights, proj_mat_indices, proj_i, - patch_size, top_left_patch_seed, - self.patch_dims_buff + self.patch_sampled_size ) cdef void sample_proj_vec( @@ -310,7 +498,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): vector[vector[float32_t]]& proj_mat_weights, vector[vector[intp_t]]& proj_mat_indices, intp_t proj_i, - intp_t patch_size, intp_t top_left_patch_seed, const intp_t[:] patch_dims, ) noexcept nogil: @@ -318,6 +505,14 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # iterates over the size of the patch cdef intp_t patch_idx + cdef intp_t i + + # size of the sampled patch, which is just the size of the n-dim patch + # (\prod_i self.patch_sampled_size[i]) + cdef intp_t patch_size = 1 + for i in range(0, self.ndim): + 
patch_size *= patch_dims[i] + # stores how many patches we have iterated so far cdef intp_t vectorized_patch_offset cdef intp_t vectorized_point_offset @@ -333,7 +528,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): cdef intp_t other_dims_offset cdef intp_t row_index - cdef intp_t i cdef intp_t num_rows = self.data_dims[0] if self._discontiguous: # fill with values 0, 1, ..., dimension - 1 @@ -392,7 +586,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): if not self.dim_contiguous[idx]: row_index += ( (self.unraveled_patch_point[idx] // other_dims_offset) % - self.patch_dims_buff[idx] + self.patch_sampled_size[idx] ) * other_dims_offset other_dims_offset //= self.data_dims[idx] @@ -404,6 +598,166 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): proj_mat_indices[proj_i].push_back(vectorized_point) proj_mat_weights[proj_i].push_back(weight) + cdef void sample_proj_vec_v2( + self, + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices, + vector[intp_t]& top_left_seed, + intp_t proj_i, + const intp_t[:] patch_dims, + ) noexcept nogil: + cdef uint32_t* random_state = &self.rand_r_state + + # iterates over the size of the patch + cdef intp_t patch_idx + + # stores how many patches we have iterated so far + cdef intp_t vectorized_patch_offset + cdef intp_t vectorized_point_offset + cdef intp_t vectorized_point + + cdef intp_t dim_idx + + # weights are default to 1 + cdef float32_t weight = 1. + + # XXX: still unsure if it works yet + # XXX: THIS ONLY WORKS FOR THE FIRST DIMENSION THAT IS DISCONTIGUOUS. + cdef intp_t other_dims_offset + cdef intp_t row_index + + cdef intp_t i + cdef intp_t num_rows = self.data_dims[0] + + # iterate over the patch and store the vectorized index + cdef vector[intp_t] index_arr = vector[intp_t](self.ndim) + for dim in range(top_left_seed.size()): + index_arr[dim] = top_left_seed[dim] + + # fisher_yates_shuffle(dim_index_arr, patch_dims[dim], random_state) + + # iterate over the indices of the patch, and get the resulting + # raveled index at each patch-point, resulting in now a 1D vector + # of indices representing each patch-point. + while True: + # Note: we iterate over the last axis first assuming that data + # is stored in a C-contiguous array. + for dim in range(self.ndim - 1, -1, -1): + if self.dim_contiguous[dim]: + index_arr[dim] += 1 + + # check we have not reached the boundaries of the patch. 
If we have not, + # we can break out of the loop and continue iterating in this dimension + if index_arr[dim] < top_left_seed[dim] + patch_dims[dim]: + break + + # if we have reached the boundary, we reset the index and continue + index_arr[dim] = top_left_seed[dim] + else: + random_index = random_indices[dim][r_idx] + # dimension is assumed discontiguous, and thus is randomly chosen + index_arr[dim] = rand_int(0, self.data_dims[dim], random_state) + + discontiguous_index += 1 + + if r_idx < patch_dims[dim]: + idx[dim] = random_indices[dim][r_idx] + r_idx += 1 + break + + r_idx = 0 + # if we have reached the boundary, we reset the index and continue + index_arr[dim] = top_left_seed[dim] + + # if self.dim_contiguous[dim] and + # break + # elif not self.dim_contiguous[dim] and index > patch_dims[dim]: + # break + else: + break + + # # get the vectorized version of this index + # # dim_idx = ravel_multi_index_cython( + # # index_arr, + # # self.data_dims + # # ) + + # proj_mat_indices[proj_i].push_back(vectorized_point) + # proj_mat_weights[proj_i].push_back(weight) + + # idx[dim] = start[dim] + # TODO: iterate correctly over multi-dim array + # for idx in range(len(self.data_dims[dim])): + # unvectorized_idx[dim] = top_left_seed[dim] + idx + # else: + # unvectorized_idx[dim] = top_left_seed[dim] + rand_int(0, patch_dims[dim], random_state) + + # # get the vectorized version of this index + # dim_idx = ravel_multi_index_cython( + # unvectorized_idx, + # self.data_dims + # ) + + # proj_mat_indices[proj_i].push_back(vectorized_point) + # proj_mat_weights[proj_i].push_back(weight) + + + # for patch_idx in range(patch_size): + # # keep track of which dimensions of the patch we have iterated over + # vectorized_patch_offset = 1 + + # # Once the vectorized top-left-seed is unraveled, you can add the patch + # # points in the array structure and compute their vectorized (unraveled) + # # points, which are added to the projection vector + # unravel_index_cython(top_left_patch_seed, self.data_dims, self.unraveled_patch_point) + + # for dim_idx in range(self.ndim): + # # compute the offset from the top-left patch seed based on: + # # 1. the current patch index + # # 2. the patch dimension indexed by `dim_idx` + # # 3. 
and the vectorized patch dimensions that we have seen so far + # # the `vectorized_point_offset` is the offset from the top-left vectorized seed for this dimension + # vectorized_point_offset = (patch_idx // (vectorized_patch_offset)) % patch_dims[dim_idx] + + # # then we compute the actual point in the original data shape + # self.unraveled_patch_point[dim_idx] = self.unraveled_patch_point[dim_idx] + vectorized_point_offset + # vectorized_patch_offset *= patch_dims[dim_idx] + + # # if any dimensions are discontiguous, we want to migrate the entire axis a fixed amount + # # based on the shuffling + # if self._discontiguous is True: + # for dim_idx in range(self.ndim): + # if self.dim_contiguous[dim_idx] is True: + # continue + + # # determine the "row" we are currently on + # # other_dims_offset = 1 + # # for idx in range(dim_idx + 1, self.ndim): + # # other_dims_offset *= self.data_dims[idx] + # # row_index = self.unraveled_patch_point[dim_idx] % other_dims_offset + # # determine the "row" we are currently on + # other_dims_offset = 1 + # for idx in range(dim_idx + 1, self.ndim): + # if not self.dim_contiguous[idx]: + # other_dims_offset *= self.data_dims[idx] + + # row_index = 0 + # for idx in range(dim_idx + 1, self.ndim): + # if not self.dim_contiguous[idx]: + # row_index += ( + # (self.unraveled_patch_point[idx] // other_dims_offset) % + # self.patch_sampled_size[idx] + # ) * other_dims_offset + # other_dims_offset //= self.data_dims[idx] + + # # assign random row index now + # self.unraveled_patch_point[dim_idx] = self._index_patch_buffer[row_index] + + # # ravel the patch point into the original data dimensions + # vectorized_point = ravel_multi_index_cython(self.unraveled_patch_point, self.data_dims) + # proj_mat_indices[proj_i].push_back(vectorized_point) + # proj_mat_weights[proj_i].push_back(weight) + cdef void compute_features_over_samples( self, intp_t start, @@ -411,18 +765,62 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): const intp_t[:] samples, float32_t[:] feature_values, vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[intp_t]* proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + + Parameters + ---------- + start : int + The start index of the samples. + end : int + The end index of the samples. + samples : array-like, shape (n_samples,) + The indices of the samples. + feature_values : array-like, shape (n_samples,) + The pre-allocated buffer to store the feature values. + proj_vec_weights : array-like, shape (max_features,) + The weights of the projection vector. + proj_vec_indices : array-like, shape (max_features,) + The indices of the projection vector. This will only store the vectorized + index of the top-left-seed. 
""" cdef intp_t idx, jdx # initialize feature weight to normalize across patch cdef float32_t patch_weight + if proj_vec_indices.size() != 1: + with gil: + raise ValueError("proj_vec_indices should only have one element corresponding to the top left seed.") + + # patch dimensions + cdef intp_t[:] patch_dims = self.patch_sampled_size + cdef vector[intp_t] top_left_seed = vector[intp_t](self.ndim) + + cdef intp_t volume_of_patch = 1 + # Calculate the total number of Cartesian products + for i in range(self.ndim): + volume_of_patch *= len(self.patch_nd_indices[i]) + + # create a buffer to store the raveled indices of the patch, which will be used + # to compute the relevant feature values + cdef vector[intp_t] raveled_patch_indices = vector[intp_t](volume_of_patch) + for jdx in range(0, proj_vec_indices.size()): + # get the index of the top-left patch + top_left_patch_index = deref(proj_vec_indices)[jdx] + + # compute the raveled index of all the points in the patch + compute_vectorized_indices_from_cartesian( + top_left_patch_index, + self.patch_nd_indices, + self.data_dims, + raveled_patch_indices + ) + # Compute linear combination of features and then # sort samples according to the feature values. for idx in range(start, end): @@ -430,15 +828,17 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # initialize the feature value to 0 feature_values[idx] = 0 - for jdx in range(0, proj_vec_indices.size()): + for kdx in range(raveled_patch_indices.size()): + feature_index = raveled_patch_indices[kdx] + feature_values[idx] += self.X[ - samples[idx], deref(proj_vec_indices)[jdx] + samples[idx], feature_index ] * deref(proj_vec_weights)[jdx] - if self.feature_weight is not None: - # gets the feature weight for this specific column from X - # the default of feature_weights[i] is (1/n_features) for all i - patch_weight += self.feature_weight[samples[idx], deref(proj_vec_indices)[jdx]] + if self.feature_weight is not None: + # gets the feature weight for this specific column from X + # the default of feature_weights[i] is (1/n_features) for all i + patch_weight += self.feature_weight[samples[idx], deref(proj_vec_indices)[jdx]] if self.feature_weight is not None: feature_values[idx] /= patch_weight @@ -448,7 +848,7 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): """A class to expose a Python interface for testing.""" cpdef sample_top_left_seed_cpdef(self): top_left_patch_seed, patch_size = self.sample_top_left_seed() - patch_dims = np.array(self.patch_dims_buff, dtype=np.intp) + patch_dims = np.array(self.patch_sampled_size, dtype=np.intp) return top_left_patch_seed, patch_size, patch_dims cpdef sample_projection_vector( @@ -467,7 +867,6 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): proj_mat_weights, proj_mat_indices, proj_i, - patch_size, top_left_patch_seed, patch_dims ) diff --git a/treeple/tree/tests/meson.build b/treeple/tree/tests/meson.build index b0ccb9b5e..cbdd38f71 100644 --- a/treeple/tree/tests/meson.build +++ b/treeple/tree/tests/meson.build @@ -8,6 +8,7 @@ python_sources = [ 'test_unsupervised_tree.py', 'test_multiview.py', 'test_kernel.py', + 'test_manifold.py', ] py.install_sources( diff --git a/treeple/tree/tests/test_manifold.py b/treeple/tree/tests/test_manifold.py new file mode 100644 index 000000000..f352cc1e3 --- /dev/null +++ b/treeple/tree/tests/test_manifold.py @@ -0,0 +1,89 @@ +import joblib +import numpy as np +import pytest +from sklearn import datasets +from numpy.testing import assert_almost_equal, assert_array_equal +from sklearn.base import 
is_classifier +from sklearn.datasets import load_iris, make_blobs +from sklearn.tree._tree import TREE_LEAF + +from treeple.tree import ( + ExtraObliqueDecisionTreeClassifier, + ExtraObliqueDecisionTreeRegressor, + ObliqueDecisionTreeClassifier, + ObliqueDecisionTreeRegressor, + PatchObliqueDecisionTreeClassifier, + PatchObliqueDecisionTreeRegressor, + UnsupervisedDecisionTree, + UnsupervisedObliqueDecisionTree, +) + +ALL_TREES = [ + ExtraObliqueDecisionTreeRegressor, + ObliqueDecisionTreeRegressor, + PatchObliqueDecisionTreeRegressor, + ExtraObliqueDecisionTreeClassifier, + ObliqueDecisionTreeClassifier, + PatchObliqueDecisionTreeClassifier, + UnsupervisedDecisionTree, + UnsupervisedObliqueDecisionTree, +] + +rng = np.random.RandomState(1) + +# load digits dataset and randomly permute it +digits = datasets.load_digits() +perm = rng.permutation(digits.target.size) +digits.data = digits.data[perm] +digits.target = digits.target[perm] + + + +def test_splitters(): + """Test that splitters are picklable.""" + X, y = digits.data, digits.target + X = X.astype(np.float32) + sample_weight = np.ones(len(y), dtype=np.float64).squeeze() + y = y.reshape(-1, 1).astype(np.float64) + missing_values_in_feature_mask = None + + print(X.shape, y.shape) + from treeple._lib.sklearn.tree._criterion import Gini + from treeple.tree.manifold._morf_splitter import BestPatchSplitterTester + + criterion = Gini(1, np.array((0, 1), dtype=np.intp)) + max_features = 6 + min_samples_leaf = 1 + min_weight_leaf = 0.0 + monotonic_cst = np.array([1, 1, 1, 1, 1, 1], dtype=np.int8) + random_state = np.random.RandomState(100) + boundary = None + feature_weight = None + min_patch_dims = np.array((1, 1), dtype=np.intp) + max_patch_dims = np.array((3, 1), dtype=np.intp) + dim_contiguous = np.array((True, True)) + data_dims = np.array((8, 8), dtype=np.intp) + + feature_combinations = 1.5 + + splitter = BestPatchSplitterTester( + criterion, + max_features, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + feature_combinations, + min_patch_dims, + max_patch_dims, + dim_contiguous, + data_dims, + boundary, + feature_weight, + ) + splitter.init_test( + X, + y, + sample_weight, + ) + assert_array_equal(splitter.X_reshaped.reshape(-1, 64), X) From 3f5366fea6f12e702ccc19e5ed2acabd8f5ee872 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Aug 2024 23:26:15 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- treeple/tree/manifold/_morf_splitter.pyx | 11 +++++++++-- treeple/tree/tests/test_manifold.py | 8 +------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/treeple/tree/manifold/_morf_splitter.pyx b/treeple/tree/manifold/_morf_splitter.pyx index f335999c4..dc4dffdf5 100644 --- a/treeple/tree/manifold/_morf_splitter.pyx +++ b/treeple/tree/manifold/_morf_splitter.pyx @@ -13,10 +13,17 @@ from cython.operator cimport dereference as deref from libcpp.vector cimport vector from ..._lib.sklearn.tree._criterion cimport Criterion -from ..._lib.sklearn.tree._utils cimport rand_int from ..._lib.sklearn.tree._tree cimport ParentInfo -from .._utils cimport init_2dmemoryview, ravel_multi_index_cython, unravel_index_cython, fisher_yates_shuffle, compute_vectorized_indices_from_cartesian +from ..._lib.sklearn.tree._utils cimport rand_int from .._sklearn_splitter cimport sort +from .._utils cimport ( + compute_vectorized_indices_from_cartesian, + fisher_yates_shuffle, 
+ init_2dmemoryview, + ravel_multi_index_cython, + unravel_index_cython, +) + cdef float64_t INFINITY = np.inf diff --git a/treeple/tree/tests/test_manifold.py b/treeple/tree/tests/test_manifold.py index f352cc1e3..6f580a957 100644 --- a/treeple/tree/tests/test_manifold.py +++ b/treeple/tree/tests/test_manifold.py @@ -1,11 +1,6 @@ -import joblib import numpy as np -import pytest +from numpy.testing import assert_array_equal from sklearn import datasets -from numpy.testing import assert_almost_equal, assert_array_equal -from sklearn.base import is_classifier -from sklearn.datasets import load_iris, make_blobs -from sklearn.tree._tree import TREE_LEAF from treeple.tree import ( ExtraObliqueDecisionTreeClassifier, @@ -38,7 +33,6 @@ digits.target = digits.target[perm] - def test_splitters(): """Test that splitters are picklable.""" X, y = digits.data, digits.target