diff --git a/examples/manifold/README.txt b/examples/manifold/README.txt
new file mode 100644
index 000000000..8b1808911
--- /dev/null
+++ b/examples/manifold/README.txt
@@ -0,0 +1,6 @@
+.. _manifold_examples:
+
+Manifold learning with decision trees
+-------------------------------------
+
+Examples demonstrating manifold learning using random forest variants.
diff --git a/examples/manifold/plot_kernel_decision_tree.py b/examples/manifold/plot_kernel_decision_tree.py
new file mode 100644
index 000000000..35abb9740
--- /dev/null
+++ b/examples/manifold/plot_kernel_decision_tree.py
@@ -0,0 +1,106 @@
+"""
+======================================
+Custom Kernel Decision Tree Classifier
+======================================
+
+This example shows how to build a manifold oblique decision tree classifier using
+a custom, user-defined kernel/filter library, such as Gaussian or Gabor kernels.
+
+The example demonstrates the classifier on a 2D dataset of structured images:
+a downsampled version of MNIST, where each sample is a 28x28 image. Each image is
+downsampled to 14x14 and then flattened into a 196-dimensional vector. The dataset
+is then split into a training and a testing set.
+
+See :ref:`sphx_glr_auto_examples_plot_projection_matrices` for more information on
+projection matrices and the way they can be sampled.
+"""
+
+import matplotlib.pyplot as plt
+
+# %%
+# Importing the necessary modules
+from sklearn.datasets import fetch_openml
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+
+from treeple.tree import KernelDecisionTreeClassifier
+
+# %%
+# Load the Dataset
+# ----------------
+# We need to load the dataset and split it into training and testing sets.
+
+# Load the dataset as a NumPy array (``as_frame=False``) so that it can be reshaped
+X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
+
+# Downsample the dataset from 28x28 to 14x14 and flatten to 196 features
+X = X.reshape((-1, 28, 28))
+X = X[:, ::2, ::2]
+X = X.reshape((-1, 196))
+
+# Split the dataset into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# %%
+# Setting up the Custom Kernel Decision Tree Model
+# ------------------------------------------------
+# To set up the custom kernel decision tree model, we need to define the typical
+# hyperparameters of a decision tree classifier, such as the maximum depth of the
+# tree and the minimum number of samples required to split an internal node. For
+# the kernel decision tree model, we also need to specify the kernel family and
+# how many kernels to sample into the library.
+
+max_depth = 10
+min_samples_split = 2
+
+# Next, we define the hyperparameters for the custom kernels that we will use.
+# Here we sample a library of Gaussian kernels; ``n_kernels`` controls how many
+# kernels are drawn into the library.
+kernel = "gaussian"
+n_kernels = 500
+
+# We can then fit the custom kernel decision tree model to the training set:
+clf = KernelDecisionTreeClassifier(
+    max_depth=max_depth,
+    min_samples_split=min_samples_split,
+    data_dims=(14, 14),
+    min_patch_dims=(1, 1),
+    max_patch_dims=(14, 14),
+    dim_contiguous=(True, True),
+    boundary=None,
+    kernel=kernel,
+    n_kernels=n_kernels,
+    store_kernel_library=True,
+)
+
+# Fit the decision tree classifier using the custom kernel
+clf.fit(X_train, y_train)
+
+# %%
+# Evaluating the Custom Kernel Decision Tree Model
+# ------------------------------------------------
+# To evaluate the custom kernel decision tree model, we can use the testing set.
+# We can also inspect the important kernels that the tree selected. + +# Predict the labels for the testing set +y_pred = clf.predict(X_test) + +# Compute the accuracy score +accuracy = accuracy_score(y_test, y_pred) + +print(f"Kernel decision tree model obtained an accuracy of {accuracy} on MNIST.") + +# Get the important kernels from the decision tree classifier +important_kernels = clf.kernel_arr_ +kernel_dims = clf.kernel_dims_ +kernel_params = clf.kernel_params_ +kernel_library = clf.kernel_library_ + +# Plot the important kernels +fig, axes = plt.subplots( + nrows=len(important_kernels), ncols=1, figsize=(6, 4 * len(important_kernels)) +) +for i, kernel in enumerate(important_kernels): + axes[i].imshow(kernel, cmap="gray") + axes[i].set_title("Kernel {}".format(i + 1)) +plt.show() diff --git a/examples/splitters/plot_kernel_splitters.py b/examples/splitters/plot_kernel_splitters.py new file mode 100644 index 000000000..d95a388e6 --- /dev/null +++ b/examples/splitters/plot_kernel_splitters.py @@ -0,0 +1,77 @@ +""" +==================== +Random Kernel Splits +==================== + +This example shows how to build a manifold oblique decision tree classifier using +a custom set of user-defined kernel/filter library, such as the Gaussian, or Gabor +kernels. + +The example demonstrates superior performance on a 2D dataset with structured images +as samples. The dataset is the downsampled MNIST dataset, where each sample is a +28x28 image. The dataset is downsampled to 14x14, and then flattened to a 196 +dimensional vector. The dataset is then split into a training and testing set. + +See :ref:`sphx_glr_auto_examples_plot_projection_matrices` for more information on +projection matrices and the way they can be sampled. +""" + +import matplotlib.pyplot as plt +import numpy as np +from scipy.sparse import csr_matrix + +from treeple.tree.manifold._kernel_splitter import Kernel2D + +# %% +# Create a synthetic image +image_height, image_width = 50, 50 +image = np.random.rand(image_height, image_width).astype(np.float32) + +# Generate a Gaussian kernel (example) +kernel_size = 7 +x = np.linspace(-2, 2, kernel_size) +y = np.linspace(-2, 2, kernel_size) +x, y = np.meshgrid(x, y) +kernel = np.exp(-(x**2 + y**2)) +kernel = kernel / kernel.sum() # Normalize the kernel + +# Vectorize and create a sparse CSR matrix +kernel_vector = kernel.flatten().astype(np.float32) +kernel_indices = np.arange(kernel_vector.size) +kernel_indptr = np.array([0, kernel_vector.size]) +kernel_csr = csr_matrix( + (kernel_vector, kernel_indices, kernel_indptr), shape=(1, kernel_vector.size) +) + +# %% +# Initialize the Kernel2D class +kernel_sizes = np.array([kernel_size], dtype=np.intp) +random_state = np.random.RandomState(42) +print(kernel_csr.dtype, kernel_sizes.dtype, np.intp) +kernel_2d = Kernel2D(kernel_csr, kernel_sizes, random_state) + +# Apply the kernel to the image +result_value = kernel_2d.apply_kernel_py(image, 0) + +# %% +# Plot the original image, kernel, and result +fig, axs = plt.subplots(1, 3, figsize=(15, 5)) + +axs[0].imshow(image, cmap="gray") +axs[0].set_title("Original Image") + +axs[1].imshow(kernel, cmap="viridis") +axs[1].set_title("Gaussian Kernel") + +# Highlight the region where the kernel was applied +start_x, start_y = random_state.randint(0, image_width - kernel_size + 1), random_state.randint( + 0, image_height - kernel_size + 1 +) +image_with_kernel = image.copy() +image_with_kernel[start_y : start_y + kernel_size, start_x : start_x + kernel_size] *= kernel + 
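+# NOTE: the highlighted region is only illustrative. ``apply_kernel_py`` samples
+# its own patch location internally and does not expose it, so the region drawn
+# here is an assumed location, not the exact patch that was used. Conceptually,
+# the returned value is a single correlation of the kernel with one image patch;
+# a minimal NumPy sketch of that computation would be:
+#
+#     patch = image[start_y : start_y + kernel_size, start_x : start_x + kernel_size]
+#     approx_value = float((patch * kernel).sum())
+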
+axs[2].imshow(image_with_kernel, cmap="gray") +axs[2].set_title(f"Result: {result_value:.4f}") + +plt.tight_layout() +plt.show() diff --git a/pyproject.toml b/pyproject.toml index c0a50d95a..b3acf7f84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -195,6 +195,7 @@ ignore_missing_imports = true no_site_packages = true exclude = [ 'treeple/_lib/', + 'treeple/_lib/*', 'benchmarks_nonasv/' ] diff --git a/treeple/tree/__init__.py b/treeple/tree/__init__.py index 797338ac3..a7425c213 100644 --- a/treeple/tree/__init__.py +++ b/treeple/tree/__init__.py @@ -15,6 +15,7 @@ UnsupervisedObliqueDecisionTree, ) from ._honest_tree import HonestTreeClassifier +from ._kernel import KernelDecisionTreeClassifier from ._multiview import MultiViewDecisionTreeClassifier from ._neighbors import compute_forest_similarity_matrix @@ -34,4 +35,5 @@ "ExtraTreeClassifier", "ExtraTreeRegressor", "MultiViewDecisionTreeClassifier", + "KernelDecisionTreeClassifier", ] diff --git a/treeple/tree/_kernel.py b/treeple/tree/_kernel.py new file mode 100644 index 000000000..7b7347a3b --- /dev/null +++ b/treeple/tree/_kernel.py @@ -0,0 +1,595 @@ +import copy + +import numpy as np +from scipy.sparse import csr_matrix, issparse + +from .._lib.sklearn.tree._criterion import BaseCriterion +from .._lib.sklearn.tree._tree import BestFirstTreeBuilder, DepthFirstTreeBuilder +from ._classes import CRITERIA_CLF, PATCH_DENSE_SPLITTERS, PatchObliqueDecisionTreeClassifier +from ._oblique_tree import ObliqueTree + + +def gaussian_kernel(shape, sigma=1.0, mu=0.0): + """N-dimensional gaussian kernel for the given shape. + + See: https://gist.github.com/liob/e784775e882b83749cb3bbcef480576e + """ + m = np.meshgrid(*[np.linspace(-1, 1, s) for s in shape]) + d = np.sqrt(np.sum([x * x for x in m], axis=0)) + g = np.exp(-((d - mu) ** 2 / (2.0 * sigma**2))) + return g / np.sum(g) + + +def sample_gaussian_kernels(n_kernels, min_size, max_size, mu=0.0, sigma=1.0): + """ + Sample a set of Gaussian kernels and arrange them into a sparse kernel matrix. + + Parameters: + ----------- + n_kernels : int + The number of Gaussian kernels to sample. + min_size : int + The minimum size of the kernel (inclusive). + max_size : int + The maximum size of the kernel (inclusive). + mu : float or tuple of floats + The mean(s) of the Gaussian distribution. If a tuple, random mean values are sampled. + sigma : float or tuple of floats + The standard deviation(s) of the Gaussian distribution. If a tuple, random sigma values are sampled. + + Returns: + -------- + kernel_matrix : csr_matrix + The sparse matrix containing vectorized kernels. + kernel_params : dict of arrays + The parameters of the kernels that were sampled with keys: + - 'size': the size of each kernel; since the kernels are 2D square matrices + this is the side length of the kernel. 
+ - 'mu': the mean of each kernel + - 'sigma': the standard deviation + """ + data = [] + indices = [] + indptr = [0] + kernel_params = { + "size": np.zeros(n_kernels, dtype=np.intp), + "mu": np.zeros(n_kernels), + "sigma": np.zeros(n_kernels), + } + + for i in range(n_kernels): + # Sample the size of the kernel + size = np.random.randint(min_size, max_size + 1) + + # Sample mu and sigma if they are tuples + if isinstance(mu, tuple): + mu_sample = np.random.uniform(mu[0], mu[1]) + else: + mu_sample = mu + + if isinstance(sigma, tuple): + sigma_sample = np.random.uniform(sigma[0], sigma[1]) + else: + sigma_sample = sigma + + # Create a meshgrid for the kernel + x = np.linspace(-1, 1, size) + y = np.linspace(-1, 1, size) + X, Y = np.meshgrid(x, y) + + # Create the Gaussian kernel + kernel = np.exp(-((X - mu_sample) ** 2 + (Y - mu_sample) ** 2) / (2 * sigma_sample**2)) + + # Vectorize the kernel and store it in the sparse matrix format + kernel_vectorized = kernel.flatten() + data.extend(kernel_vectorized) + indices.extend(range(size * size)) + indptr.append(len(data)) + + # Store the kernel parameters + kernel_params["size"][i] = size + kernel_params["mu"][i] = mu_sample + kernel_params["sigma"][i] = sigma_sample + + # Convert lists to the appropriate format + data = np.array(data) + indices = np.array(indices) + indptr = np.array(indptr) + + # Create a sparse matrix + kernel_matrix = csr_matrix((data, indices, indptr), shape=(n_kernels, max_size * max_size)) + + return kernel_matrix, kernel_params + + +class KernelDecisionTreeClassifier(PatchObliqueDecisionTreeClassifier): + """Oblique decision tree classifier over data patches combined with Gaussian kernels. + + Parameters + ---------- + criterion : {"gini", "entropy"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "entropy" for the information gain. + splitter : {"best", "random"}, default="best" + The strategy used to choose the split at each node. Supported + strategies are "best" to choose the best split and "random" to choose + the best random split. + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. 
+ max_features : int, float or {"auto", "sqrt", "log2"}, default=None + The number of features to consider when looking for the best split: + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `int(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + Note: Compared to axis-aligned Random Forests, one can set + max_features to a number greater then ``n_features``. + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. The features are always + randomly permuted at each split, even if ``splitter`` is set to + ``"best"``. When ``max_features < n_features``, the algorithm will + select ``max_features`` at random at each split before finding the best + split among them. But the best found split may vary across different + runs, even if ``max_features=n_features``. That is the case, if the + improvement of the criterion is identical for several splits and one + split has to be selected at random. To obtain a deterministic behaviour + during fitting, ``random_state`` has to be fixed to an integer. + See :term:`Glossary ` for details. + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + The weighted impurity decrease equation is the following:: + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + class_weight : dict, list of dict or "balanced", default=None + Weights associated with classes in the form ``{class_label: weight}``. + If None, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + For multi-output, the weights of each column of y will be multiplied. + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + min_patch_dim : array-like, optional + The minimum dimensions of a patch, by default 1 along all dimensions. 
+ max_patch_dim : array-like, optional + The maximum dimensions of a patch, by default 1 along all dimensions. + dim_contiguous : array-like of bool, optional + Whether or not each patch is sampled contiguously along this dimension. + data_dims : array-like, optional + The presumed dimensions of the un-vectorized feature vector, by default + will be a 1D vector with (1, n_features) shape. + boundary : optional, str {'wrap'} + The boundary condition to use when sampling patches, by default None. + 'wrap' corresponds to the boundary condition as is in numpy and scipy. + feature_weight : array-like of shape (n_samples,n_features,), default=None + Feature weights. If None, then features are equally weighted as is. + If provided, then the feature weights are used to weight the + patches that are generated. The feature weights are used + as follows: for every patch that is sampled, the feature weights over + the entire patch is summed and normalizes the patch. + kernel : str {'gaussian', 'uniform'}, default='gaussian' + The kernel to use. + n_kernels : int, optional + The number of different kernels to generate. This number should be very high + as this generates kernels + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) or list of ndarray + The classes labels (single output problem), + or a list of arrays of class labels (multi-output problem). + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance [4]_. + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + max_features_ : int + The inferred value of max_features. + n_classes_ : int or list of int + The number of classes (for single output problems), + or a list containing the number of classes for each + output (for multi-output problems). + n_features_in_ : int + Number of features seen during :term:`fit`. + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + n_outputs_ : int + The number of outputs when ``fit`` is performed. + tree_ : Tree instance + The underlying Tree object. Please refer to + ``help(sklearn.tree._tree.Tree)`` for + attributes of Tree object. + min_patch_dims_ : array-like + The minimum dimensions of a patch. + max_patch_dims_ : array-like + The maximum dimensions of a patch. + data_dims_ : array-like + The presumed dimensions of the un-vectorized feature vector. + kernel_arr_ : list of length (n_nodes,) of array-like of shape (patch_dims,) + The kernel array that is applied to the patches for this tree. + The order is in the same order in which the tree traverses the nodes. + kernel_params_ : list of length (n_nodes,) + The parameters of the kernel that is applied to the patches for this tree. + The order is in the same order in which the tree traverses the nodes. + kernel_library_ : array-like of shape (n_kernels,), optional + The library of kernels that was chosen from the patches for this tree. + Only stored if ``store_kernel_library`` is set to True. + kernel_library_params_ : list of length (n_nodes,), optional + The parameters of the kernels that was chosen from the patches for this tree. 
+ The order is in the same order in which the tree traverses the nodes. + Only stored if ``store_kernel_library`` is set to True. + References + ---------- + .. footbibliography:: + """ + + _parameter_constraints: dict = {**PatchObliqueDecisionTreeClassifier._parameter_constraints} + _parameter_constraints.update( + { + "kernel": ["str"], + "n_kernels": [int, None], + "store_kernel_library": [bool], + } + ) + + def __init__( + self, + *, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + class_weight=None, + min_patch_dims=None, + max_patch_dims=None, + dim_contiguous=None, + data_dims=None, + boundary=None, + feature_weight=None, + kernel="gaussian", + n_kernels=None, + store_kernel_library=False, + ): + super().__init__( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + max_leaf_nodes=max_leaf_nodes, + class_weight=class_weight, + random_state=random_state, + min_impurity_decrease=min_impurity_decrease, + ) + + self.min_patch_dims = min_patch_dims + self.max_patch_dims = max_patch_dims + self.dim_contiguous = dim_contiguous + self.data_dims = data_dims + self.boundary = boundary + self.feature_weight = feature_weight + + self.kernel = kernel + self.n_kernel = n_kernels + self.store_kernel_library = store_kernel_library + + def _build_tree( + self, + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + min_samples_leaf : int or float + The minimum number of samples required to be at a leaf node. + min_weight_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights. + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. 
+ """ + # Build tree + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + splitter = self.splitter + if issparse(X): + raise ValueError( + "Sparse input is not supported for oblique trees. " + "Please convert your data to a dense array." + ) + else: + PATCH_SPLITTERS = PATCH_DENSE_SPLITTERS + + # compute user-defined Kernel library before + kernel_library, kernel_dims, kernel_params = self._sample_kernel_library(X, y) + + # Note: users cannot define a splitter themselves + splitter = PATCH_SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + self.min_patch_dims_, + self.max_patch_dims_, + self.dim_contiguous_, + self.data_dims_, + self.boundary, + self.feature_weight, + kernel_library, + ) + + self.tree_ = ObliqueTree(self.n_features_in_, self.n_classes_, self.n_outputs_) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + ) + else: + builder = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + ) + + builder.build(self.tree_, X, y, sample_weight) + + # Set some attributes based on what kernel indices were used in each tree + # - construct a tree-nodes like array and set it as a Python attribute, so it is + # exposed to the Python interface + # - use the Cython tree.feature array to store the index of the dictionary library + # that was used to split at each node + kernel_idx_chosen = self.tree_.feature + kernel_lib_chosen = kernel_library[kernel_idx_chosen] + kernel_params_chosen = kernel_params[kernel_idx_chosen] + self.kernel_arr_ = kernel_lib_chosen + self.kernel_dims_ = kernel_dims[kernel_idx_chosen] + self.kernel_params_ = kernel_params_chosen + + if self.store_kernel_library: + self.kernel_library_ = kernel_library + self.kernel_idx_chosen_ = kernel_idx_chosen + + if self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + + def _sample_kernel_library(self, X, y, sample_weight): + """Samples the dictionary library that is sampled from in the random forest. + A kernel can either be applied with the boundaries of the image in mind, such that + the patch can be uniformly applied across all indices of rows of ``X_structured``. This + is equivalent to passing in ``boundary = 'wrap'``. Alternatively, a kernel can be sampled, + such that the patch is always contained within the boundary of the image. This is equivalent + to passing in ``boundary = None``. + Parameters + ---------- + X : _type_ + _description_ + y : _type_ + _description_ + sample_weight : _type_ + _description_ + Returns + ------- + kernel_library : array-like of shape (n_kernels, n_features) + The dictionary that will be sampled from the random forest. Non-zero entries indicate + where the kernel is applied. This is a n-dimensional kernel matrix that is vectorized + and placed into the original ``X`` data dimensions of (n_dims,). + kernel_dims : list of (n_kernels,) length + The dimensions of each kernel that is sampled. 
For example, (n_dims_x, n_dims_y) in
+            the first element indicates that the first kernel has a shape of (n_dims_x, n_dims_y).
+            This can be used to then place where the top-left seed of each kernel is applied.
+        kernel_params : list of (n_kernels,) length
+            A list of dictionaries representing the parameters of each kernel. This is used to
+            keep track of which kernels and parameterizations were valuable when used in the
+            random forest.
+        """
+        raise NotImplementedError("This method should be implemented in a child class.")
+
+
+class GaussianKernelDecisionTreeClassifier(KernelDecisionTreeClassifier):
+    _parameter_constraints = {
+        **KernelDecisionTreeClassifier._parameter_constraints,
+        # "mu_bounds": ['array-like'],
+        # "sigma_bounds": ['array-like'],
+    }
+
+    def __init__(
+        self,
+        *,
+        criterion="gini",
+        splitter="best",
+        max_depth=None,
+        min_samples_split=2,
+        min_samples_leaf=1,
+        min_weight_fraction_leaf=0,
+        max_features=None,
+        random_state=None,
+        max_leaf_nodes=None,
+        min_impurity_decrease=0,
+        class_weight=None,
+        min_patch_dims=None,
+        max_patch_dims=None,
+        dim_contiguous=None,
+        data_dims=None,
+        boundary=None,
+        feature_weight=None,
+        n_kernels=None,
+        store_kernel_library=False,
+        mu_bounds=(0, 1),
+        sigma_bounds=(0, 1),
+    ):
+        super().__init__(
+            criterion=criterion,
+            splitter=splitter,
+            max_depth=max_depth,
+            min_samples_split=min_samples_split,
+            min_samples_leaf=min_samples_leaf,
+            min_weight_fraction_leaf=min_weight_fraction_leaf,
+            max_features=max_features,
+            random_state=random_state,
+            max_leaf_nodes=max_leaf_nodes,
+            min_impurity_decrease=min_impurity_decrease,
+            class_weight=class_weight,
+            min_patch_dims=min_patch_dims,
+            max_patch_dims=max_patch_dims,
+            dim_contiguous=dim_contiguous,
+            data_dims=data_dims,
+            boundary=boundary,
+            feature_weight=feature_weight,
+            kernel="gaussian",
+            n_kernels=n_kernels,
+            store_kernel_library=store_kernel_library,
+        )
+        self.mu_bounds = mu_bounds
+        self.sigma_bounds = sigma_bounds
+
+    def _sample_kernel_library(self, X, y, sample_weight):
+        """Samples a Gaussian kernel library."""
+        rng = np.random.default_rng(self.random_state)
+        kernel_library = []
+        kernel_params = []
+        kernel_dims = []
+
+        # Sample the kernel library
+        ndim = len(self.data_dims_)
+        for _ in range(self.n_kernels):
+            patch_shape = []
+            for idx in range(ndim):
+                # Note: By constraining max patch height/width to be at least the min
+                # patch height/width this ensures that the minimum value of
+                # patch_height and patch_width is 1
+                patch_dim = rng.integers(self.min_patch_dims_[idx], self.max_patch_dims_[idx] + 1)
+
+                # sample the possible patch shape given input parameters
+                if self.boundary == "wrap":
+                    # add circular boundary conditions
+                    delta_patch_dim = self.data_dims_[idx] + 2 * (patch_dim - 1)
+
+                    # sample the top left index for this dimension
+                    top_left_patch_seed = rng.integers(0, delta_patch_dim)
+
+                    # resample the patch dimension due to padding
+                    dim = top_left_patch_seed % delta_patch_dim
+
+                    # resample the patch dimension due to padding
+                    patch_dim = min(
+                        patch_dim, min(dim + 1, self.data_dims_[idx] + patch_dim - dim - 1)
+                    )
+
+                patch_shape.append(patch_dim)
+
+            patch_shape = np.array(patch_shape)
+
+            # sample the sigma and mu parameters from their respective bounds
+            sigma = rng.uniform(low=self.sigma_bounds[0], high=self.sigma_bounds[1])
+            mu = rng.uniform(low=self.mu_bounds[0], high=self.mu_bounds[1])
+
+            kernel = gaussian_kernel(shape=patch_shape, sigma=sigma, mu=mu)
+
+            kernel_dims.append(kernel.shape)
+            kernel_library.append(kernel)
+            kernel_params.append({"shape":
patch_shape, "sigma": sigma, "mu": mu}) + + return kernel_library, kernel_dims, kernel_params diff --git a/treeple/tree/_oblique_splitter.pxd b/treeple/tree/_oblique_splitter.pxd index 65ca16e14..1f2805a2b 100644 --- a/treeple/tree/_oblique_splitter.pxd +++ b/treeple/tree/_oblique_splitter.pxd @@ -59,6 +59,12 @@ cdef class BaseObliqueSplitter(Splitter): vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil + cdef void sample_proj_vec( + self, + vector[float32_t]& proj_vec_weights, + vector[intp_t]& proj_vec_indices + ) noexcept nogil + # Redefined here since the new logic requires calling sample_proj_mat cdef int node_reset( self, diff --git a/treeple/tree/_oblique_splitter.pyx b/treeple/tree/_oblique_splitter.pyx index 0cceac664..a65ae19df 100644 --- a/treeple/tree/_oblique_splitter.pyx +++ b/treeple/tree/_oblique_splitter.pyx @@ -98,7 +98,7 @@ cdef class BaseObliqueSplitter(Splitter): const intp_t[:] samples, float32_t[:] feature_values, vector[float32_t]* proj_vec_weights, # weights of the vector (n_non_zeros,) - vector[intp_t]* proj_vec_indices # indices of the features (n_non_zeros,) + vector[intp_t]* proj_vec_indices # indices of the features (n_non_zeros,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. @@ -121,6 +121,12 @@ cdef class BaseObliqueSplitter(Splitter): feature_values[idx] = 0.0 feature_values[idx] += self.X[samples[idx], col_idx] * col_weight + cdef void sample_proj_vec( + self, + vector[float32_t]& proj_vec_weights, + vector[intp_t]& proj_vec_indices + ) noexcept nogil: + pass cdef class ObliqueSplitter(BaseObliqueSplitter): def __cinit__( @@ -202,6 +208,43 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): # self.feature_weights = np.ones((self.n_features,), dtype=float32_t) / self.n_features return 0 + cdef void sample_proj_vec( + self, + vector[float32_t]& proj_vec_weights, + vector[intp_t]& proj_vec_indices + ) noexcept nogil: + cdef intp_t n_features = self.n_features + cdef intp_t n_non_zeros = self.n_non_zeros + cdef uint32_t* random_state = &self.rand_r_state + + cdef intp_t i, feat_i, proj_i, rand_vec_index + cdef float32_t weight + + # construct an array to sample from mTry x n_features set of indices + cdef intp_t[::1] indices_to_sample = self.indices_to_sample + cdef intp_t grid_size = self.max_features * self.n_features + + # shuffle indices over the 2D grid to sample using Fisher-Yates + fisher_yates_shuffle(indices_to_sample, grid_size, random_state) + + # sample 'n_non_zeros' in a mtry X n_features projection matrix + # which consists of +/- 1's chosen at a 1/2s rate + for i in range(0, n_non_zeros): + # get the next index from the shuffled index array + rand_vec_index = indices_to_sample[i] + + # get the projection index (i.e. row of the projection matrix) and + # feature index (i.e. 
column of the projection matrix) + proj_i = rand_vec_index // n_features + feat_i = rand_vec_index % n_features + + # sample a random weight + weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 + + proj_vec_indices[proj_i].push_back(feat_i) # Store index of nonzero + proj_vec_weights[proj_i].push_back(weight) # Store weight of nonzero + + cdef void sample_proj_mat( self, vector[vector[float32_t]]& proj_mat_weights, @@ -322,6 +365,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # XXX: 'feature' is not actually used in oblique split records # Just indicates which split was sampled current_split.feature = feat_i + + # sample the projection vector current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] diff --git a/treeple/tree/_utils.pxd b/treeple/tree/_utils.pxd index ba2707791..8db48e8a1 100644 --- a/treeple/tree/_utils.pxd +++ b/treeple/tree/_utils.pxd @@ -6,6 +6,8 @@ cimport numpy as cnp cnp.import_array() +from libcpp.vector cimport vector + from .._lib.sklearn.tree._splitter cimport SplitRecord from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t @@ -22,7 +24,7 @@ cdef void fisher_yates_shuffle( ) noexcept nogil -cdef int rand_weighted_binary( +cdef intp_t rand_weighted_binary( float64_t p0, uint32_t* random_state ) noexcept nogil @@ -47,3 +49,15 @@ cdef intp_t ravel_multi_index_cython( vector_or_memview coords, const intp_t[:] shape ) noexcept nogil + +cdef void compute_vectorized_indices_from_cartesian( + intp_t base_index, + vector[vector[intp_t]]& index_arrays, + const intp_t[:] data_dims, + vector[intp_t]& result +) noexcept nogil + +cdef memoryview[float32_t, ndim=3] init_2dmemoryview( + cnp.ndarray array, + const intp_t[:] data_dims +) diff --git a/treeple/tree/_utils.pyx b/treeple/tree/_utils.pyx index 7ce48977b..b85948c03 100644 --- a/treeple/tree/_utils.pyx +++ b/treeple/tree/_utils.pyx @@ -5,6 +5,8 @@ # cython: wraparound=False # cython: initializedcheck=False +from libcpp.vector cimport vector + import numpy as np cimport numpy as cnp @@ -20,6 +22,7 @@ cdef inline void fisher_yates_shuffle( uint32_t* random_state, ) noexcept nogil: """Shuffle the indices in place using the Fisher-Yates algorithm. + Parameters ---------- indices_to_sample : A C++ vector or 1D memoryview @@ -41,7 +44,7 @@ cdef inline void fisher_yates_shuffle( indices_to_sample[i], indices_to_sample[j] -cdef inline int rand_weighted_binary( +cdef inline intp_t rand_weighted_binary( float64_t p0, uint32_t* random_state ) noexcept nogil: @@ -61,7 +64,8 @@ cdef inline int rand_weighted_binary( else: return 1 -cpdef unravel_index( + +cdef inline void unravel_index_cython( intp_t index, cnp.ndarray[intp_t, ndim=1] shape ): @@ -150,7 +154,7 @@ cdef inline intp_t ravel_multi_index_cython( Parameters ---------- coords : intp_t[:] or vector[intp_t] - An array of coordinates to be converted and vectorized into a sinlg + An array of coordinates to be converted and vectorized into a single index. shape : numpy.ndarray[intp_t, ndim=1] The shape of the array into which the coordinates should be converted. 
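For intuition, the raveling helpers declared above follow NumPy's C-order (row-major) index arithmetic; a rough pure-Python sketch of the intended behaviour, using ``numpy`` and ``itertools`` as stand-ins for the Cython implementations, is:

import itertools

import numpy as np

data_dims = (5, 4)  # an assumed 2D image shape
# ravel_multi_index_cython: (row, col) -> flat index, row-major
assert np.ravel_multi_index((2, 3), data_dims) == 2 * 4 + 3  # == 11

# compute_vectorized_indices_from_cartesian: flat indices for every coordinate in
# the Cartesian product of per-dimension index arrays (e.g. a 2x2 patch)
index_arrays = [[1, 2], [0, 3]]
flat = [np.ravel_multi_index(c, data_dims) for c in itertools.product(*index_arrays)]
print(flat)  # [4, 7, 8, 11]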
@@ -180,3 +184,146 @@ cdef inline intp_t ravel_multi_index_cython( flat_index *= shape[i + 1] return flat_index + + +cdef inline void compute_vectorized_indices_from_cartesian( + intp_t base_index, + vector[vector[intp_t]]& index_arrays, + const intp_t[:] data_dims, + vector[intp_t]& result +) noexcept nogil: + """ + Compute the vectorized indices for all index sets from the Cartesian product of index arrays. + + :param base_index: The initial vectorized index (top-left point in the patch). + :param index_arrays: A list of lists, where each sublist contains the indices for that dimension. + :param data_dims: The dimensions of the data. + :param result: C++ vector to store the vectorized indices. + """ + cdef int n_dims = len(index_arrays) + cdef vector[intp_t] indices = vector[intp_t](n_dims) + cdef intp_t vectorized_index + cdef intp_t i + cdef intp_t n_products = 1 + + # Calculate the total number of Cartesian products + for i in range(n_dims): + n_products *= len(index_arrays[i]) + + # Initialize the current indices for each dimension + cdef vector[intp_t] current_indices = vector[intp_t](n_dims) + for i in range(n_dims): + current_indices[i] = 0 + + # Iterate over Cartesian products and compute vectorized indices + for _ in range(n_products): + # Compute the vectorized index for the current combination + for i in range(n_dims): + indices[i] = index_arrays[i][current_indices[i]] + vectorized_index = ravel_multi_index_cython( + indices, data_dims + ) + result.push_back(vectorized_index) + + # Move to the next combination + for i in range(n_dims - 1, -1, -1): + current_indices[i] += 1 + if current_indices[i] < len(index_arrays[i]): + break + current_indices[i] = 0 + if i == 0: + return # Finished all combinations + + +cdef vector[vector[intp_t]] cartesian_cython( + vector[vector[intp_t]] sequences +) noexcept nogil: + cdef vector[vector[intp_t]] results = vector[vector[intp_t]](1) + cdef vector[vector[intp_t]] next_results + for new_values in sequences: + for result in results: + for value in new_values: + result_copy = result + result_copy.push_back(value) + next_results.push_back(result_copy) + results = next_results + next_results.clear() + return results + + +cpdef cartesian_python(vector[vector[intp_t]]& sequences): + return cartesian_cython(sequences) + + +cdef memoryview[float32_t, ndim=3] init_2dmemoryview( + cnp.ndarray array, + const intp_t[:] data_dims +): + cdef intp_t n_samples = array.shape[0] + cdef intp_t ndim = data_dims.shape[0] + assert ndim == 2, "Currently only 2D images are accepted." + + # Reshape the array into (n_samples, *data_dims) + cdef cnp.ndarray reshaped_array = array.reshape((n_samples, data_dims[0], data_dims[1])) + + # Create a memoryview with the appropriate number of dimensions + cdef memoryview[float32_t, ndim=3] view = reshaped_array + return view + +#################################################################################################### +# CPDEF functions for testing purposes +#################################################################################################### + +cpdef unravel_index( + intp_t index, + cnp.ndarray[intp_t, ndim=1] shape +): + """Converts a flat index or array of flat indices into a tuple of coordinate arrays. + + Purely used for testing purposes. + + Parameters + ---------- + index : intp_t + A flat index. + shape : numpy.ndarray[intp_t, ndim=1] + The shape of the array into which the flat indices should be converted. 
+ + Returns + ------- + numpy.ndarray[intp_t, ndim=1] + A coordinate array having the same shape as the input `shape`. + """ + index = np.intp(index) + shape = np.array(shape, dtype=np.intp) + coords = np.empty(shape.shape[0], dtype=np.intp) + + cdef const intp_t[:] shape_memview = shape + cdef intp_t[:] coords_memview = coords + unravel_index_cython(index, shape_memview, coords_memview) + return coords + + +cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape): + """Converts a tuple of coordinate arrays into a flat index. + + Purely used for testing purposes. + + Parameters + ---------- + coords : numpy.ndarray[intp_t, ndim=1] + An array of coordinate arrays to be converted. + shape : numpy.ndarray[intp_t, ndim=1] + The shape of the array into which the coordinates should be converted. + + Returns + ------- + intp_t + The resulting flat index. + + Raises + ------ + ValueError + If the input `coords` have invalid indices. + """ + return ravel_multi_index_cython(coords, shape) diff --git a/treeple/tree/kernels.py b/treeple/tree/kernels.py deleted file mode 100644 index 6aab826a9..000000000 --- a/treeple/tree/kernels.py +++ /dev/null @@ -1,12 +0,0 @@ -import numpy as np - - -def gaussian_kernel(shape, sigma=1.0, mu=0.0): - """N-dimensional gaussian kernel for the given shape. - - See: https://gist.github.com/liob/e784775e882b83749cb3bbcef480576e - """ - m = np.meshgrid(*[np.linspace(-1, 1, s) for s in shape]) - d = np.sqrt(np.sum([x * x for x in m], axis=0)) - g = np.exp(-((d - mu) ** 2 / (2.0 * sigma**2))) - return g / np.sum(g) diff --git a/treeple/tree/manifold/_kernel_splitter.pxd b/treeple/tree/manifold/_kernel_splitter.pxd new file mode 100644 index 000000000..d0005d13c --- /dev/null +++ b/treeple/tree/manifold/_kernel_splitter.pxd @@ -0,0 +1,37 @@ +import numpy as np + +from libcpp.vector cimport vector + +from ..._lib.sklearn.tree._splitter cimport SplitRecord +from ..._lib.sklearn.utils._typedefs cimport ( + float32_t, + float64_t, + int32_t, + intp_t, + uint8_t, + uint32_t, +) +from .._oblique_splitter cimport BestObliqueSplitter, ObliqueSplitRecord +from ._morf_splitter cimport PatchSplitter + + +cdef class UserKernelSplitter(PatchSplitter): + """A class to hold user-specified kernels.""" + # cdef vector[float32_t[:, ::1]] kernel_dictionary # A list of C-contiguous 2D kernels + cdef vector[float32_t*] kernel_dictionary # A list of C-contiguous 2D kernels + cdef vector[intp_t*] kernel_dims # A list of arrays storing the dimensions of each kernel in `kernel_dictionary` + + +cdef class GaussianKernelSplitter(PatchSplitter): + """A class to hold Gaussian kernels. + + Overrides the weights that are generated to be sampled from a Gaussian distribution. 
+ See: https://www.tutorialspoint.com/gaussian-filter-generation-in-cplusplus + See: https://gist.github.com/thomasaarholt/267ec4fff40ca9dff1106490ea3b7567 + """ + + cdef void sample_proj_mat( + self, + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices + ) noexcept nogil diff --git a/treeple/tree/manifold/_kernel_splitter.pyx b/treeple/tree/manifold/_kernel_splitter.pyx new file mode 100644 index 000000000..ee8daf801 --- /dev/null +++ b/treeple/tree/manifold/_kernel_splitter.pyx @@ -0,0 +1,351 @@ +import numpy as np + +cimport numpy as cnp +from cython.operator cimport dereference as deref +from libcpp.vector cimport vector + +from ..._lib.sklearn.tree._criterion cimport Criterion +from ..._lib.sklearn.tree._utils cimport RAND_R_MAX, rand_int +from ..._lib.sklearn.utils._typedefs cimport float32_t, int32_t +from .._utils cimport ravel_multi_index_cython, unravel_index_cython + +ctypedef float32_t* float32_t_ptr +ctypedef intp_t* intp_t_ptr + + +cdef class Kernel2D: + cdef const intp_t[:] kernels_size + cdef const float32_t[:] kernel_data + cdef const int32_t[:] kernel_indices + cdef const int32_t[:] kernel_indptr + cdef uint32_t rand_r_state # sklearn_rand_r random number state + cdef object random_state # Random state + + cdef intp_t n_dims_largest + cdef intp_t n_kernels + + def __cinit__( + self, + object kernel_matrix, + cnp.ndarray kernel_sizes, + object random_state, + *argv + ): + """ + Initialize the class with a CSR sparse matrix and a 1D array of kernel sizes. + """ + self.kernels_size = kernel_sizes + self.kernel_data = kernel_matrix.data + self.kernel_indices = kernel_matrix.indices + self.kernel_indptr = kernel_matrix.indptr + self.n_kernels, self.n_dims_largest = kernel_matrix.shape + self.random_state = random_state + self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) + + cdef inline float32_t apply_kernel( + self, + float32_t[:, ::1] image, + intp_t kernel_idx + ) noexcept nogil: + """ + Apply a kernel to a random point on a 2D image and return the resulting value. + + Parameters: + ----------- + image : double[:, :] + The 2D image array (image_height, image_width). + kernel_idx : intp_t + The index of the kernel to apply. + + Returns: + -------- + result : double + The resulting value after applying the kernel. 
+ """ + cdef uint32_t* random_state = &self.rand_r_state + cdef intp_t img_height = image.shape[0] + cdef intp_t img_width = image.shape[1] + cdef intp_t size = self.kernels_size[kernel_idx] + + # Ensure the kernel fits within the image dimensions + cdef intp_t max_x = img_width - 1 + cdef intp_t max_y = img_height - 1 + + # Sample a random top-left point to apply the kernel + cdef intp_t start_x = rand_int(0, max_x + 1, random_state) + cdef intp_t start_y = rand_int(0, max_y + 1, random_state) + + # Compute the result + cdef float32_t result = 0.0 + cdef intp_t idx, row, col + cdef float32_t image_value, kernel_value + + cdef intp_t start = self.kernel_indptr[kernel_idx] + cdef intp_t end = self.kernel_indptr[kernel_idx + 1] + + for idx in range(start, end): + row = self.kernel_indices[idx] // size + col = self.kernel_indices[idx] % size + + # Only process the kernel if it's within the image bounds + if image_value and kernel_value and (start_y + row <= max_y) and (start_x + col <= max_x): + image_value = image[start_y + row, start_x + col] + kernel_value = self.kernel_data[idx] + result += image_value * kernel_value + + return result + + def apply_kernel_py(self, float32_t[:, ::1] image, intp_t kernel_idx): + """ + Python-accessible wrapper for apply_kernel_nogil. + """ + return self.apply_kernel(image, kernel_idx) + + + + +# cdef class UserKernelSplitter(PatchSplitter): +# def __cinit__( +# self, +# criterion: Criterion, +# max_features: intp_t, +# min_samples_leaf: intp_t, +# min_weight_leaf: double, +# random_state: object, +# min_patch_dims: intp_t, +# max_patch_dims: intp_t, +# dim_contiguous: cnp.uint8_t, +# data_dims: intp_t, +# boundary: str, +# feature_weight: float32_t, +# kernel_dictionary: Kernel2D, +# *argv +# ): +# # initialize the kernel dictionary into a vector to allow Cython to see it +# # see: https://stackoverflow.com/questions/46240207/passing-list-of-numpy-arrays-to-c-using-cython +# cdef intp_t n_arrays = len(kernel_dictionary) +# self.kernel_dictionary = vector[float32_t_ptr](n_arrays) # A list of C-contiguous 2D kernels +# self.kernel_dims = vector[intp_t_ptr](n_arrays) # A list of arrays storing the dimensions of each kernel in `kernel_dictionary` + +# # buffers to point to each element in the list +# cdef float32_t[:] kernel +# cdef intp_t[:] kernel_dim + +# cdef intp_t i +# for i in range(n_arrays): +# kernel = kernel_dictionary[i] +# kernel_dim = kernel_dims[i] + +# # store a pointer to the data +# self.kernel_dictionary.push_back(&kernel[0]) +# self.kernel_dims.push_back(&kernel_dim[0]) + +# cdef inline void compute_features_over_samples( +# self, +# intp_t start, +# intp_t end, +# const intp_t[:] samples, +# float32_t[:] feature_values, +# vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) +# vector[intp_t]* proj_vec_indices # indices of the features (max_features,) +# ) noexcept nogil: +# """Compute the feature values for the samples[start:end] range. + +# Returns -1 in case of failure to allocate memory (and raise MemoryError) +# or 0 otherwise. +# """ +# cdef intp_t idx, jdx +# cdef intp_t col_idx +# cdef float32_t col_weight + +# # Compute linear combination of features and then +# # sort samples according to the feature values. 
+# for jdx in range(0, proj_vec_indices.size()): +# col_idx = deref(proj_vec_indices)[jdx] +# col_weight = deref(proj_vec_weights)[jdx] + +# for idx in range(start, end): +# # initialize the feature value to 0 +# if jdx == 0: +# feature_values[idx] = 0.0 +# feature_values[idx] += self.X[samples[idx], col_idx] * col_weight + + +# cdef void sample_proj_mat( +# self, +# vector[vector[float32_t]]& proj_mat_weights, +# vector[vector[intp_t]]& proj_mat_indices +# ) noexcept nogil: +# """Sample projection matrix using a contiguous patch. +# Randomly sample patches with weight of 1. +# """ +# cdef intp_t max_features = self.max_features +# cdef intp_t proj_i + +# # define parameters for vectorized points in the original data shape +# # and top-left seed +# cdef intp_t top_left_patch_seed + +# # size of the sampled patch, which is just the size of the n-dim patch +# # (\prod_i self.patch_dims_buff[i]) +# cdef intp_t patch_size + +# cdef float32_t[:] kernel +# cdef intp_t[:] kernel_dim + +# for proj_i in range(0, max_features): +# # now get the top-left seed that is used to then determine the top-left +# # position in patch +# # compute top-left seed for the multi-dimensional patch +# top_left_patch_seed, patch_size = self.sample_top_left_seed() + +# # sample a random index in the kernel library +# kernel_idx = rand_int(0, self.kernel_dictionary.size(), &self.rand_r_state) + +# # get that kernel and add it to the projection vector indices and weights +# kernel = self.kernel_dictionary[kernel_idx] +# kernel_dim = self.kernel_dims[kernel_idx] + +# # convert top-left-patch-seed to unraveled indices +# # get the top-left index in the original data +# top_left_idx = self.unravel_index(top_left_patch_seed, self.data_dims_buff, self.ndim) + +# # loop over the kernel and add the weights and indices to the projection +# for idim in range(self.ndim): +# # get the dimension of the kernel +# kernel_dim = self.kernel_dims[kernel_idx][idim] + +# # get the top-left index in the kernel +# top_left_kernel_idx = self.unravel_index(top_left_patch_seed, kernel_dim, self.ndim) + + # loop over the kernel and add the weights and indices to the projection + # for i in range(kernel_dim): + # # get the index in the original data + # idx = self.ravel_multi_index(top_left_idx, self.data_dims_buff, self.ndim) + + # # get the index in the kernel + # kernel_idx = self.ravel_multi_index(top_left_kernel_idx, kernel_dim, self.ndim) + + # # add the weight and index to the projection matrix + # proj_mat_weights[proj_i].push_back(kernel[kernel_idx]) + # proj_mat_indices[proj_i].push_back(idx) + + # # increment the top-left index in the original data + # top_left_idx[idim] += 1 + + # # increment the top-left index in the kernel + # top_left_kernel_idx[idim] += 1 + + # # increment the top-left index in the original data + # top_left_idx[idim] += self.patch_dims_buff[idim] - kernel_dim + + # # increment the top-left index in the kernel + # top_left_kernel_idx[idim] += self.patch_dims_buff[idim] - kernel_dim + + +# cdef class UserKernelSplitter(PatchSplitter): +# def __cinit__( +# self, +# criterion: Criterion, +# max_features: intp_t, +# min_samples_leaf: intp_t, +# min_weight_leaf: double, +# random_state: object, +# min_patch_dims: intp_t, +# max_patch_dims: intp_t, +# dim_contiguous: cnp.uint8_t, +# data_dims: intp_t, +# boundary: str, +# feature_weight: float32_t, +# kernel_dictionary: Kernel2D, +# *argv +# ): +# # initialize the kernel dictionary into a vector to allow Cython to see it +# # see: 
https://stackoverflow.com/questions/46240207/passing-list-of-numpy-arrays-to-c-using-cython +# cdef intp_t n_arrays = len(kernel_dictionary) +# self.kernel_dictionary = vector[float32_t_ptr](n_arrays) # A list of C-contiguous 2D kernels +# self.kernel_dims = vector[intp_t_ptr](n_arrays) # A list of arrays storing the dimensions of each kernel in `kernel_dictionary` + +# # buffers to point to each element in the list +# cdef float32_t[:] kernel +# cdef intp_t[:] kernel_dim + +# cdef intp_t i +# for i in range(n_arrays): +# kernel = kernel_dictionary[i] +# kernel_dim = kernel_dims[i] + +# # store a pointer to the data +# self.kernel_dictionary.push_back(&kernel[0]) +# self.kernel_dims.push_back(&kernel_dim[0]) + +# cdef void sample_proj_mat( +# self, +# vector[vector[float32_t]]& proj_mat_weights, +# vector[vector[intp_t]]& proj_mat_indices +# ) noexcept nogil: +# """Sample projection matrix using a contiguous patch. +# Randomly sample patches with weight of 1. +# """ +# cdef intp_t max_features = self.max_features +# cdef intp_t proj_i + +# # define parameters for vectorized points in the original data shape +# # and top-left seed +# cdef intp_t top_left_patch_seed + +# # size of the sampled patch, which is just the size of the n-dim patch +# # (\prod_i self.patch_dims_buff[i]) +# cdef intp_t patch_size + +# cdef float32_t[:] kernel +# cdef intp_t[:] kernel_dim + +# for proj_i in range(0, max_features): +# # now get the top-left seed that is used to then determine the top-left +# # position in patch +# # compute top-left seed for the multi-dimensional patch +# top_left_patch_seed, patch_size = self.sample_top_left_seed() + +# # sample a random index in the kernel library +# # kernel_idx = + +# # get that kernel and add it to the projection vector indices and weights +# kernel = self.kernel_dictionary[kernel_idx] +# kernel_dim = self.kernel_dims[kernel_idx] + +# # convert top-left-patch-seed to unraveled indices +# # get the top-left index in the original data +# top_left_idx = self.unravel_index(top_left_patch_seed, self.data_dims_buff, self.ndim) + +# # loop over the kernel and add the weights and indices to the projection +# for idim in range(self.ndim): +# # get the dimension of the kernel +# kernel_dim = self.kernel_dims[kernel_idx][idim] + +# # get the top-left index in the kernel +# top_left_kernel_idx = self.unravel_index(top_left_patch_seed, kernel_dim, self.ndim) + +# # loop over the kernel and add the weights and indices to the projection +# # for i in range(kernel_dim): +# # # get the index in the original data +# # idx = self.ravel_multi_index(top_left_idx, self.data_dims_buff, self.ndim) + +# # # get the index in the kernel +# # kernel_idx = self.ravel_multi_index(top_left_kernel_idx, kernel_dim, self.ndim) + +# # # add the weight and index to the projection matrix +# # proj_mat_weights[proj_i].push_back(kernel[kernel_idx]) +# # proj_mat_indices[proj_i].push_back(idx) + +# # # increment the top-left index in the original data +# # top_left_idx[idim] += 1 + +# # # increment the top-left index in the kernel +# # top_left_kernel_idx[idim] += 1 + +# # # increment the top-left index in the original data +# # top_left_idx[idim] += self.patch_dims_buff[idim] - kernel_dim + +# # # increment the top-left index in the kernel +# # top_left_kernel_idx[idim] += self.patch_dims_buff[idim] - kernel_dim diff --git a/treeple/tree/manifold/_morf_splitter.pxd b/treeple/tree/manifold/_morf_splitter.pxd index 2b65fd3ba..11e5ef9c3 100644 --- a/treeple/tree/manifold/_morf_splitter.pxd +++ 
b/treeple/tree/manifold/_morf_splitter.pxd @@ -33,11 +33,14 @@ cdef class PatchSplitter(BestObliqueSplitter): # `data_width` are used to determine the vectorized indices corresponding to # (x,y) coordinates in the original un-vectorized data. cdef public intp_t ndim # The number of dimensions of the input data + cdef const intp_t[:] data_dims # The dimensions of the input data + cdef const intp_t[:] min_patch_dims # The minimum size of the patch to sample in each dimension + cdef const intp_t[:] max_patch_dims # The maximum size of the patch to sample in each dimension + cdef const uint8_t[:] dim_contiguous # A boolean array indicating whether each dimension is contiguous - cdef const intp_t[:] data_dims # The dimensions of the input data - cdef const intp_t[:] min_patch_dims # The minimum size of the patch to sample in each dimension - cdef const intp_t[:] max_patch_dims # The maximum size of the patch to sample in each dimension - cdef const uint8_t[:] dim_contiguous # A boolean array indicating whether each dimension is contiguous + # TODO: assumes all oblique splitters only work with dense data + cdef public memoryview X_reshaped + cdef memoryview patch_nd_indices # TODO: check if this works and is necessary for discontiguous data # cdef intp_t[:] stride_offsets # The stride offsets for each dimension @@ -48,13 +51,13 @@ cdef class PatchSplitter(BestObliqueSplitter): cdef intp_t[::1] _index_data_buffer cdef intp_t[::1] _index_patch_buffer - cdef intp_t[:] patch_sampled_size # A buffer to store the dimensions of the sampled patch + cdef intp_t[:] patch_sampled_size # A buffer to store the dimensions of the sampled patch cdef intp_t[:] unraveled_patch_point # A buffer to store the unraveled patch point # All oblique splitters (i.e. non-axis aligned splitters) require a # function to sample a projection matrix that is applied to the feature matrix # to quickly obtain the sampled projections for candidate splits. - cdef (intp_t, intp_t) sample_top_left_seed( + cdef intp_t sample_top_left_seed( self ) noexcept nogil @@ -63,23 +66,3 @@ cdef class PatchSplitter(BestObliqueSplitter): vector[vector[float32_t]]& proj_mat_weights, vector[vector[intp_t]]& proj_mat_indices ) noexcept nogil - - -# cdef class UserKernelSplitter(PatchSplitter): -# """A class to hold user-specified kernels.""" -# cdef vector[float32_t[:, ::1]] kernel_dictionary # A list of C-contiguous 2D kernels - - -cdef class GaussianKernelSplitter(PatchSplitter): - """A class to hold Gaussian kernels. - - Overrides the weights that are generated to be sampled from a Gaussian distribution. 
- See: https://www.tutorialspoint.com/gaussian-filter-generation-in-cplusplus - See: https://gist.github.com/thomasaarholt/267ec4fff40ca9dff1106490ea3b7567 - """ - - cdef void sample_proj_mat( - self, - vector[vector[float32_t]]& proj_mat_weights, - vector[vector[intp_t]]& proj_mat_indices - ) noexcept nogil diff --git a/treeple/tree/manifold/_morf_splitter.pyx b/treeple/tree/manifold/_morf_splitter.pyx index f1eaf2918..dc4dffdf5 100644 --- a/treeple/tree/manifold/_morf_splitter.pyx +++ b/treeple/tree/manifold/_morf_splitter.pyx @@ -5,14 +5,40 @@ # cython: wraparound=False # cython: initializedcheck=False +cimport numpy as cnp + import numpy as np from cython.operator cimport dereference as deref from libcpp.vector cimport vector from ..._lib.sklearn.tree._criterion cimport Criterion +from ..._lib.sklearn.tree._tree cimport ParentInfo from ..._lib.sklearn.tree._utils cimport rand_int -from .._utils cimport ravel_multi_index_cython, unravel_index_cython +from .._sklearn_splitter cimport sort +from .._utils cimport ( + compute_vectorized_indices_from_cartesian, + fisher_yates_shuffle, + init_2dmemoryview, + ravel_multi_index_cython, + unravel_index_cython, +) + + +cdef float64_t INFINITY = np.inf + +# Mitigate precision differences between 32 bit and 64 bit +cdef float32_t FEATURE_THRESHOLD = 1e-7 + +cdef inline void _init_split(ObliqueSplitRecord* self, intp_t start_pos) noexcept nogil: + self.impurity_left = INFINITY + self.impurity_right = INFINITY + self.pos = start_pos + self.feature = 0 + self.threshold = 0. + self.improvement = -INFINITY + self.missing_go_to_left = False + self.n_missing = 0 cdef class PatchSplitter(BestObliqueSplitter): @@ -33,46 +59,11 @@ cdef class PatchSplitter(BestObliqueSplitter): const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, ) except -1: + # store a view to the (n_samples, n_features) dataset BestObliqueSplitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) - return 0 - - cdef int node_reset( - self, - intp_t start, - intp_t end, - float64_t* weighted_n_node_samples - ) except -1 nogil: - """Reset splitter on node samples[start:end]. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. 
- - Parameters - ---------- - start : intp_t - The index of the first sample to consider - end : intp_t - The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=float64_t pointer - The total weight of those samples - """ - - self.start = start - self.end = end - - self.criterion.init(self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples) - self.criterion.set_sample_pointers(start, end) - - weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples - - # Clear all projection vectors - for i in range(self.max_features): - self.proj_mat_weights[i].clear() - self.proj_mat_indices[i].clear() + # store a reshaped view of (n_samples, height, width) dataset + self.X_reshaped = init_2dmemoryview(X, self.data_dims) return 0 cdef void sample_proj_mat( @@ -87,32 +78,11 @@ cdef class PatchSplitter(BestObliqueSplitter): """ pass - cdef (intp_t, intp_t) sample_top_left_seed(self) noexcept nogil: + cdef intp_t sample_top_left_seed(self) noexcept nogil: pass -cdef class BaseDensePatchSplitter(PatchSplitter): - cdef int init( - self, - object X, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - const uint8_t[::1] missing_values_in_feature_mask, - # const int32_t[:] n_categories - ) except -1: - """Initialize the splitter - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - """ - # Call parent init - PatchSplitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) - - self.X = X - return 0 - - -cdef class BestPatchSplitter(BaseDensePatchSplitter): +cdef class BestPatchSplitter(PatchSplitter): def __cinit__( self, Criterion criterion, @@ -159,6 +129,19 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.max_patch_dims = max_patch_dims self.dim_contiguous = dim_contiguous + # create a memoryview to store the n-dimensional indices + self.patch_nd_indices = np.zeros(self.max_patch_dims[:], dtype=np.intp) + + # store random indices for discontiguous sampling if necessary + # these are initialized here to allow fisher-yates shuffling without having + # to re-allocate memory + self.random_indices = vector[vector[intp_t]](self.ndim) + for idx in range(self.ndim): + if not self.dim_contiguous[idx]: + self.random_indices[idx].reserve(self.max_patch_dims[idx]) + for jdx in range(self.max_patch_dims[idx]): + self.random_indices[idx].push_back(jdx) + # initialize a buffer to allow for Fisher-Yates self._index_patch_buffer = np.zeros(np.max(self.max_patch_dims), dtype=np.intp) self._index_data_buffer = np.zeros(np.max(self.data_dims), dtype=np.intp) @@ -192,31 +175,178 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.feature_weight.base if self.feature_weight is not None else None, ), self.__getstate__()) - cdef (intp_t, intp_t) sample_top_left_seed(self) noexcept nogil: - """Sample the top-left seed for the n-dim patch. + cdef int node_split( + self, + ParentInfo* parent_record, + SplitRecord* split, + ) except -1 nogil: + """Find the best_split split on node samples[start:end] + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
+ """ + # typecast the pointer to an ObliqueSplitRecord + cdef ObliqueSplitRecord* oblique_split = (split) + + # Draw random splits and pick the best_split + cdef intp_t[::1] samples = self.samples + cdef intp_t start = self.start + cdef intp_t end = self.end + + # pointer array to store feature values to split on + cdef float32_t[::1] feature_values = self.feature_values + cdef intp_t max_features = self.max_features + cdef intp_t min_samples_leaf = self.min_samples_leaf + + # keep track of split record for current_split node and the best_split split + # found among the sampled projection vectors + cdef ObliqueSplitRecord best_split, current_split + cdef float64_t current_proxy_improvement = -INFINITY + cdef float64_t best_proxy_improvement = -INFINITY + + cdef float64_t impurity = parent_record.impurity + + cdef intp_t feat_i, p # index over computed features and start/end + cdef intp_t partition_end + cdef float32_t temp_d # to compute a projection feature value + + # instantiate the split records + _init_split(&best_split, end) + + # For every vector in the projection matrix + for feat_i in range(max_features): + # Projection vector has no nonzeros + if self.proj_mat_weights[feat_i].empty(): + continue + + # XXX: 'feature' is not actually used in oblique split records + # Just indicates which split was sampled + current_split.feature = feat_i + + # sample the projection vector + self.sample_proj_vec(self.proj_mat_weights[feat_i], self.proj_mat_indices[feat_i]) + current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] + current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] + + # Compute linear combination of features and then + # sort samples according to the feature values. + self.compute_features_over_samples( + start, + end, + samples, + feature_values, + &self.proj_mat_weights[feat_i], + &self.proj_mat_indices[feat_i] + ) + + # Sort the samples + sort(&feature_values[start], &samples[start], end - start) + + # Evaluate all splits + self.criterion.reset() + p = start + while p < end: + while (p + 1 < end and feature_values[p + 1] <= feature_values[p] + FEATURE_THRESHOLD): + p += 1 + + p += 1 + + if p < end: + current_split.pos = p + + # Reject if min_samples_leaf is not guaranteed + if (((current_split.pos - start) < min_samples_leaf) or + ((end - current_split.pos) < min_samples_leaf)): + continue + + self.criterion.update(current_split.pos) + # Reject if min_weight_leaf is not satisfied + if self.check_postsplit_conditions() == 1: + continue + + current_proxy_improvement = \ + self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + # sum of halves is used to avoid infinite value + current_split.threshold = feature_values[p - 1] / 2.0 + feature_values[p] / 2.0 + + if ( + (current_split.threshold == feature_values[p]) or + (current_split.threshold == INFINITY) or + (current_split.threshold == -INFINITY) + ): + current_split.threshold = feature_values[p - 1] + + best_split = current_split # copy + + # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] + if best_split.pos < end: + partition_end = end + p = start + + while p < partition_end: + # Account for projection vector + temp_d = 0.0 + for j in range(best_split.proj_vec_indices.size()): + temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ + deref(best_split.proj_vec_weights)[j] + + if temp_d <= best_split.threshold: + p += 1 + + else: + partition_end -= 1 + samples[p], 
samples[partition_end] = \ + samples[partition_end], samples[p] + + self.criterion.reset() + self.criterion.update(best_split.pos) + self.criterion.children_impurity(&best_split.impurity_left, + &best_split.impurity_right) + best_split.improvement = self.criterion.impurity_improvement( + impurity, best_split.impurity_left, best_split.impurity_right) + + # Return values + deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices + deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights + deref(oblique_split).feature = best_split.feature + deref(oblique_split).pos = best_split.pos + deref(oblique_split).threshold = best_split.threshold + deref(oblique_split).improvement = best_split.improvement + deref(oblique_split).impurity_left = best_split.impurity_left + deref(oblique_split).impurity_right = best_split.impurity_right + + # XXX: Fix when we can track constants + parent_record.n_constant_features = 0 + return 0 + + cdef inline intp_t sample_top_left_seed( + self + ) noexcept nogil: + """Sample the top-left seed, and patch size for the n-dim patch. Returns ------- top_left_seed : intp_t The top-left seed vectorized (i.e. raveled) for the n-dim patch. - patch_size : intp_t - The total size of the n-dim patch (i.e. the volume). """ # now get the top-left seed that is used to then determine the top-left # position in patch # compute top-left seed for the multi-dimensional patch cdef intp_t top_left_patch_seed - cdef intp_t patch_size = 1 cdef uint32_t* random_state = &self.rand_r_state # define parameters for the random patch cdef intp_t patch_dim cdef intp_t delta_patch_dim - cdef intp_t dim - cdef intp_t idx + + # create a vector to store the unraveled patch top left seed point + cdef vector[intp_t] unraveled_patch_point = vector[intp_t](self.ndim) for idx in range(self.ndim): # compute random patch width and height @@ -238,7 +368,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # write to buffer self.patch_sampled_size[idx] = patch_dim - patch_size *= patch_dim elif self.boundary == "wrap": # add circular boundary conditions delta_patch_dim = self.data_dims[idx] + 2 * (patch_dim - 1) @@ -252,19 +381,93 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # resample the patch dimension due to padding patch_dim = min(patch_dim, min(dim+1, self.data_dims[idx] + patch_dim - dim - 1)) self.patch_sampled_size[idx] = patch_dim - patch_size *= patch_dim # TODO: make this work # Convert the top-left-seed value to it's appropriate index in the full image. top_left_patch_seed = max(0, dim - patch_dim + 1) + # unraveled_patch_point[idx] = top_left_patch_seed self.unraveled_patch_point[idx] = top_left_patch_seed - + + # top_left_patch_seed = ravel_multi_index_cython( + # unraveled_patch_point, + # self.data_dims + # ) top_left_patch_seed = ravel_multi_index_cython( self.unraveled_patch_point, self.data_dims ) - return top_left_patch_seed, patch_size + return top_left_patch_seed + + + cdef inline intp_t sample_top_left_seed_v2( + self + ) noexcept nogil: + """Sample the top-left seed, and patch size for the n-dim patch. + + Returns + ------- + top_left_seed : intp_t + The top-left seed vectorized (i.e. raveled) for the n-dim patch. 
+ """ + # the top-left position in the patch for each dimension + cdef intp_t top_left_patch_seed + + cdef uint32_t* random_state = &self.rand_r_state + + # define parameters for the random patch + cdef intp_t patch_dim + cdef intp_t delta_patch_dim + cdef intp_t dim + + # pre-allocated buffer to store the unraveled patch top left seed point + cdef intp_t[:] unraveled_patch_point = self.unraveled_patch_point + + for dim in range(self.ndim): + # compute random patch width and height + # Note: By constraining max patch height/width to be at least the min + # patch height/width this ensures that the minimum value of + # patch_height and patch_width is 1 + patch_dim = rand_int( + self.min_patch_dims[dim], + self.max_patch_dims[dim] + 1, + random_state + ) + + if not self.dim_contiguous[dim]: + # fisher-yates shuffle the discontiguous dimension, so we can randomly sample + # without replacement, the indices of the patch in that dimension + fisher_yates_shuffle(self.random_indices[dim], self.random_indices[dim].size(), random_state) + + # sample the top-left index and patch size for this dimension based on boundary effects + if self.boundary is None: + # compute the difference between the image dimensions and the current + # random patch dimensions for sampling + delta_patch_dim = (self.data_dims[dim] - patch_dim) + 1 + top_left_patch_seed = rand_int(0, delta_patch_dim, random_state) + + # write to buffer + self.patch_sampled_size[dim] = patch_dim + elif self.boundary == "wrap": + pass + + + # now add the relevant indices for each element of the patch + for jdx in range(patch_dim): + if self.dim_contiguous[dim]: + self.patch_nd_indices[dim][jdx] = top_left_patch_seed + jdx + else: + # if discontiguous, we will perform random sampling + self.patch_nd_indices[dim][jdx] = self.random_indices[dim][jdx] + + unraveled_patch_point[dim] = top_left_patch_seed + + # get the vectorized index of the top-left seed + top_left_patch_seed = ravel_multi_index_cython( + unraveled_patch_point, + self.data_dims + ) + return top_left_patch_seed cdef void sample_proj_mat( self, @@ -282,22 +485,17 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # and top-left seed cdef intp_t top_left_patch_seed - # size of the sampled patch, which is just the size of the n-dim patch - # (\prod_i self.patch_sampled_size[i]) - cdef intp_t patch_size - for proj_i in range(0, max_features): # now get the top-left seed that is used to then determine the top-left # position in patch # compute top-left seed for the multi-dimensional patch - top_left_patch_seed, patch_size = self.sample_top_left_seed() + top_left_patch_seed = self.sample_top_left_seed() # sample a projection vector given the top-left seed point in n-dimensional space self.sample_proj_vec( proj_mat_weights, proj_mat_indices, proj_i, - patch_size, top_left_patch_seed, self.patch_sampled_size ) @@ -307,7 +505,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): vector[vector[float32_t]]& proj_mat_weights, vector[vector[intp_t]]& proj_mat_indices, intp_t proj_i, - intp_t patch_size, intp_t top_left_patch_seed, const intp_t[:] patch_dims, ) noexcept nogil: @@ -315,6 +512,14 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # iterates over the size of the patch cdef intp_t patch_idx + cdef intp_t i + + # size of the sampled patch, which is just the size of the n-dim patch + # (\prod_i self.patch_sampled_size[i]) + cdef intp_t patch_size = 1 + for i in range(0, self.ndim): + patch_size *= patch_dims[i] + # stores how many patches we have iterated so far cdef 
intp_t vectorized_patch_offset cdef intp_t vectorized_point_offset @@ -330,7 +535,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): cdef intp_t other_dims_offset cdef intp_t row_index - cdef intp_t i cdef intp_t num_rows = self.data_dims[0] if self._discontiguous: # fill with values 0, 1, ..., dimension - 1 @@ -401,6 +605,166 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): proj_mat_indices[proj_i].push_back(vectorized_point) proj_mat_weights[proj_i].push_back(weight) + cdef void sample_proj_vec_v2( + self, + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices, + vector[intp_t]& top_left_seed, + intp_t proj_i, + const intp_t[:] patch_dims, + ) noexcept nogil: + cdef uint32_t* random_state = &self.rand_r_state + + # iterates over the size of the patch + cdef intp_t patch_idx + + # stores how many patches we have iterated so far + cdef intp_t vectorized_patch_offset + cdef intp_t vectorized_point_offset + cdef intp_t vectorized_point + + cdef intp_t dim_idx + + # weights are default to 1 + cdef float32_t weight = 1. + + # XXX: still unsure if it works yet + # XXX: THIS ONLY WORKS FOR THE FIRST DIMENSION THAT IS DISCONTIGUOUS. + cdef intp_t other_dims_offset + cdef intp_t row_index + + cdef intp_t i + cdef intp_t num_rows = self.data_dims[0] + + # iterate over the patch and store the vectorized index + cdef vector[intp_t] index_arr = vector[intp_t](self.ndim) + for dim in range(top_left_seed.size()): + index_arr[dim] = top_left_seed[dim] + + # fisher_yates_shuffle(dim_index_arr, patch_dims[dim], random_state) + + # iterate over the indices of the patch, and get the resulting + # raveled index at each patch-point, resulting in now a 1D vector + # of indices representing each patch-point. + while True: + # Note: we iterate over the last axis first assuming that data + # is stored in a C-contiguous array. + for dim in range(self.ndim - 1, -1, -1): + if self.dim_contiguous[dim]: + index_arr[dim] += 1 + + # check we have not reached the boundaries of the patch. 
If we have not, + # we can break out of the loop and continue iterating in this dimension + if index_arr[dim] < top_left_seed[dim] + patch_dims[dim]: + break + + # if we have reached the boundary, we reset the index and continue + index_arr[dim] = top_left_seed[dim] + else: + random_index = random_indices[dim][r_idx] + # dimension is assumed discontiguous, and thus is randomly chosen + index_arr[dim] = rand_int(0, self.data_dims[dim], random_state) + + discontiguous_index += 1 + + if r_idx < patch_dims[dim]: + idx[dim] = random_indices[dim][r_idx] + r_idx += 1 + break + + r_idx = 0 + # if we have reached the boundary, we reset the index and continue + index_arr[dim] = top_left_seed[dim] + + # if self.dim_contiguous[dim] and + # break + # elif not self.dim_contiguous[dim] and index > patch_dims[dim]: + # break + else: + break + + # # get the vectorized version of this index + # # dim_idx = ravel_multi_index_cython( + # # index_arr, + # # self.data_dims + # # ) + + # proj_mat_indices[proj_i].push_back(vectorized_point) + # proj_mat_weights[proj_i].push_back(weight) + + # idx[dim] = start[dim] + # TODO: iterate correctly over multi-dim array + # for idx in range(len(self.data_dims[dim])): + # unvectorized_idx[dim] = top_left_seed[dim] + idx + # else: + # unvectorized_idx[dim] = top_left_seed[dim] + rand_int(0, patch_dims[dim], random_state) + + # # get the vectorized version of this index + # dim_idx = ravel_multi_index_cython( + # unvectorized_idx, + # self.data_dims + # ) + + # proj_mat_indices[proj_i].push_back(vectorized_point) + # proj_mat_weights[proj_i].push_back(weight) + + + # for patch_idx in range(patch_size): + # # keep track of which dimensions of the patch we have iterated over + # vectorized_patch_offset = 1 + + # # Once the vectorized top-left-seed is unraveled, you can add the patch + # # points in the array structure and compute their vectorized (unraveled) + # # points, which are added to the projection vector + # unravel_index_cython(top_left_patch_seed, self.data_dims, self.unraveled_patch_point) + + # for dim_idx in range(self.ndim): + # # compute the offset from the top-left patch seed based on: + # # 1. the current patch index + # # 2. the patch dimension indexed by `dim_idx` + # # 3. 
and the vectorized patch dimensions that we have seen so far + # # the `vectorized_point_offset` is the offset from the top-left vectorized seed for this dimension + # vectorized_point_offset = (patch_idx // (vectorized_patch_offset)) % patch_dims[dim_idx] + + # # then we compute the actual point in the original data shape + # self.unraveled_patch_point[dim_idx] = self.unraveled_patch_point[dim_idx] + vectorized_point_offset + # vectorized_patch_offset *= patch_dims[dim_idx] + + # # if any dimensions are discontiguous, we want to migrate the entire axis a fixed amount + # # based on the shuffling + # if self._discontiguous is True: + # for dim_idx in range(self.ndim): + # if self.dim_contiguous[dim_idx] is True: + # continue + + # # determine the "row" we are currently on + # # other_dims_offset = 1 + # # for idx in range(dim_idx + 1, self.ndim): + # # other_dims_offset *= self.data_dims[idx] + # # row_index = self.unraveled_patch_point[dim_idx] % other_dims_offset + # # determine the "row" we are currently on + # other_dims_offset = 1 + # for idx in range(dim_idx + 1, self.ndim): + # if not self.dim_contiguous[idx]: + # other_dims_offset *= self.data_dims[idx] + + # row_index = 0 + # for idx in range(dim_idx + 1, self.ndim): + # if not self.dim_contiguous[idx]: + # row_index += ( + # (self.unraveled_patch_point[idx] // other_dims_offset) % + # self.patch_sampled_size[idx] + # ) * other_dims_offset + # other_dims_offset //= self.data_dims[idx] + + # # assign random row index now + # self.unraveled_patch_point[dim_idx] = self._index_patch_buffer[row_index] + + # # ravel the patch point into the original data dimensions + # vectorized_point = ravel_multi_index_cython(self.unraveled_patch_point, self.data_dims) + # proj_mat_indices[proj_i].push_back(vectorized_point) + # proj_mat_weights[proj_i].push_back(weight) + cdef void compute_features_over_samples( self, intp_t start, @@ -408,18 +772,62 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): const intp_t[:] samples, float32_t[:] feature_values, vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[intp_t]* proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + + Parameters + ---------- + start : int + The start index of the samples. + end : int + The end index of the samples. + samples : array-like, shape (n_samples,) + The indices of the samples. + feature_values : array-like, shape (n_samples,) + The pre-allocated buffer to store the feature values. + proj_vec_weights : array-like, shape (max_features,) + The weights of the projection vector. + proj_vec_indices : array-like, shape (max_features,) + The indices of the projection vector. This will only store the vectorized + index of the top-left-seed. 
""" cdef intp_t idx, jdx # initialize feature weight to normalize across patch cdef float32_t patch_weight + if proj_vec_indices.size() != 1: + with gil: + raise ValueError("proj_vec_indices should only have one element corresponding to the top left seed.") + + # patch dimensions + cdef intp_t[:] patch_dims = self.patch_sampled_size + cdef vector[intp_t] top_left_seed = vector[intp_t](self.ndim) + + cdef intp_t volume_of_patch = 1 + # Calculate the total number of Cartesian products + for i in range(self.ndim): + volume_of_patch *= len(self.patch_nd_indices[i]) + + # create a buffer to store the raveled indices of the patch, which will be used + # to compute the relevant feature values + cdef vector[intp_t] raveled_patch_indices = vector[intp_t](volume_of_patch) + for jdx in range(0, proj_vec_indices.size()): + # get the index of the top-left patch + top_left_patch_index = deref(proj_vec_indices)[jdx] + + # compute the raveled index of all the points in the patch + compute_vectorized_indices_from_cartesian( + top_left_patch_index, + self.patch_nd_indices, + self.data_dims, + raveled_patch_indices + ) + # Compute linear combination of features and then # sort samples according to the feature values. for idx in range(start, end): @@ -427,15 +835,17 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # initialize the feature value to 0 feature_values[idx] = 0 - for jdx in range(0, proj_vec_indices.size()): + for kdx in range(raveled_patch_indices.size()): + feature_index = raveled_patch_indices[kdx] + feature_values[idx] += self.X[ - samples[idx], deref(proj_vec_indices)[jdx] + samples[idx], feature_index ] * deref(proj_vec_weights)[jdx] - if self.feature_weight is not None: - # gets the feature weight for this specific column from X - # the default of feature_weights[i] is (1/n_features) for all i - patch_weight += self.feature_weight[samples[idx], deref(proj_vec_indices)[jdx]] + if self.feature_weight is not None: + # gets the feature weight for this specific column from X + # the default of feature_weights[i] is (1/n_features) for all i + patch_weight += self.feature_weight[samples[idx], deref(proj_vec_indices)[jdx]] if self.feature_weight is not None: feature_values[idx] /= patch_weight @@ -464,7 +874,6 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): proj_mat_weights, proj_mat_indices, proj_i, - patch_size, top_left_patch_seed, patch_dims ) diff --git a/treeple/tree/manifold/meson.build b/treeple/tree/manifold/meson.build index 867da8eaa..6b900aec6 100644 --- a/treeple/tree/manifold/meson.build +++ b/treeple/tree/manifold/meson.build @@ -2,6 +2,9 @@ tree_extension_metadata = { '_morf_splitter': {'sources': ['_morf_splitter.pyx'], 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_kernel_splitter': + {'sources': ['_kernel_splitter.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, } foreach ext_name, ext_dict : tree_extension_metadata diff --git a/treeple/tree/meson.build b/treeple/tree/meson.build index 2febbbc6a..c77266a4c 100644 --- a/treeple/tree/meson.build +++ b/treeple/tree/meson.build @@ -36,6 +36,7 @@ python_sources = [ '_neighbors.py', '_honest_tree.py', '_marginalize.py', + '_kernel.py', ] py.install_sources( diff --git a/treeple/tree/tests/meson.build b/treeple/tree/tests/meson.build index 552eb7356..cbdd38f71 100644 --- a/treeple/tree/tests/meson.build +++ b/treeple/tree/tests/meson.build @@ -7,6 +7,8 @@ python_sources = [ 'test_all_trees.py', 'test_unsupervised_tree.py', 'test_multiview.py', + 'test_kernel.py', + 
'test_manifold.py', ] py.install_sources( diff --git a/treeple/tree/tests/test_kernel.py b/treeple/tree/tests/test_kernel.py new file mode 100644 index 000000000..6e18b276c --- /dev/null +++ b/treeple/tree/tests/test_kernel.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +from treeple.tree._kernel import sample_gaussian_kernels + + +def gaussian_2d(x, y, mu, sigma): + """Generate the 2D Gaussian value at point (x, y). + + Used to generate the expected Gaussian kernel for testing. + """ + return np.exp(-((x - mu) ** 2 + (y - mu) ** 2) / (2 * sigma**2)) + + +@pytest.mark.parametrize( + "n_kernels, min_size, max_size, mu, sigma", + [ + (1, 3, 3, 0.0, 1.0), # Single fixed kernel + (2, 3, 5, 0.5, 1.1), # Two kernels with varying size and fixed sigma + (3, 3, 5, (0.0, 0.5), (0.1, 1.0)), # Varying sizes, mu, and sigma + ], +) +def test_sample_gaussian_kernels(n_kernels, min_size, max_size, mu, sigma): + kernel_matrix, kernel_params = sample_gaussian_kernels(n_kernels, min_size, max_size, mu, sigma) + + # Ensure kernel_matrix and kernel_params have correct lengths + assert kernel_matrix.shape[0] == n_kernels + assert kernel_params["size"].shape[0] == n_kernels + + for i in range(n_kernels): + size = kernel_params["size"][i] + mu_sample = kernel_params["mu"][i] + sigma_sample = kernel_params["sigma"][i] + + # Extract and reshape the kernel + start_idx = kernel_matrix.indptr[i] + end_idx = kernel_matrix.indptr[i + 1] + kernel_vector = kernel_matrix.data[start_idx:end_idx] + kernel = kernel_vector.reshape(size, size) + + # Generate the expected Gaussian kernel + x = np.linspace(-1, 1, size) + y = np.linspace(-1, 1, size) + X, Y = np.meshgrid(x, y) + expected_kernel = gaussian_2d(X, Y, mu_sample, sigma_sample) + + # Normalize the expected kernel for comparison + expected_kernel /= expected_kernel.sum() + kernel /= kernel.sum() + + # Check that the kernel matches the expected Gaussian distribution + np.testing.assert_allclose(kernel, expected_kernel, rtol=1e-2, atol=1e-2) diff --git a/treeple/tree/tests/test_manifold.py b/treeple/tree/tests/test_manifold.py new file mode 100644 index 000000000..bdd53b821 --- /dev/null +++ b/treeple/tree/tests/test_manifold.py @@ -0,0 +1,83 @@ +import numpy as np +from numpy.testing import assert_array_equal +from sklearn import datasets + +from treeple.tree import ( + ExtraObliqueDecisionTreeClassifier, + ExtraObliqueDecisionTreeRegressor, + ObliqueDecisionTreeClassifier, + ObliqueDecisionTreeRegressor, + PatchObliqueDecisionTreeClassifier, + PatchObliqueDecisionTreeRegressor, + UnsupervisedDecisionTree, + UnsupervisedObliqueDecisionTree, +) + +ALL_TREES = [ + ExtraObliqueDecisionTreeRegressor, + ObliqueDecisionTreeRegressor, + PatchObliqueDecisionTreeRegressor, + ExtraObliqueDecisionTreeClassifier, + ObliqueDecisionTreeClassifier, + PatchObliqueDecisionTreeClassifier, + UnsupervisedDecisionTree, + UnsupervisedObliqueDecisionTree, +] + +rng = np.random.RandomState(1) + +# load digits dataset and randomly permute it +digits = datasets.load_digits() +perm = rng.permutation(digits.target.size) +digits.data = digits.data[perm] +digits.target = digits.target[perm] + + +def test_splitters(): + """Test that the patch splitter initializes and reshapes X correctly.""" + X, y = digits.data, digits.target + X = X.astype(np.float32) + sample_weight = np.ones(len(y), dtype=np.float64).squeeze() + y = y.reshape(-1, 1).astype(np.float64) + # missing_values_in_feature_mask = None + + print(X.shape, y.shape) + from treeple._lib.sklearn.tree._criterion import Gini + from treeple.tree.manifold._morf_splitter import 
BestPatchSplitterTester + + criterion = Gini(1, np.array((0, 1), dtype=np.intp)) + max_features = 6 + min_samples_leaf = 1 + min_weight_leaf = 0.0 + monotonic_cst = np.array([1, 1, 1, 1, 1, 1], dtype=np.int8) + random_state = np.random.RandomState(100) + boundary = None + feature_weight = None + min_patch_dims = np.array((1, 1), dtype=np.intp) + max_patch_dims = np.array((3, 1), dtype=np.intp) + dim_contiguous = np.array((True, True)) + data_dims = np.array((8, 8), dtype=np.intp) + + feature_combinations = 1.5 + + splitter = BestPatchSplitterTester( + criterion, + max_features, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + feature_combinations, + min_patch_dims, + max_patch_dims, + dim_contiguous, + data_dims, + boundary, + feature_weight, + ) + splitter.init_test( + X, + y, + sample_weight, + ) + assert_array_equal(splitter.X_reshaped.reshape(-1, 64), X)
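For readers tracing the splitter changes above, the patch-indexing idea is easier to see in plain NumPy than in the Cython diff: sample a patch size and a top-left seed, enumerate the per-dimension indices the patch covers, take their Cartesian product, and ravel each point into a column index of the flattened feature matrix. The sketch below is illustrative only and is not part of the patch; every name in it is made up for the example, and it assumes contiguous dimensions with boundary=None.

import itertools

import numpy as np

rng = np.random.RandomState(0)
data_dims = (8, 8)   # an 8x8 image flattened to 64 features, as in test_splitters above
patch_dims = (3, 2)  # a sampled patch height/width, within the min/max patch dims

# sample the top-left corner so the patch stays inside the image (boundary=None)
top_left = [rng.randint(0, d - p + 1) for d, p in zip(data_dims, patch_dims)]

# per-dimension indices covered by the patch (contiguous dimensions only;
# a discontiguous dimension would instead draw a shuffled subset of indices)
axis_indices = [range(t, t + p) for t, p in zip(top_left, patch_dims)]

# Cartesian product of the per-dimension indices, raveled into vectorized
# column indices; a weight-1 projection vector touches exactly these columns
raveled = [np.ravel_multi_index(point, data_dims) for point in itertools.product(*axis_indices)]
assert len(raveled) == patch_dims[0] * patch_dims[1]  # the patch volume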