diff --git a/build_requirements.txt b/build_requirements.txt index ddc179f56..3341505a4 100644 --- a/build_requirements.txt +++ b/build_requirements.txt @@ -3,7 +3,7 @@ meson-python cython==0.29.36 ninja numpy -scikit-learn>=1.3 +scikit-learn>=1.3.1 click rich-click doit diff --git a/pyproject.toml b/pyproject.toml index d9d901e32..98f6591d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ include = [ dependencies = [ 'numpy', 'scipy>=1.5.0', - 'scikit-learn>=1.3' + 'scikit-learn>=1.3.1' ] diff --git a/requirements.txt b/requirements.txt index 92f3a6b2b..dc42ea7e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ numpy>=1.25 scipy -scikit-learn>=1.3 +scikit-learn>=1.3.1 diff --git a/sktree/tree/_marginal.pyx b/sktree/tree/_marginal.pyx index 2aae3b021..0510f8883 100644 --- a/sktree/tree/_marginal.pyx +++ b/sktree/tree/_marginal.pyx @@ -140,7 +140,7 @@ cdef inline cnp.ndarray _apply_dense_marginal( cdef float32_t X_i_node_feature cdef float32_t n_node_samples, n_right_samples, n_left_samples - cdef double p_left + cdef float64_t p_left cdef intp_t is_left # Initialize output @@ -172,7 +172,7 @@ cdef inline cnp.ndarray _apply_dense_marginal( n_right_samples = tree.nodes[node.right_child].n_node_samples # compute the probabilies for going left and right - p_left = (n_left_samples / n_node_samples) + p_left = (n_left_samples / n_node_samples) # randomly sample a direction is_left = rand_weighted_binary(p_left, rand_r_state) diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index 18fc125c0..dc0137d05 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -25,10 +25,10 @@ cdef struct ObliqueSplitRecord: intp_t pos # Split samples array at the given position, # # i.e. count of samples below threshold for feature. # # pos is >= end if the node is a leaf. - double threshold # Threshold to split at. - double improvement # Impurity improvement given parent node. 
- double impurity_left # Impurity of the left split. - double impurity_right # Impurity of the right split. + float64_t threshold # Threshold to split at. + float64_t improvement # Impurity improvement given parent node. + float64_t impurity_left # Impurity of the left split. + float64_t impurity_right # Impurity of the right split. vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) vector[intp_t]* proj_vec_indices # indices of the features (max_features,) @@ -62,7 +62,7 @@ cdef class BaseObliqueSplitter(Splitter): self, intp_t start, intp_t end, - double* weighted_n_node_samples + float64_t* weighted_n_node_samples ) except -1 nogil cdef void compute_features_over_samples( @@ -77,11 +77,11 @@ cdef class BaseObliqueSplitter(Splitter): cdef intp_t node_split( self, - double impurity, # Impurity of the node + float64_t impurity, # Impurity of the node SplitRecord* split, intp_t* n_constant_features, - double lower_bound, - double upper_bound, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil cdef inline void fisher_yates_shuffle_memview( @@ -96,7 +96,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): # to split the samples samples[start:end]. 
# Oblique Splitting extra parameters - cdef public double feature_combinations # Number of features to combine + cdef public float64_t feature_combinations # Number of features to combine cdef intp_t n_non_zeros # Number of non-zero features cdef intp_t[::1] indices_to_sample # an array of indices to sample of size mtry X n_features @@ -113,11 +113,11 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): cdef class BestObliqueSplitter(ObliqueSplitter): cdef intp_t node_split( self, - double impurity, # Impurity of the node + float64_t impurity, # Impurity of the node SplitRecord* split, intp_t* n_constant_features, - double lower_bound, - double upper_bound, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil @@ -131,16 +131,16 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): cdef intp_t partition_samples( self, - double current_threshold + float64_t current_threshold ) noexcept nogil cdef intp_t node_split( self, - double impurity, # Impurity of the node + float64_t impurity, # Impurity of the node SplitRecord* split, intp_t* n_constant_features, - double lower_bound, - double upper_bound, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index efbd5d6b2..8ab51629b 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -13,7 +13,7 @@ from sklearn.tree._utils cimport rand_int, rand_uniform from .._lib.sklearn.tree._criterion cimport Criterion -cdef double INFINITY = np.inf +cdef float64_t INFINITY = np.inf # Mitigate precision differences between 32 bit and 64 bit cdef float32_t FEATURE_THRESHOLD = 1e-7 @@ -45,7 +45,7 @@ cdef class BaseObliqueSplitter(Splitter): pass cdef intp_t node_reset(self, intp_t start, intp_t end, - double* weighted_n_node_samples) except -1 nogil: + float64_t* weighted_n_node_samples) except -1 nogil: """Reset splitter on node samples[start:end]. 
Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -57,7 +57,7 @@ cdef class BaseObliqueSplitter(Splitter): The index of the first sample to consider end : intp_t The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=double pointer + weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ @@ -144,10 +144,10 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): Criterion criterion, intp_t max_features, intp_t min_samples_leaf, - double min_weight_leaf, + float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - double feature_combinations, + float64_t feature_combinations, *argv ): """ @@ -165,11 +165,11 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): which would result in having less samples in a leaf are not considered. - min_weight_leaf : double + min_weight_leaf : float64_t The minimal weight each leaf can have, where the weight is the sum of the weights of each sample in it. - feature_combinations : double + feature_combinations : float64_t The average number of features to combine in an oblique split. Each feature is independently included with probability ``feature_combination`` / ``n_features``. 
@@ -290,11 +290,11 @@ cdef class BestObliqueSplitter(ObliqueSplitter): cdef intp_t node_split( self, - double impurity, + float64_t impurity, SplitRecord* split, intp_t* n_constant_features, - double lower_bound, - double upper_bound, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil: """Find the best_split split on node samples[start:end] @@ -317,8 +317,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # keep track of split record for current_split node and the best_split split # found among the sampled projection vectors cdef ObliqueSplitRecord best_split, current_split - cdef double current_proxy_improvement = -INFINITY - cdef double best_proxy_improvement = -INFINITY + cdef float64_t current_proxy_improvement = -INFINITY + cdef float64_t best_proxy_improvement = -INFINITY cdef intp_t feat_i, p # index over computed features and start/end cdef intp_t partition_end @@ -472,7 +472,7 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): min_feature_value_out[0] = min_feature_value max_feature_value_out[0] = max_feature_value - cdef inline intp_t partition_samples(self, double current_threshold) noexcept nogil: + cdef inline intp_t partition_samples(self, float64_t current_threshold) noexcept nogil: """Partition samples for feature_values at the current_threshold.""" cdef: intp_t p = self.start @@ -496,11 +496,11 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): # overwrite the node_split method with random threshold selection cdef intp_t node_split( self, - double impurity, + float64_t impurity, SplitRecord* split, intp_t* n_constant_features, - double lower_bound, - double upper_bound, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil: """Find the best_split split on node samples[start:end] @@ -520,13 +520,13 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): cdef float32_t[::1] feature_values = self.feature_values cdef intp_t max_features = self.max_features cdef intp_t min_samples_leaf = self.min_samples_leaf - cdef 
double min_weight_leaf = self.min_weight_leaf + cdef float64_t min_weight_leaf = self.min_weight_leaf # keep track of split record for current_split node and the best_split split # found among the sampled projection vectors cdef ObliqueSplitRecord best_split, current_split - cdef double current_proxy_improvement = -INFINITY - cdef double best_proxy_improvement = -INFINITY + cdef float64_t current_proxy_improvement = -INFINITY + cdef float64_t best_proxy_improvement = -INFINITY cdef intp_t p cdef intp_t feat_i @@ -665,10 +665,10 @@ cdef class MultiViewSplitter(BestObliqueSplitter): Criterion criterion, intp_t max_features, intp_t min_samples_leaf, - double min_weight_leaf, + float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - double feature_combinations, + float64_t feature_combinations, const intp_t[:] feature_set_ends, intp_t n_feature_sets, *argv @@ -789,10 +789,10 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter): Criterion criterion, intp_t max_features, intp_t min_samples_leaf, - double min_weight_leaf, + float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - double feature_combinations, + float64_t feature_combinations, const intp_t[:] feature_set_ends, intp_t n_feature_sets, bint uniform_sampling, diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx index faed60b70..99a3d6fc0 100644 --- a/sktree/tree/_oblique_tree.pyx +++ b/sktree/tree/_oblique_tree.pyx @@ -72,13 +72,13 @@ cdef class ObliqueTree(Tree): feature : array of intp_t, shape [node_count] feature[i] holds the feature to split on, for the internal node i. - threshold : array of double, shape [node_count] + threshold : array of float64_t, shape [node_count] threshold[i] holds the threshold for the internal node i. 
- value : array of double, shape [node_count, n_outputs, max_n_classes] + value : array of float64_t, shape [node_count, n_outputs, max_n_classes] Contains the constant prediction value of each node. - impurity : array of double, shape [node_count] + impurity : array of float64_t, shape [node_count] impurity[i] holds the impurity (i.e., the value of the splitting criterion) at node i. @@ -179,7 +179,7 @@ cdef class ObliqueTree(Tree): memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), self.capacity * sizeof(Node)) memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) + self.capacity * self.value_stride * sizeof(float64_t)) cpdef cnp.ndarray get_projection_matrix(self): """Get the projection matrix of shape (node_count, n_features).""" @@ -220,7 +220,7 @@ cdef class ObliqueTree(Tree): # value memory is initialised to 0 to enable classifier argmax if capacity > self.capacity: memset((self.value + self.capacity * self.value_stride), 0, - (capacity - self.capacity) * self.value_stride * sizeof(double)) + (capacity - self.capacity) * self.value_stride * sizeof(float64_t)) # if capacity smaller than node_count, adjust the counter if capacity < self.node_count: diff --git a/sktree/tree/_utils.pxd b/sktree/tree/_utils.pxd index 4278d9f56..c814cc166 100644 --- a/sktree/tree/_utils.pxd +++ b/sktree/tree/_utils.pxd @@ -4,8 +4,8 @@ cimport numpy as cnp cnp.import_array() -from sktree._lib.sklearn.tree._splitter cimport SplitRecord -from sktree._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t +from .._lib.sklearn.tree._splitter cimport SplitRecord +from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t cdef int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil diff --git a/sktree/tree/_utils.pyx b/sktree/tree/_utils.pyx index 7e2ca3bd1..197b82ecf 100644 --- a/sktree/tree/_utils.pyx +++ b/sktree/tree/_utils.pyx @@ -11,7 +11,7 @@ 
cimport numpy as cnp cnp.import_array() -from sktree._lib.sklearn.tree._utils cimport rand_uniform +from .._lib.sklearn.tree._utils cimport rand_uniform cdef inline int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil: diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index a687e4028..409b7d2c7 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ -45,7 +45,7 @@ cdef class PatchSplitter(BestObliqueSplitter): self, intp_t start, intp_t end, - double* weighted_n_node_samples + float64_t* weighted_n_node_samples ) except -1 nogil: """Reset splitter on node samples[start:end]. @@ -58,7 +58,7 @@ cdef class PatchSplitter(BestObliqueSplitter): The index of the first sample to consider end : intp_t The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=double pointer + weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ @@ -122,10 +122,10 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): Criterion criterion, intp_t max_features, intp_t min_samples_leaf, - double min_weight_leaf, + float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - double feature_combinations, + float64_t feature_combinations, const intp_t[:] min_patch_dims, const intp_t[:] max_patch_dims, const cnp.uint8_t[:] dim_contiguous, diff --git a/sktree/tree/unsupervised/_unsup_criterion.pxd b/sktree/tree/unsupervised/_unsup_criterion.pxd index ba64b6d6c..8ef633875 100644 --- a/sktree/tree/unsupervised/_unsup_criterion.pxd +++ b/sktree/tree/unsupervised/_unsup_criterion.pxd @@ -33,13 +33,13 @@ cdef class UnsupervisedCriterion(BaseCriterion): # the left and right node. For example, this can then efficiently compute the # mean of the node, and left/right child by subtracting relevant Xf elements # and then dividing by the total number of samples in the node and left/right child. 
- cdef double sum_total # The sum of the weighted count of each feature. - cdef double sum_left # Same as above, but for the left side of the split - cdef double sum_right # Same as above, but for the right side of the split + cdef float64_t sum_total # The sum of the weighted count of each feature. + cdef float64_t sum_left # Same as above, but for the left side of the split + cdef float64_t sum_right # Same as above, but for the right side of the split - cdef double sumsq_total # The sum of the weighted count of each feature. - cdef double sumsq_left # Same as above, but for the left side of the split - cdef double sumsq_right # Same as above, but for the right side of the split + cdef float64_t sumsq_total # The weighted sum of squared feature values (second moment used by fast_variance). + cdef float64_t sumsq_left # Same as above, but for the left side of the split + cdef float64_t sumsq_right # Same as above, but for the right side of the split # Methods # ------- @@ -50,7 +50,7 @@ cdef class UnsupervisedCriterion(BaseCriterion): self, const float32_t[:] feature_values, const float64_t[:] sample_weight, - double weighted_n_samples, + float64_t weighted_n_samples, const intp_t[:] samples, ) except -1 nogil diff --git a/sktree/tree/unsupervised/_unsup_criterion.pyx b/sktree/tree/unsupervised/_unsup_criterion.pyx index 06b82881d..58f21f348 100644 --- a/sktree/tree/unsupervised/_unsup_criterion.pyx +++ b/sktree/tree/unsupervised/_unsup_criterion.pyx @@ -97,7 +97,7 @@ cdef class UnsupervisedCriterion(BaseCriterion): self, const float32_t[:] feature_values, const float64_t[:] sample_weight, - double weighted_n_samples, + float64_t weighted_n_samples, const intp_t[:] sample_indices, ) except -1 nogil: """Initialize the unsuperivsed criterion. @@ -111,7 +111,7 @@ cdef class UnsupervisedCriterion(BaseCriterion): The memoryview 1D feature vector with (n_samples,) shape. sample_weight : array-like, dtype=float64_t The weight of each sample (i.e. row of X). 
- weighted_n_samples : double + weighted_n_samples : float64_t The total weight of all sample_indices. sample_indices : array-like, dtype=intp_t A mask on the sample_indices, showing which ones we want to use @@ -225,13 +225,13 @@ cdef class UnsupervisedCriterion(BaseCriterion): cdef void node_value( self, - double* dest + float64_t* dest ) noexcept nogil: """Set the node value with sum_total and save it into dest. Parameters ---------- - dest : double pointer + dest : float64_t pointer The memory address which we will save the node value into. """ # set values at the address pointer is pointing to with the total value @@ -307,7 +307,7 @@ cdef class TwoMeans(UnsupervisedCriterion): pair minimizes the splitting criteria described in the following section """ - cdef double node_impurity( + cdef float64_t node_impurity( self ) noexcept nogil: """Evaluate the impurity of the current node. @@ -316,7 +316,7 @@ cdef class TwoMeans(UnsupervisedCriterion): i.e. the variance of Xf[sample_indices[start:end]]. The smaller the impurity the better. """ - cdef double impurity + cdef float64_t impurity # then compute the impurity as the variance impurity = self.fast_variance(self.weighted_n_node_samples, self.sumsq_total, self.sum_total) @@ -324,8 +324,8 @@ cdef class TwoMeans(UnsupervisedCriterion): cdef void children_impurity( self, - double* impurity_left, - double* impurity_right + float64_t* impurity_left, + float64_t* impurity_right ) noexcept nogil: """Evaluate the impurity in children nodes. 
@@ -340,9 +340,9 @@ cdef class TwoMeans(UnsupervisedCriterion): Parameters ---------- - impurity_left : double pointer + impurity_left : float64_t pointer The memory address to save the impurity of the left node - impurity_right : double pointer + impurity_right : float64_t pointer The memory address to save the impurity of the right node """ # set values at the address pointer is pointing to with the variance @@ -350,11 +350,11 @@ cdef class TwoMeans(UnsupervisedCriterion): impurity_left[0] = self.fast_variance(self.weighted_n_left, self.sumsq_left, self.sum_left) impurity_right[0] = self.fast_variance(self.weighted_n_right, self.sumsq_right, self.sum_right) - cdef inline double fast_variance( + cdef inline float64_t fast_variance( self, - double weighted_n_node_samples, - double sumsq_total, - double sum_total + float64_t weighted_n_node_samples, + float64_t sumsq_total, + float64_t sum_total ) noexcept nogil: return (1. / weighted_n_node_samples) * \ ((sumsq_total) - (1. / weighted_n_node_samples) * (sum_total * sum_total)) @@ -391,10 +391,10 @@ cdef class FastBIC(TwoMeans): Reference: https://arxiv.org/abs/1907.02844 """ - cdef inline double bic_cluster( + cdef inline float64_t bic_cluster( self, intp_t n_samples, - double variance + float64_t variance ) noexcept nogil: """Help compute the BIC from assigning to a specific cluster. @@ -402,7 +402,7 @@ cdef class FastBIC(TwoMeans): ---------- n_samples : intp_t The number of samples assigned cluster. - variance : double + variance : float64_t The plug-in variance for assigning to specific cluster. Notes @@ -421,13 +421,13 @@ cdef class FastBIC(TwoMeans): """ # chances of choosing the cluster based on how many samples are hard-assigned to cluster # i.e. 
the prior - # cast to double, so we do not round to integers - cdef double w_cluster = (n_samples + 0.0) / self.n_node_samples + # cast to float64_t, so we do not round to integers + cdef float64_t w_cluster = (n_samples + 0.0) / self.n_node_samples # add to prevent taking log of 0 when there is a degenerate cluster (i.e. single sample, or no variance) return -2. * (n_samples * log(w_cluster) + 0.5 * n_samples * log(2. * PI * variance + 1.e-7)) - cdef double node_impurity( + cdef float64_t node_impurity( self ) noexcept nogil: """Evaluate the impurity of the current node. @@ -437,8 +437,8 @@ cdef class FastBIC(TwoMeans): Namely, this is the maximum likelihood of Xf[sample_indices[start:end]]. The smaller the impurity the better. """ - cdef double variance - cdef double impurity + cdef float64_t variance + cdef float64_t impurity # then compute the variance of the cluster variance = self.fast_variance(self.weighted_n_node_samples, self.sumsq_total, self.sum_total) @@ -451,8 +451,8 @@ cdef class FastBIC(TwoMeans): cdef void children_impurity( self, - double* impurity_left, - double* impurity_right + float64_t* impurity_left, + float64_t* impurity_right ) noexcept nogil: """Evaluate the impurity in children nodes. 
@@ -461,9 +461,9 @@ cdef class FastBIC(TwoMeans): Parameters ---------- - impurity_left : double pointer + impurity_left : float64_t pointer The memory address to save the impurity of the left node - impurity_right : double pointer + impurity_right : float64_t pointer The memory address to save the impurity of the right node """ cdef intp_t pos = self.pos @@ -471,10 +471,10 @@ cdef class FastBIC(TwoMeans): cdef intp_t end = self.end cdef intp_t n_samples_left, n_samples_right - cdef double variance_left, variance_right, variance_comb - cdef double BIC_diff_var_left, BIC_diff_var_right - cdef double BIC_same_var_left, BIC_same_var_right - cdef double BIC_same_var, BIC_diff_var + cdef float64_t variance_left, variance_right, variance_comb + cdef float64_t BIC_diff_var_left, BIC_diff_var_right + cdef float64_t BIC_same_var_left, BIC_same_var_right + cdef float64_t BIC_same_var, BIC_diff_var # number of samples of left and right n_samples_left = pos - start diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd b/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd index cc50c200c..685db473a 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd @@ -14,10 +14,10 @@ cdef struct ObliqueSplitRecord: intp_t pos # Split samples array at the given position, # # i.e. count of samples below threshold for feature. # # pos is >= end if the node is a leaf. - double threshold # Threshold to split at. - double improvement # Impurity improvement given parent node. - double impurity_left # Impurity of the left split. - double impurity_right # Impurity of the right split. + float64_t threshold # Threshold to split at. + float64_t improvement # Impurity improvement given parent node. + float64_t impurity_left # Impurity of the left split. + float64_t impurity_right # Impurity of the right split. 
vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) vector[intp_t]* proj_vec_indices # indices of the features (max_features,) @@ -34,7 +34,7 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): """ # Oblique Splitting extra parameters - cdef public double feature_combinations # Number of features to combine + cdef public float64_t feature_combinations # Number of features to combine cdef intp_t n_non_zeros # Number of non-zero features cdef vector[vector[float32_t]] proj_mat_weights # nonzero weights of sparse proj_mat matrix cdef vector[vector[intp_t]] proj_mat_indices # nonzero indices of sparse proj_mat matrix @@ -49,15 +49,15 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): # Redefined here since the new logic requires calling sample_proj_mat cdef intp_t node_reset(self, intp_t start, intp_t end, - double* weighted_n_node_samples) except -1 nogil + float64_t* weighted_n_node_samples) except -1 nogil cdef intp_t node_split( self, - double impurity, # Impurity of the node + float64_t impurity, # Impurity of the node SplitRecord* split, intp_t* n_constant_features, - double lower_bound, - double upper_bound + float64_t lower_bound, + float64_t upper_bound ) except -1 nogil cdef intp_t init( self, diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index 65ccb9ec0..9b46c2c01 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -18,7 +18,7 @@ from .._sklearn_splitter cimport sort from ._unsup_criterion cimport UnsupervisedCriterion -cdef double INFINITY = np.inf +cdef float64_t INFINITY = np.inf # Mitigate precision differences between 32 bit and 64 bit cdef float32_t FEATURE_THRESHOLD = 1e-7 @@ -49,9 +49,9 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): UnsupervisedCriterion criterion, intp_t max_features, intp_t min_samples_leaf, - double 
min_weight_leaf, + float64_t min_weight_leaf, object random_state, - double feature_combinations, + float64_t feature_combinations, *argv ): """ @@ -69,11 +69,11 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): which would result in having less samples in a leaf are not considered. - min_weight_leaf : double + min_weight_leaf : float64_t The minimal weight each leaf can have, where the weight is the sum of the weights of each sample in it. - feature_combinations : double + feature_combinations : float64_t The average number of features to combine in an oblique split. Each feature is independently included with probability ``feature_combination`` / ``n_features``. @@ -129,7 +129,7 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): return 0 cdef intp_t node_reset(self, intp_t start, intp_t end, - double* weighted_n_node_samples) except -1 nogil: + float64_t* weighted_n_node_samples) except -1 nogil: """Reset splitter on node samples[start:end]. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -141,7 +141,7 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): The index of the first sample to consider end : intp_t The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=double pointer + weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ # call parent reset @@ -244,11 +244,11 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): cdef intp_t node_split( self, - double impurity, + float64_t impurity, SplitRecord* split, intp_t* n_constant_features, - double lower_bound, - double upper_bound, + float64_t lower_bound, + float64_t upper_bound, ) except -1 nogil: """Find the best_split split on node samples[start:end] @@ -267,13 +267,13 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): cdef float32_t[::1] feature_values = self.feature_values cdef intp_t max_features = self.max_features 
cdef intp_t min_samples_leaf = self.min_samples_leaf - cdef double min_weight_leaf = self.min_weight_leaf + cdef float64_t min_weight_leaf = self.min_weight_leaf # keep track of split record for current_split node and the best_split split # found among the sampled projection vectors cdef ObliqueSplitRecord best_split, current_split - cdef double current_proxy_improvement = -INFINITY - cdef double best_proxy_improvement = -INFINITY + cdef float64_t current_proxy_improvement = -INFINITY + cdef float64_t best_proxy_improvement = -INFINITY cdef intp_t feat_i, p # index over computed features and start/end cdef intp_t partition_end diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx index 5c2931f62..c15a4b844 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx @@ -70,13 +70,13 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): feature : array of intp_t, shape [node_count] feature[i] holds the feature to split on, for the internal node i. - threshold : array of double, shape [node_count] + threshold : array of float64_t, shape [node_count] threshold[i] holds the threshold for the internal node i. - value : array of double, shape [node_count, n_outputs, max_n_classes] + value : array of float64_t, shape [node_count, n_outputs, max_n_classes] Contains the constant prediction value of each node. - impurity : array of double, shape [node_count] + impurity : array of float64_t, shape [node_count] impurity[i] holds the impurity (i.e., the value of the splitting criterion) at node i. 
@@ -159,7 +159,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), self.capacity * sizeof(Node)) memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) + self.capacity * self.value_stride * sizeof(float64_t)) cpdef cnp.ndarray get_projection_matrix(self): """Get the projection matrix of shape (node_count, n_features).""" @@ -200,7 +200,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): # value memory is initialised to 0 to enable classifier argmax if capacity > self.capacity: memset((self.value + self.capacity * self.value_stride), 0, - (capacity - self.capacity) * self.value_stride * sizeof(double)) + (capacity - self.capacity) * self.value_stride * sizeof(float64_t)) # if capacity smaller than node_count, adjust the counter if capacity < self.node_count: diff --git a/sktree/tree/unsupervised/_unsup_splitter.pxd b/sktree/tree/unsupervised/_unsup_splitter.pxd index 770ddafc9..d7d7da756 100644 --- a/sktree/tree/unsupervised/_unsup_splitter.pxd +++ b/sktree/tree/unsupervised/_unsup_splitter.pxd @@ -34,20 +34,20 @@ cdef class UnsupervisedSplitter(BaseSplitter): self, intp_t start, intp_t end, - double* weighted_n_node_samples + float64_t* weighted_n_node_samples ) except -1 nogil cdef intp_t node_split( self, - double impurity, # Impurity of the node + float64_t impurity, # Impurity of the node SplitRecord* split, intp_t* n_constant_features, - double lower_bound, - double upper_bound + float64_t lower_bound, + float64_t upper_bound ) except -1 nogil cdef void node_value( self, - double* dest + float64_t* dest ) noexcept nogil - cdef double node_impurity( + cdef float64_t node_impurity( self ) noexcept nogil diff --git a/sktree/tree/unsupervised/_unsup_splitter.pyx b/sktree/tree/unsupervised/_unsup_splitter.pyx index c996b0978..0ff20b6b8 100644 --- a/sktree/tree/unsupervised/_unsup_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_splitter.pyx @@ -16,7 
+16,7 @@ from sklearn.tree._utils cimport RAND_R_MAX, rand_int from .._sklearn_splitter cimport sort -cdef double INFINITY = np.inf +cdef float64_t INFINITY = np.inf # Mitigate precision differences between 32 bit and 64 bit cdef float32_t FEATURE_THRESHOLD = 1e-7 @@ -38,7 +38,7 @@ cdef class UnsupervisedSplitter(BaseSplitter): """Base class for unsupervised splitters.""" def __cinit__(self, UnsupervisedCriterion criterion, intp_t max_features, - intp_t min_samples_leaf, double min_weight_leaf, + intp_t min_samples_leaf, float64_t min_weight_leaf, object random_state, *argv): """ Parameters @@ -55,7 +55,7 @@ cdef class UnsupervisedSplitter(BaseSplitter): which would result in having less samples in a leaf are not considered. - min_weight_leaf : double + min_weight_leaf : float64_t The minimal weight each leaf can have, where the weight is the sum of the weights of each sample in it. @@ -93,7 +93,7 @@ cdef class UnsupervisedSplitter(BaseSplitter): cdef intp_t[::1] samples = self.samples cdef intp_t i, j - cdef double weighted_n_samples = 0.0 + cdef float64_t weighted_n_samples = 0.0 j = 0 for i in range(n_samples): @@ -137,7 +137,7 @@ cdef class UnsupervisedSplitter(BaseSplitter): return 0 cdef intp_t node_reset(self, intp_t start, intp_t end, - double* weighted_n_node_samples) except -1 nogil: + float64_t* weighted_n_node_samples) except -1 nogil: """Reset splitter on node samples[start:end]. 
Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -149,7 +149,7 @@ cdef class UnsupervisedSplitter(BaseSplitter): The index of the first sample to consider end : intp_t The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=double pointer + weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ @@ -161,12 +161,12 @@ cdef class UnsupervisedSplitter(BaseSplitter): weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef void node_value(self, double* dest) noexcept nogil: + cdef void node_value(self, float64_t* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" self.criterion.node_value(dest) - cdef double node_impurity(self) noexcept nogil: + cdef float64_t node_impurity(self) noexcept nogil: """Return the impurity of the current_split node.""" return self.criterion.node_impurity() @@ -176,11 +176,11 @@ cdef class BestUnsupervisedSplitter(UnsupervisedSplitter): """ cdef intp_t node_split( self, - double impurity, + float64_t impurity, SplitRecord* split, intp_t* n_constant_features, - double lower_bound, - double upper_bound + float64_t lower_bound, + float64_t upper_bound ) except -1 nogil: """Find the best_split split on node samples[start:end]. 
@@ -209,11 +209,11 @@ cdef class BestUnsupervisedSplitter(UnsupervisedSplitter): cdef uint32_t* random_state = &self.rand_r_state # XXX: maybe need to rename to something else - cdef double min_weight_leaf = self.min_weight_leaf + cdef float64_t min_weight_leaf = self.min_weight_leaf cdef SplitRecord best_split, current_split - cdef double current_proxy_improvement = -INFINITY - cdef double best_proxy_improvement = -INFINITY + cdef float64_t current_proxy_improvement = -INFINITY + cdef float64_t best_proxy_improvement = -INFINITY cdef intp_t f_i = n_features cdef intp_t f_j diff --git a/sktree/tree/unsupervised/_unsup_tree.pxd b/sktree/tree/unsupervised/_unsup_tree.pxd index 8b1851fc8..fe4a4630c 100644 --- a/sktree/tree/unsupervised/_unsup_tree.pxd +++ b/sktree/tree/unsupervised/_unsup_tree.pxd @@ -24,7 +24,7 @@ cdef class UnsupervisedTree(BaseTree): # # Inner structures: values are stored separately from node structure, # since size is determined at runtime. - # cdef double* value # (capacity) array of values + # cdef float64_t* value # (capacity) array of values # cdef intp_t value_stride # = 1 # Input/Output layout @@ -69,9 +69,9 @@ cdef class UnsupervisedTreeBuilder: cdef intp_t min_samples_split # Minimum number of samples in an internal node cdef intp_t min_samples_leaf # Minimum number of samples in a leaf - cdef double min_weight_leaf # Minimum weight in a leaf + cdef float64_t min_weight_leaf # Minimum weight in a leaf cdef intp_t max_depth # Maximal tree depth - cdef double min_impurity_decrease # Impurity threshold for early stopping + cdef float64_t min_impurity_decrease # Impurity threshold for early stopping cpdef build( self, diff --git a/sktree/tree/unsupervised/_unsup_tree.pyx b/sktree/tree/unsupervised/_unsup_tree.pyx index dbb289fdb..6361e829a 100644 --- a/sktree/tree/unsupervised/_unsup_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_tree.pyx @@ -51,8 +51,8 @@ from numpy import float32 as DTYPE from numpy import float64 as DOUBLE -cdef double 
INFINITY = np.inf -cdef double EPSILON = np.finfo('double').eps +cdef float64_t INFINITY = np.inf +cdef float64_t EPSILON = np.finfo('float64').eps # Some handy constants (BestFirstTreeBuilder) cdef intp_t IS_FIRST = 1 @@ -127,10 +127,10 @@ cdef struct FrontierRecord: intp_t pos intp_t depth bint is_leaf - double impurity - double impurity_left - double impurity_right - double improvement + float64_t impurity + float64_t impurity_left + float64_t impurity_right + float64_t improvement # Depth first builder --------------------------------------------------------- # A record on the stack for depth-first tree growing @@ -140,7 +140,7 @@ cdef struct StackRecord: intp_t depth intp_t parent bint is_left - double impurity + float64_t impurity intp_t n_constant_features cdef inline bool _compare_records( @@ -171,10 +171,10 @@ cdef class UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): UnsupervisedSplitter splitter, intp_t min_samples_split, intp_t min_samples_leaf, - double min_weight_leaf, + float64_t min_weight_leaf, intp_t max_depth, intp_t max_leaf_nodes, - double min_impurity_decrease + float64_t min_impurity_decrease ): self.splitter = splitter self.min_samples_split = min_samples_split @@ -291,7 +291,7 @@ cdef class UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): UnsupervisedTree tree, intp_t start, intp_t end, - double impurity, + float64_t impurity, bint is_first, bint is_left, Node* parent, @@ -308,8 +308,8 @@ cdef class UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): cdef intp_t node_id cdef intp_t n_node_samples cdef intp_t n_constant_features = 0 - cdef double min_impurity_decrease = self.min_impurity_decrease - cdef double weighted_n_node_samples + cdef float64_t min_impurity_decrease = self.min_impurity_decrease + cdef float64_t weighted_n_node_samples cdef bint is_leaf splitter.node_reset(start, end, &weighted_n_node_samples) @@ -384,9 +384,9 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder):
UnsupervisedSplitter splitter, intp_t min_samples_split, intp_t min_samples_leaf, - double min_weight_leaf, + float64_t min_weight_leaf, intp_t max_depth, - double min_impurity_decrease + float64_t min_impurity_decrease ): self.splitter = splitter self.min_samples_split = min_samples_split @@ -420,9 +420,9 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): cdef UnsupervisedSplitter splitter = self.splitter cdef intp_t max_depth = self.max_depth cdef intp_t min_samples_leaf = self.min_samples_leaf - cdef double min_weight_leaf = self.min_weight_leaf + cdef float64_t min_weight_leaf = self.min_weight_leaf cdef intp_t min_samples_split = self.min_samples_split - cdef double min_impurity_decrease = self.min_impurity_decrease + cdef float64_t min_impurity_decrease = self.min_impurity_decrease # Recursive partition (without actual recursion) splitter.init(X, sample_weight) @@ -433,7 +433,7 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): cdef intp_t parent cdef bint is_left cdef intp_t n_node_samples = splitter.n_samples - cdef double weighted_n_node_samples + cdef float64_t weighted_n_node_samples cdef intp_t node_id # initialize record to keep track of split node data and a pointer to the @@ -442,7 +442,7 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): cdef SplitRecord split cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) - cdef double impurity = INFINITY + cdef float64_t impurity = INFINITY cdef intp_t n_constant_features cdef bint is_leaf cdef bint first = 1 @@ -597,20 +597,20 @@ cdef class UnsupervisedTree(BaseTree): feature : array of intp_t, shape [node_count] feature[i] holds the feature to split on, for the internal node i. - threshold : array of double, shape [node_count] + threshold : array of float64_t, shape [node_count] threshold[i] holds the threshold for the internal node i. 
- value : array of double, shape [node_count] + value : array of float64_t, shape [node_count] Contains the constant prediction value of each node. - impurity : array of double, shape [node_count] + impurity : array of float64_t, shape [node_count] impurity[i] holds the impurity (i.e., the value of the splitting criterion) at node i. n_node_samples : array of intp_t, shape [node_count] n_node_samples[i] holds the number of training samples reaching node i. - weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples : array of float64_t, shape [node_count] weighted_n_node_samples[i] holds the weighted number of training samples reaching node i. """ @@ -718,7 +718,7 @@ cdef class UnsupervisedTree(BaseTree): memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), self.capacity * sizeof(Node)) memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) + self.capacity * self.value_stride * sizeof(float64_t)) cdef cnp.ndarray _get_value_ndarray(self): """Wraps value as a 3-d NumPy array.