MAINT Submoduletest #250

Closed
wants to merge 5 commits
2 changes: 1 addition & 1 deletion sktree/_lib/sklearn_fork
Submodule sklearn_fork updated 157 files
3 changes: 1 addition & 2 deletions sktree/tree/_marginal.pxd
@@ -3,8 +3,7 @@ import numpy as np
cimport numpy as cnp

from .._lib.sklearn.tree._tree cimport BaseTree, Node
from .._lib.sklearn.tree._utils cimport UINT32_t
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint32_t


cpdef apply_marginal_tree(
6 changes: 3 additions & 3 deletions sktree/tree/_marginal.pyx
@@ -65,7 +65,7 @@ cpdef apply_marginal_tree(
cdef intp_t n_marginals = marginal_indices.shape[0]

# sklearn_rand_r random number state
cdef UINT32_t rand_r_state = random_state.randint(0, RAND_R_MAX)
cdef uint32_t rand_r_state = random_state.randint(0, RAND_R_MAX)

# define a set of all marginal indices
cdef unordered_set[intp_t] marginal_indices_map
@@ -108,7 +108,7 @@ cdef inline cnp.ndarray _apply_dense_marginal(
unordered_set[intp_t] marginal_indices_map,
intp_t traversal_method,
unsigned char use_sample_weight,
UINT32_t* rand_r_state
uint32_t* rand_r_state
):
"""Finds the terminal region (=leaf node) for each sample in X.

@@ -131,7 +131,7 @@ cdef inline cnp.ndarray _apply_dense_marginal(
use_sample_weight : unsigned char
Whether or not to use the weighted number of samples
in each node.
rand_r_state : UINT32_t
rand_r_state : uint32_t
The random number state.
"""
# Extract input
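
Aside: the `rand_r_state` initialization in this file follows sklearn's usual pattern of drawing a 32-bit seed from the Python-level `random_state` object and handing it to the C-level generator. A pure-Python sketch of that seeding step (assuming `RAND_R_MAX` is sklearn's 0x7FFFFFFF constant, which is not shown in this diff):

import numpy as np

RAND_R_MAX = 0x7FFFFFFF  # assumed value of sklearn's RAND_R_MAX constant

def seed_rand_r(random_state: np.random.RandomState) -> int:
    # Mirrors `cdef uint32_t rand_r_state = random_state.randint(0, RAND_R_MAX)`:
    # draw one integer in [0, RAND_R_MAX) to seed the C-level generator.
    return int(random_state.randint(0, RAND_R_MAX))

# Example usage
rng = np.random.RandomState(42)
print(seed_rand_r(rng))
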
22 changes: 6 additions & 16 deletions sktree/tree/_oblique_splitter.pxd
@@ -10,13 +10,12 @@

import numpy as np

cimport numpy as cnp
from libcpp.vector cimport vector

from .._lib.sklearn.tree._criterion cimport Criterion
from .._lib.sklearn.tree._splitter cimport SplitRecord, Splitter
from .._lib.sklearn.tree._utils cimport UINT32_t
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t
from .._lib.sklearn.tree._tree cimport ParentInfo
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int8_t, intp_t, uint32_t
from ._sklearn_splitter cimport sort


@@ -30,11 +29,8 @@ cdef struct ObliqueSplitRecord:
float64_t improvement # Impurity improvement given parent node.
float64_t impurity_left # Impurity of the left split.
float64_t impurity_right # Impurity of the right split.
float64_t lower_bound # Lower bound on value of both children for monotonicity
float64_t upper_bound # Upper bound on value of both children for monotonicity
unsigned char missing_go_to_left # Controls if missing values go to the left node.
intp_t n_missing # Number of missing values for the feature being split on
intp_t n_constant_features # Number of constant features in the split

vector[float32_t]* proj_vec_weights # weights of the vector (max_features,)
vector[intp_t]* proj_vec_indices # indices of the features (max_features,)
@@ -83,17 +79,15 @@ cdef class BaseObliqueSplitter(Splitter):

cdef int node_split(
self,
float64_t impurity, # Impurity of the node
ParentInfo* parent,
SplitRecord* split,
float64_t lower_bound,
float64_t upper_bound,
) except -1 nogil

cdef inline void fisher_yates_shuffle_memview(
self,
intp_t[::1] indices_to_sample,
intp_t grid_size,
UINT32_t* random_state
uint32_t* random_state
) noexcept nogil

cdef class ObliqueSplitter(BaseObliqueSplitter):
@@ -118,10 +112,8 @@ cdef class ObliqueSplitter(BaseObliqueSplitter):
cdef class BestObliqueSplitter(ObliqueSplitter):
cdef int node_split(
self,
float64_t impurity, # Impurity of the node
ParentInfo* parent,
SplitRecord* split,
float64_t lower_bound,
float64_t upper_bound,
) except -1 nogil


@@ -140,10 +132,8 @@ cdef class RandomObliqueSplitter(ObliqueSplitter):

cdef int node_split(
self,
float64_t impurity, # Impurity of the node
ParentInfo* parent,
SplitRecord* split,
float64_t lower_bound,
float64_t upper_bound,
) except -1 nogil


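
The signature change above replaces the separate `impurity`, `lower_bound`, and `upper_bound` arguments of `node_split` with a single `ParentInfo*` record. A rough pure-Python sketch of that refactor (the field names below are inferred from the accesses visible in this diff, `impurity` and `n_constant_features`, plus the bounds the old signature carried; the real struct lives in the sklearn fork's `_tree.pxd` and may differ):

from dataclasses import dataclass

@dataclass
class ParentInfo:
    # Hypothetical Python stand-in for the Cython ParentInfo struct.
    impurity: float = float("inf")
    n_constant_features: int = 0
    lower_bound: float = float("-inf")
    upper_bound: float = float("inf")

def node_split(parent: ParentInfo, split: dict) -> int:
    # Old API: node_split(impurity, split, lower_bound, upper_bound)
    # New API: everything about the parent node travels in one record.
    impurity = parent.impurity
    split["improvement"] = -impurity  # placeholder for the real split search
    parent.n_constant_features = 0    # splitter reports back through the same record
    return 0

# Example usage
parent = ParentInfo(impurity=0.5)
split = {}
node_split(parent, split)
print(parent.n_constant_features)
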
46 changes: 30 additions & 16 deletions sktree/tree/_oblique_splitter.pyx
@@ -30,6 +30,8 @@ cdef inline void _init_split(ObliqueSplitRecord* self, intp_t start_pos) noexcep
self.feature = 0
self.threshold = 0.
self.improvement = -INFINITY
self.missing_go_to_left = False
self.n_missing = 0


cdef class BaseObliqueSplitter(Splitter):
@@ -128,7 +130,7 @@ cdef class BaseObliqueSplitter(Splitter):
self,
intp_t[::1] indices_to_sample,
intp_t grid_size,
UINT32_t* random_state,
uint32_t* random_state,
) noexcept nogil:
cdef intp_t i, j

@@ -146,7 +148,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter):
intp_t min_samples_leaf,
float64_t min_weight_leaf,
object random_state,
const cnp.int8_t[:] monotonic_cst,
const int8_t[:] monotonic_cst,
float64_t feature_combinations,
*argv
):
@@ -245,7 +247,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter):
"""
cdef intp_t n_features = self.n_features
cdef intp_t n_non_zeros = self.n_non_zeros
cdef UINT32_t* random_state = &self.rand_r_state
cdef uint32_t* random_state = &self.rand_r_state

cdef intp_t i, feat_i, proj_i, rand_vec_index
cdef float32_t weight
@@ -290,10 +292,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter):

cdef int node_split(
self,
float64_t impurity,
ParentInfo* parent_record,
SplitRecord* split,
float64_t lower_bound,
float64_t upper_bound,
) except -1 nogil:
"""Find the best_split split on node samples[start:end]

@@ -319,6 +319,11 @@ cdef class BestObliqueSplitter(ObliqueSplitter):
cdef float64_t current_proxy_improvement = -INFINITY
cdef float64_t best_proxy_improvement = -INFINITY

with gil:
print("Accessing parent record")

cdef float64_t impurity = parent_record.impurity

cdef intp_t feat_i, p # index over computed features and start/end
cdef intp_t partition_end
cdef float32_t temp_d # to compute a projection feature value
@@ -430,7 +435,14 @@ cdef class BestObliqueSplitter(ObliqueSplitter):
deref(oblique_split).improvement = best_split.improvement
deref(oblique_split).impurity_left = best_split.impurity_left
deref(oblique_split).impurity_right = best_split.impurity_right
deref(oblique_split).n_constant_features = 0

with gil:
print("Segfaulting here...")
# XXX: Fix
parent_record.n_constant_features = 0

with gil:
print("Got past")
return 0

cdef class RandomObliqueSplitter(ObliqueSplitter):
@@ -496,10 +508,8 @@ cdef class RandomObliqueSplitter(ObliqueSplitter):
# overwrite the node_split method with random threshold selection
cdef int node_split(
self,
float64_t impurity,
ParentInfo* parent_record,
SplitRecord* split,
float64_t lower_bound,
float64_t upper_bound,
) except -1 nogil:
"""Find the best_split split on node samples[start:end]

@@ -513,7 +523,7 @@ cdef class RandomObliqueSplitter(ObliqueSplitter):
cdef intp_t[::1] samples = self.samples
cdef intp_t start = self.start
cdef intp_t end = self.end
cdef UINT32_t* random_state = &self.rand_r_state
cdef uint32_t* random_state = &self.rand_r_state

# pointer array to store feature values to split on
cdef float32_t[::1] feature_values = self.feature_values
@@ -534,6 +544,8 @@ cdef class RandomObliqueSplitter(ObliqueSplitter):
cdef float32_t min_feature_value
cdef float32_t max_feature_value

cdef float64_t impurity = parent_record.impurity

# Number of features discovered to be constant during the split search
# cdef intp_t n_found_constants = 0
# cdef intp_t n_known_constants = n_constant_features[0]
@@ -655,7 +667,9 @@ cdef class RandomObliqueSplitter(ObliqueSplitter):
deref(oblique_split).improvement = best_split.improvement
deref(oblique_split).impurity_left = best_split.impurity_left
deref(oblique_split).impurity_right = best_split.impurity_right
# deref(oblique_split).n_constant_features = 0

# XXX: Fix
parent_record.n_constant_features = 0
return 0


Expand All @@ -667,7 +681,7 @@ cdef class MultiViewSplitter(BestObliqueSplitter):
intp_t min_samples_leaf,
float64_t min_weight_leaf,
object random_state,
const cnp.int8_t[:] monotonic_cst,
const int8_t[:] monotonic_cst,
float64_t feature_combinations,
const intp_t[:] feature_set_ends,
intp_t n_feature_sets,
@@ -740,7 +754,7 @@ cdef class MultiViewSplitter(BestObliqueSplitter):
This proceeds as a normal sampling projection matrix,
but now also uniformly samples features from each feature set.
"""
cdef UINT32_t* random_state = &self.rand_r_state
cdef uint32_t* random_state = &self.rand_r_state
cdef intp_t feat_i, proj_i
cdef float32_t weight

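
As the `sample_proj_mat` docstring above notes, the multi-view splitter samples features uniformly within each feature set rather than across all columns at once. A hedged pure-Python sketch of that per-set sampling, using the `feature_set_ends` boundaries passed to the constructor (the per-set budget and weighting in the Cython implementation may differ):

import numpy as np

def sample_features_per_set(feature_set_ends, n_per_set, random_state):
    """Uniformly sample `n_per_set` feature indices from each feature set.

    `feature_set_ends[i]` is the exclusive end column of feature set i,
    mirroring the `feature_set_ends` memoryview in MultiViewSplitter.
    """
    sampled = []
    start = 0
    for end in feature_set_ends:
        # Draw indices uniformly from [start, end) for this feature set.
        sampled.append(random_state.randint(start, end, size=n_per_set))
        start = end
    return np.concatenate(sampled)

# Example: two feature sets covering columns [0, 5) and [5, 12)
rng = np.random.RandomState(0)
print(sample_features_per_set([5, 12], n_per_set=3, random_state=rng))
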
@@ -839,7 +853,7 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter):
intp_t min_samples_leaf,
float64_t min_weight_leaf,
object random_state,
const cnp.int8_t[:] monotonic_cst,
const int8_t[:] monotonic_cst,
float64_t feature_combinations,
const intp_t[:] feature_set_ends,
intp_t n_feature_sets,
@@ -914,7 +928,7 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter):
"""
cdef intp_t n_features = self.n_features
cdef intp_t n_non_zeros = self.n_non_zeros
cdef UINT32_t* random_state = &self.rand_r_state
cdef uint32_t* random_state = &self.rand_r_state

cdef intp_t i, j, feat_i, proj_i, rand_vec_index
cdef float32_t weight
5 changes: 2 additions & 3 deletions sktree/tree/_utils.pxd
@@ -5,11 +5,10 @@ cimport numpy as cnp
cnp.import_array()

from .._lib.sklearn.tree._splitter cimport SplitRecord
from .._lib.sklearn.tree._utils cimport UINT32_t
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t


cdef int rand_weighted_binary(float64_t p0, UINT32_t* random_state) noexcept nogil
cdef int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil

cpdef unravel_index(
intp_t index, cnp.ndarray[intp_t, ndim=1] shape
4 changes: 2 additions & 2 deletions sktree/tree/_utils.pyx
@@ -14,14 +14,14 @@ cnp.import_array()
from .._lib.sklearn.tree._utils cimport rand_uniform


cdef inline int rand_weighted_binary(float64_t p0, UINT32_t* random_state) noexcept nogil:
cdef inline int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil:
"""Sample from integers 0 and 1 with different probabilities.

Parameters
----------
p0 : float64_t
The probability of sampling 0.
random_state : UINT32_t*
random_state : uint32_t*
The random state.
"""
cdef float64_t random_value = rand_uniform(0.0, 1.0, random_state)
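
For reference, a pure-Python analogue of `rand_weighted_binary` above (a sketch only — the Cython body is cut off in this view, so the comparison against `p0` is assumed from the docstring's definition of `p0` as the probability of sampling 0):

import numpy as np

def rand_weighted_binary(p0: float, random_state: np.random.RandomState) -> int:
    """Return 0 with probability p0 and 1 with probability 1 - p0."""
    random_value = random_state.uniform(0.0, 1.0)
    return 0 if random_value < p0 else 1

# Example usage: roughly 80% of draws should be 0
rng = np.random.RandomState(0)
print([rand_weighted_binary(0.8, rng) for _ in range(10)])
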
12 changes: 2 additions & 10 deletions sktree/tree/manifold/_morf_splitter.pxd
@@ -10,12 +10,10 @@

import numpy as np

cimport numpy as cnp
from libcpp.vector cimport vector

from ..._lib.sklearn.tree._splitter cimport SplitRecord
from ..._lib.sklearn.tree._utils cimport UINT32_t
from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t
from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int8_t, intp_t, uint8_t, uint32_t
from .._oblique_splitter cimport BestObliqueSplitter, ObliqueSplitRecord

# https://github.com/cython/cython/blob/master/Cython/Includes/libcpp/algorithm.pxd
@@ -29,12 +27,6 @@ from .._oblique_splitter cimport BestObliqueSplitter, ObliqueSplitRecord
# discrete_distribution(T first, T last) except +
# operator()(&G) except +

# XXX: replace with from libcpp.algorithm cimport swap
# when Cython 3.0 is released
cdef extern from "<algorithm>" namespace "std" nogil:
void swap[T](T& a, T& b) except + # array overload also works


cdef class PatchSplitter(BestObliqueSplitter):
# The PatchSplitter creates candidate feature values by sampling 2D patches from
# an input data vector. The input data is vectorized, so `data_height` and
Expand All @@ -53,7 +45,7 @@ cdef class PatchSplitter(BestObliqueSplitter):
cdef const intp_t[:] data_dims # The dimensions of the input data
cdef const intp_t[:] min_patch_dims # The minimum size of the patch to sample in each dimension
cdef const intp_t[:] max_patch_dims # The maximum size of the patch to sample in each dimension
cdef const cnp.uint8_t[:] dim_contiguous # A boolean array indicating whether each dimension is contiguous
cdef const uint8_t[:] dim_contiguous # A boolean array indicating whether each dimension is contiguous

# TODO: check if this works and is necessary for discontiguous data
# cdef intp_t[:] stride_offsets # The stride offsets for each dimension
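
The comment block above describes the PatchSplitter's job: sample a 2D patch from the vectorized input and turn it into candidate feature indices. A minimal NumPy sketch of that idea for a contiguous 2D patch (the real splitter also handles discontiguous dimensions via `dim_contiguous` and boundary modes, which are omitted here; all names below are illustrative):

import numpy as np

def sample_patch_indices(data_dims, min_patch_dims, max_patch_dims, random_state):
    """Sample a random axis-aligned patch and return its flat (vectorized) indices.

    `data_dims` is the (height, width) of the original 2D data, matching the
    `data_dims` memoryview in PatchSplitter; the input rows are assumed to be
    the row-major flattening of that 2D array.
    """
    # Draw a patch size in each dimension, within [min, max].
    patch_dims = [random_state.randint(lo, hi + 1)
                  for lo, hi in zip(min_patch_dims, max_patch_dims)]
    # Draw the top-left corner so the patch stays inside the data.
    top_left = [random_state.randint(0, d - p + 1)
                for d, p in zip(data_dims, patch_dims)]
    rows = np.arange(top_left[0], top_left[0] + patch_dims[0])
    cols = np.arange(top_left[1], top_left[1] + patch_dims[1])
    # Map 2D patch coordinates back to indices in the vectorized input.
    return (rows[:, None] * data_dims[1] + cols[None, :]).ravel()

# Example usage on an 8x8 image with patches between 2x2 and 4x4
rng = np.random.RandomState(0)
print(sample_patch_indices((8, 8), (2, 2), (4, 4), rng))
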
12 changes: 4 additions & 8 deletions sktree/tree/manifold/_morf_splitter.pyx
@@ -7,10 +7,6 @@

import numpy as np

cimport numpy as cnp

cnp.import_array()

from cython.operator cimport dereference as deref
from libcpp.vector cimport vector

@@ -124,11 +120,11 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
intp_t min_samples_leaf,
float64_t min_weight_leaf,
object random_state,
const cnp.int8_t[:] monotonic_cst,
const int8_t[:] monotonic_cst,
float64_t feature_combinations,
const intp_t[:] min_patch_dims,
const intp_t[:] max_patch_dims,
const cnp.uint8_t[:] dim_contiguous,
const uint8_t[:] dim_contiguous,
const intp_t[:] data_dims,
bytes boundary,
const float32_t[:, :] feature_weight,
@@ -212,7 +208,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
cdef intp_t top_left_patch_seed
cdef intp_t patch_size = 1

cdef UINT32_t* random_state = &self.rand_r_state
cdef uint32_t* random_state = &self.rand_r_state

# define parameters for the random patch
cdef intp_t patch_dim
@@ -315,7 +311,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
intp_t top_left_patch_seed,
const intp_t[:] patch_dims,
) noexcept nogil:
cdef UINT32_t* random_state = &self.rand_r_state
cdef uint32_t* random_state = &self.rand_r_state
# iterates over the size of the patch
cdef intp_t patch_idx
