"""
Manifold forests of tree-based ensemble methods.

These include various random forest methods that operate on manifolds.
"""
# Authors: Adam Li <adam2392@gmail.com>
#
# License: BSD 3 clause
from warnings import warn
import numpy as np
from scipy.sparse import issparse
from sklearn.base import ClusterMixin, TransformerMixin
from sklearn.cluster import AgglomerativeClustering
from sklearn.ensemble._forest import (
MAX_INT,
_generate_unsampled_indices,
_get_n_samples_bootstrap,
_parallel_build_trees,
)
from sklearn.metrics import calinski_harabasz_score
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import _check_sample_weight, check_is_fitted, check_random_state
from sktree._lib.sklearn.ensemble._forest import BaseForest
from sktree._lib.sklearn.tree._tree import DTYPE
from sktree.tree import UnsupervisedDecisionTree, UnsupervisedObliqueDecisionTree
from ..tree._neighbors import SimMatrixMixin
class ForestCluster(SimMatrixMixin, TransformerMixin, ClusterMixin, BaseForest):
"""Unsupervised forest base class."""
def __init__(
self,
estimator,
n_estimators=100,
*,
estimator_params=tuple(),
bootstrap=False,
oob_score=False,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=False,
max_samples=None,
) -> None:
super().__init__(
estimator=estimator,
n_estimators=n_estimators,
estimator_params=estimator_params,
bootstrap=bootstrap,
oob_score=oob_score,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
max_samples=max_samples,
)
def fit(self, X, y=None, sample_weight=None):
"""
Fit estimator and compute cluster labels on ``X``.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Use ``dtype=np.float32`` for maximum
efficiency. Sparse matrices are also supported, use sparse
``csc_matrix`` for maximum efficiency.
y : Ignored
Not used, present for API consistency by convention.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted. Splits
that would create child nodes with net zero or negative weight are
ignored while searching for a split in each node.
Returns
-------
self : object
Returns the instance itself.
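Examples
--------
A minimal usage sketch with the concrete subclass defined below
(the cluster labels themselves depend on the data and the seed):

>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=100, centers=2, random_state=0)
>>> est = UnsupervisedRandomForest(n_estimators=10, random_state=0)
>>> est.fit(X).labels_.shape
(100,)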
"""
self._validate_params()
# Validate or convert input data
X = self._validate_data(
X,
dtype=DTYPE, # accept_sparse="csc",
)
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)
if issparse(X):
# Pre-sort indices to avoid that each individual tree of the
# ensemble sorts the indices.
X.sort_indices()
if not self.bootstrap and self.max_samples is not None:
raise ValueError(
"`max_samples` cannot be set if `bootstrap=False`. "
"Either switch to `bootstrap=True` or set "
"`max_samples=None`."
)
elif self.bootstrap:
n_samples_bootstrap = _get_n_samples_bootstrap(
n_samples=X.shape[0], max_samples=self.max_samples
)
else:
n_samples_bootstrap = None
self._validate_estimator()
if not self.bootstrap and self.oob_score:
raise ValueError("Out of bag estimation only available if bootstrap=True")
random_state = check_random_state(self.random_state)
if not self.warm_start or not hasattr(self, "estimators_"):
# Free allocated memory, if any
self.estimators_ = []
n_more_estimators = self.n_estimators - len(self.estimators_)
if n_more_estimators < 0:
raise ValueError(
"n_estimators=%d must be larger or equal to "
"len(estimators_)=%d when warm_start==True"
% (self.n_estimators, len(self.estimators_))
)
elif n_more_estimators == 0:
warn("Warm-start fitting without increasing n_estimators does not fit new trees.")
else:
if self.warm_start and len(self.estimators_) > 0:
# We draw from the random state to get the random state we
# would have got if we hadn't used a warm_start.
random_state.randint(MAX_INT, size=len(self.estimators_))
trees = [
self._make_estimator(append=False, random_state=random_state)
for i in range(n_more_estimators)
]
# Parallel loop: we prefer the threading backend as the Cython code
# for fitting the trees is internally releasing the Python GIL
# making threading more efficient than multiprocessing in
# that case. However, for joblib 0.12+ we respect any
# parallel_backend contexts set at a higher level,
# since correctness does not rely on using threads.
trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads")(
delayed(_parallel_build_trees)(
t,
self.bootstrap,
X,
y,
sample_weight,
i,
len(trees),
verbose=self.verbose,
class_weight=self.class_weight,
n_samples_bootstrap=n_samples_bootstrap,
)
for i, t in enumerate(trees)
)
# Collect newly grown trees
self.estimators_.extend(trees)
if self.oob_score:
if callable(self.oob_score):
self._set_oob_score_and_attributes(X, scoring_function=self.oob_score)
else:
self._set_oob_score_and_attributes(X)
# now compute the similarity/dissimilarity matrix and set it
sim_mat = self.compute_similarity_matrix(X)
# compute the labels and set it
self.labels_ = self._assign_labels(sim_mat)
return self
def predict(self, X):
"""Predict clusters for X.
The predicted cluster label of an input sample is obtained by computing
the forest's affinity matrix for ``X`` (see :meth:`transform`) and
applying the clustering function to it, so labels reflect how often
samples fall into the same leaves across the trees.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Internally, its dtype will be converted to
``dtype=np.float32``. If a sparse matrix is provided, it will be
converted into a sparse ``csr_matrix``.
Returns
-------
y : ndarray of shape (n_samples,)
The predicted cluster labels.
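Examples
--------
A minimal sketch; ``est`` is assumed to be a fitted forest from this
module and ``X_new`` a 2D array with 5 rows and ``est.n_features_in_``
columns:

>>> labels = est.predict(X_new)  # doctest: +SKIP
>>> labels.shape  # doctest: +SKIP
(5,)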
"""
X = self._validate_X_predict(X)
similarity_matrix = self.transform(X)
# compute the labels and set it
return self._assign_labels(similarity_matrix)
def transform(self, X):
"""Transform X into the forest's affinity space.
The returned matrix holds pairwise similarities between the samples in
``X``, as measured by how often pairs co-occur in the same leaves across
the forest. Note that even if X is sparse, the array returned by
`transform` will typically be dense.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
New data to transform.
Returns
-------
X_new : ndarray of shape (n_samples, n_samples)
X transformed in the new space.
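Examples
--------
A minimal sketch; ``est`` is assumed to be a fitted forest from this
module (entries near 1 mean two samples often share a leaf):

>>> S = est.transform(X)  # doctest: +SKIP
>>> S.shape == (X.shape[0], X.shape[0])  # doctest: +SKIP
True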
"""
check_is_fitted(self)
# now compute the affinity matrix and set it
similarity_matrix = self.compute_similarity_matrix(X)
return similarity_matrix
def _assign_labels(self, similarity_matrix):
"""Assign cluster labels given X.
Parameters
----------
similarity_matrix : ndarray of shape (n_samples, n_samples)
The affinity matrix.
Returns
-------
predict_labels : ndarray of shape (n_samples,)
The predicted cluster labels.
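Notes
-----
For example, passing ``clustering_func_args={"n_clusters": 3}`` at
construction time forwards ``n_clusters=3`` to the default
:class:`~sklearn.cluster.AgglomerativeClustering`.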
"""
if self.clustering_func is None:
self.clustering_func_ = AgglomerativeClustering
else:
self.clustering_func_ = self.clustering_func
if self.clustering_func_args is None:
self.clustering_func_args_ = dict()
else:
self.clustering_func_args_ = self.clustering_func_args
cluster = self.clustering_func_(**self.clustering_func_args_)
# apply agglomerative clustering to obtain cluster labels
predict_labels = cluster.fit_predict(similarity_matrix)
return predict_labels
@staticmethod
def _get_oob_predictions(tree, X):
"""Compute the OOB transformations for an individual tree.
Parameters
----------
tree : UnsupervisedDecisionTree object
A single unsupervised decision tree model.
X : ndarray of shape (n_samples, n_features)
The OOB samples.
Returns
-------
tree_prox_matrix : ndarray of shape (n_samples, n_samples)
The OOB associated proximity matrix.
"""
# compute this tree's affinity (proximity) matrix on the OOB samples
tree_prox_matrix = tree.compute_similarity_matrix(X)
return tree_prox_matrix
def _compute_oob_predictions(self, X, y=None):
"""Compute the OOB transformations.
This only uses the OOB samples per tree to compute the unnormalized
proximity matrix. These submatrices are then aggregated into the whole
proximity matrix and normalized based on how many times each sample
showed up in an OOB tree.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data matrix.
y : ndarray of shape (n_samples, n_outputs)
Not used.
Returns
-------
oob_pred : ndarray of shape (n_samples, n_samples)
The OOB proximity matrix.
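Notes
-----
As a sketch of the aggregation below: with :math:`S_t` the proximity
matrix of tree :math:`t` restricted to its OOB samples,

.. math:: P_{ij} = \frac{1}{c_{ij}} \sum_{t \,:\, i, j \in \mathrm{OOB}(t)} S_t[i, j]

where :math:`c_{ij}` counts the trees for which samples ``i`` and ``j``
are jointly out-of-bag (zero counts are clipped to one).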
"""
# Prediction requires X to be in CSR format
if issparse(X):
X = X.tocsr()
n_samples = X.shape[0]
# for clustering, the OOB "prediction" is an (n_samples, n_samples)
# proximity matrix rather than a vector of class probabilities, so we
# accumulate pairwise similarities instead of per-class votes
oob_pred_shape = (n_samples, n_samples)
oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64)
n_oob_pred = np.zeros((n_samples, n_samples), dtype=np.int64)
n_samples_bootstrap = _get_n_samples_bootstrap(
n_samples,
self.max_samples,
)
for estimator in self.estimators_:
unsampled_indices = _generate_unsampled_indices(
estimator.random_state,
n_samples,
n_samples_bootstrap,
)
tree_prox_matrix = self._get_oob_predictions(estimator, X[unsampled_indices, :])
oob_pred[np.ix_(unsampled_indices, unsampled_indices)] += tree_prox_matrix
n_oob_pred[np.ix_(unsampled_indices, unsampled_indices)] += 1
if (n_oob_pred == 0).any():
warn(
"Some inputs do not have OOB scores. This probably means "
"too few trees were used to compute any reliable OOB "
"estimates.",
UserWarning,
)
n_oob_pred[n_oob_pred == 0] = 1
# normalize by the number of times each oob sample proximity matrix was computed
oob_pred /= n_oob_pred
return oob_pred
def _set_oob_score_and_attributes(self, X, y=None, scoring_function=None):
"""Compute and set the OOB score and attributes.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data matrix.
y : ndarray of shape (n_samples, n_outputs), default=None
Not used, present for API consistency by convention.
scoring_function : callable, default=None
Scoring function for OOB score. Default is the
:func:`sklearn.metrics.calinski_harabasz_score`.
Must not require true ``y_labels``.
"""
self.oob_decision_function_ = self._compute_oob_predictions(X)
if scoring_function is None:
scoring_function = calinski_harabasz_score
# assign labels
predict_labels = self._assign_labels(self.oob_decision_function_)
self.oob_labels_ = predict_labels
self.oob_score_ = scoring_function(X, predict_labels)
class UnsupervisedRandomForest(ForestCluster):
"""Unsupervised random forest.
An unsupervised random forest is inherently a clustering algorithm that also
simultaneously computes an adaptive affinity matrix that is based on the 0-1
tree distance (i.e. do samples fall within the same leaf).
Parameters
----------
n_estimators : int, optional
Number of trees to fit, by default 100.
criterion : {"twomeans", "fastbic"}, default="twomeans"
The function to measure the quality of a split. Supported criteria are
"twomeans" for maximizing the variance and "fastbic" for maximizing
the Bayesian Information Criterion (BIC), see
:ref:`tree_mathematical_formulation`.
max_depth : int, default=None
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
min_samples_split : int or float, default=2
The minimum number of samples required to split an internal node:
- If int, then consider `min_samples_split` as the minimum number.
- If float, then `min_samples_split` is a fraction and
`ceil(min_samples_split * n_samples)` are the minimum
number of samples for each split.
For unsupervised trees, :footcite:`Meghana2019_geodesicrf` recommends
setting this to ``sqrt(2 * n_samples)``.
min_samples_leaf : int or float, default=1
The minimum number of samples required to be at a leaf node.
A split point at any depth will only be considered if it leaves at
least ``min_samples_leaf`` training samples in each of the left and
right branches. This may have the effect of smoothing the model,
especially in regression.
- If int, then consider `min_samples_leaf` as the minimum number.
- If float, then `min_samples_leaf` is a fraction and
`ceil(min_samples_leaf * n_samples)` are the minimum
number of samples for each node.
min_weight_fraction_leaf : float, default=0.0
The minimum weighted fraction of the sum total of weights (of all
the input samples) required to be at a leaf node. Samples have
equal weight when sample_weight is not provided.
max_features : {"sqrt", "log2", None}, int or float, default="sqrt"
The number of features to consider when looking for the best split:
- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a fraction and
`max(1, int(max_features * n_features_in_))` features are considered at each
split.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
max_leaf_nodes : int, default=None
Grow trees with ``max_leaf_nodes`` in best-first fashion.
Best nodes are defined as relative reduction in impurity.
If None then unlimited number of leaf nodes.
min_impurity_decrease : float, default=0.0
A node will be split if this split induces a decrease of the impurity
greater than or equal to this value.
The weighted impurity decrease equation is the following::
N_t / N * (impurity - N_t_R / N_t * right_impurity
- N_t_L / N_t * left_impurity)
where ``N`` is the total number of samples, ``N_t`` is the number of
samples at the current node, ``N_t_L`` is the number of samples in the
left child, and ``N_t_R`` is the number of samples in the right child.
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
if ``sample_weight`` is passed.
bootstrap : bool, optional
Whether to bootstrap, by default False.
oob_score : bool or callable, default=False
Whether to use out-of-bag samples to estimate the generalization score.
By default, :func:`~sklearn.metrics.calinski_harabasz_score` is used.
Provide a callable with signature `metric(X, predicted_labels)` to use a
custom metric. Only available if `bootstrap=True`. Other supported functions
from scikit-learn are :func:`sklearn.metrics.silhouette_score`,
:func:`sklearn.metrics.calinski_harabasz_score`, and
:func:`sklearn.metrics.davies_bouldin_score`.
n_jobs : int, optional
Number of CPUs to use in `joblib` parallelization for constructing trees,
by default None.
random_state : int, optional
Random seed, by default None.
verbose : int, optional
Verbosity, by default 0.
warm_start : bool, optional
When set to True, reuse the trees from the previous call to ``fit``
and add more estimators to the ensemble, by default False.
max_samples : int or float, default=None
If bootstrap is True, the number of samples to draw from X
to train each base estimator.
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0.0, 1.0]`.
clustering_func : callable, default=None
Scikit-learn compatible clustering function to take the affinity matrix
and return cluster labels. By default, :class:`sklearn.cluster.AgglomerativeClustering`.
clustering_func_args : dict, default=None
Clustering function class keyword arguments. Passed to `clustering_func`.
Attributes
----------
estimator_ : UnsupervisedDecisionTree
The child estimator template used to create the collection of fitted
sub-estimators.
estimators_ : list of UnsupervisedDecisionTree
The collection of fitted sub-estimators.
n_features_in_ : int
Number of features seen during :term:`fit`.
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
feature_importances_ : ndarray of shape (n_features,)
The impurity-based feature importances.
The higher, the more important the feature.
The importance of a feature is computed as the (normalized)
total reduction of the criterion brought by that feature. It is also
known as the Gini importance.
Warning: impurity-based feature importances can be misleading for
high cardinality features (many unique values). See
:func:`sklearn.inspection.permutation_importance` as an alternative.
labels_ : ndarray of shape (n_samples,)
Labels of each point.
similarity_matrix_ : ndarray of shape (n_samples, n_samples)
Stores the affinity/similarity matrix used in fit. Note this matrix
is computed from within-bag and OOB samples.
dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
Stores the dissimilarity matrix used in fit. Note this matrix
is computed from within-bag and OOB samples.
oob_score_ : float
Score of the training dataset obtained using an out-of-bag estimate.
This attribute exists only when ``oob_score`` is True.
oob_decision_function_ : ndarray of shape (n_samples, n_samples)
Affinity matrix computed with only out-of-bag estimate on the training
set. If n_estimators is small, it is possible that a sample was never
left out during the bootstrap; the corresponding entries are then left
as zeros. This attribute exists
only when ``oob_score`` is True.
References
----------
.. footbibliography::
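Examples
--------
A minimal sketch (assumes the class is exported at the package level;
the cluster labels themselves depend on the data and the seed):

>>> from sktree import UnsupervisedRandomForest
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=100, centers=2, random_state=0)
>>> est = UnsupervisedRandomForest(n_estimators=10, random_state=0)
>>> est.fit_predict(X).shape
(100,)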
"""
def __init__(
self,
n_estimators=100,
*,
criterion="twomeans",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features="sqrt",
max_leaf_nodes=None,
min_impurity_decrease=0.0,
bootstrap=False,
oob_score=False,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=False,
max_samples=None,
clustering_func=None,
clustering_func_args=None,
) -> None:
super().__init__(
estimator=UnsupervisedDecisionTree(), # type: ignore
n_estimators=n_estimators,
estimator_params=(
"criterion",
"max_depth",
"min_samples_split",
"min_samples_leaf",
"min_weight_fraction_leaf",
"max_features",
"max_leaf_nodes",
"min_impurity_decrease",
"random_state",
),
bootstrap=bootstrap,
oob_score=oob_score,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
max_samples=max_samples,
)
self.criterion = criterion
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_weight_fraction_leaf = min_weight_fraction_leaf
self.max_features = max_features
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
self.clustering_func = clustering_func
self.clustering_func_args = clustering_func_args
class UnsupervisedObliqueRandomForest(ForestCluster):
"""Unsupervised oblique random forest.
An unsupervised oblique random forest is inherently a clustering algorithm that also
simultaneously computes an adaptive affinity matrix that is based on the 0-1
tree distance (i.e. do samples fall within the same leaf).
Parameters
----------
n_estimators : int, optional
Number of trees to fit, by default 100.
criterion : {"twomeans", "fastbic"}, default="twomeans"
The function to measure the quality of a split. Supported criteria are
"twomeans" for maximizing the variance and "fastbic" for maximizing
the Bayesian Information Criterion (BIC), see
:ref:`tree_mathematical_formulation`.
max_depth : int, default=None
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
min_samples_split : int or float, default=2
The minimum number of samples required to split an internal node:
- If int, then consider `min_samples_split` as the minimum number.
- If float, then `min_samples_split` is a fraction and
`ceil(min_samples_split * n_samples)` are the minimum
number of samples for each split.
For unsupervised trees, :footcite:`Meghana2019_geodesicrf` recommends
setting this to ``sqrt(2 * n_samples)``.
min_samples_leaf : int or float, default=1
The minimum number of samples required to be at a leaf node.
A split point at any depth will only be considered if it leaves at
least ``min_samples_leaf`` training samples in each of the left and
right branches. This may have the effect of smoothing the model,
especially in regression.
- If int, then consider `min_samples_leaf` as the minimum number.
- If float, then `min_samples_leaf` is a fraction and
`ceil(min_samples_leaf * n_samples)` are the minimum
number of samples for each node.
min_weight_fraction_leaf : float, default=0.0
The minimum weighted fraction of the sum total of weights (of all
the input samples) required to be at a leaf node. Samples have
equal weight when sample_weight is not provided.
max_features : {"sqrt", "log2", None}, int or float, default="sqrt"
The number of features to consider when looking for the best split:
- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a fraction and
`max(1, int(max_features * n_features_in_))` features are considered at each
split.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
max_leaf_nodes : int, default=None
Grow trees with ``max_leaf_nodes`` in best-first fashion.
Best nodes are defined as relative reduction in impurity.
If None then unlimited number of leaf nodes.
min_impurity_decrease : float, default=0.0
A node will be split if this split induces a decrease of the impurity
greater than or equal to this value.
The weighted impurity decrease equation is the following::
N_t / N * (impurity - N_t_R / N_t * right_impurity
- N_t_L / N_t * left_impurity)
where ``N`` is the total number of samples, ``N_t`` is the number of
samples at the current node, ``N_t_L`` is the number of samples in the
left child, and ``N_t_R`` is the number of samples in the right child.
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
if ``sample_weight`` is passed.
bootstrap : bool, optional
Whether to bootstrap, by default False.
oob_score : bool or callable, default=False
Whether to use out-of-bag samples to estimate the generalization score.
By default, :func:`~sklearn.metrics.calinski_harabasz_score` is used.
Provide a callable with signature `metric(X, predicted_labels)` to use a
custom metric. Only available if `bootstrap=True`. Other supported functions
from scikit-learn are :func:`sklearn.metrics.silhouette_score`,
:func:`sklearn.metrics.calinski_harabasz_score`, and
:func:`sklearn.metrics.davies_bouldin_score`.
n_jobs : int, optional
Number of CPUs to use in `joblib` parallelization for constructing trees,
by default None.
random_state : int, optional
Random seed, by default None.
verbose : int, optional
Verbosity, by default 0.
warm_start : bool, optional
When set to True, reuse the trees from the previous call to ``fit``
and add more estimators to the ensemble, by default False.
max_samples : int or float, default=None
If bootstrap is True, the number of samples to draw from X
to train each base estimator.
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0.0, 1.0]`.
feature_combinations : float, default=1.5
The number of features to combine on average at each split
of the decision trees.
clustering_func : callable, default=None
Scikit-learn compatible clustering function to take the affinity matrix
and return cluster labels. By default, :class:`sklearn.cluster.AgglomerativeClustering`.
clustering_func_args : dict, default=None
Clustering function class keyword arguments. Passed to `clustering_func`.
Attributes
----------
estimator_ : UnsupervisedObliqueDecisionTree
The child estimator template used to create the collection of fitted
sub-estimators.
estimators_ : list of UnsupervisedObliqueDecisionTree
The collection of fitted sub-estimators.
n_features_in_ : int
Number of features seen during :term:`fit`.
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
feature_importances_ : ndarray of shape (n_features,)
The impurity-based feature importances.
The higher, the more important the feature.
The importance of a feature is computed as the (normalized)
total reduction of the criterion brought by that feature. It is also
known as the Gini importance.
Warning: impurity-based feature importances can be misleading for
high cardinality features (many unique values). See
:func:`sklearn.inspection.permutation_importance` as an alternative.
labels_ : ndarray of shape (n_samples,)
Labels of each point.
similarity_matrix_ : ndarray of shape (n_samples, n_samples)
Stores the affinity/similarity matrix used in fit. Note this matrix
is computed from within-bag and OOB samples.
dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
Stores the dissimilarity matrix used in fit. Note this matrix
is computed from within-bag and OOB samples.
oob_score_ : float
Score of the training dataset obtained using an out-of-bag estimate.
This attribute exists only when ``oob_score`` is True.
oob_decision_function_ : ndarray of shape (n_samples, n_samples)
Affinity matrix computed with only out-of-bag estimate on the training
set. If n_estimators is small, it is possible that a sample was never
left out during the bootstrap; the corresponding entries are then left
as zeros. This attribute exists
only when ``oob_score`` is True.
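References
----------
.. footbibliography::

Examples
--------
A minimal sketch (assumes the class is exported at the package level;
the cluster labels themselves depend on the data and the seed):

>>> from sktree import UnsupervisedObliqueRandomForest
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=100, centers=2, random_state=0)
>>> est = UnsupervisedObliqueRandomForest(n_estimators=10, random_state=0)
>>> est.fit_predict(X).shape
(100,)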
"""
tree_type = "oblique"
def __init__(
self,
n_estimators=100,
*,
criterion="twomeans",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features="sqrt",
max_leaf_nodes=None,
min_impurity_decrease=0.0,
bootstrap=False,
oob_score=False,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=False,
max_samples=None,
feature_combinations=1.5,
clustering_func=None,
clustering_func_args=None,
) -> None:
super().__init__(
estimator=UnsupervisedObliqueDecisionTree(), # type: ignore
n_estimators=n_estimators,
estimator_params=(
"criterion",
"max_depth",
"min_samples_split",
"min_samples_leaf",
"min_weight_fraction_leaf",
"max_features",
"max_leaf_nodes",
"min_impurity_decrease",
"feature_combinations",
"random_state",
),
bootstrap=bootstrap,
oob_score=oob_score,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
max_samples=max_samples,
)
self.criterion = criterion
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_weight_fraction_leaf = min_weight_fraction_leaf
self.max_features = max_features
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
self.feature_combinations = feature_combinations
self.clustering_func = clustering_func
self.clustering_func_args = clustering_func_args