Source code for sktree.ensemble._unsupervised_forest
"""Manifold forest of trees-based ensemble methods.Those methods include various random forest methods that operate on manifolds."""# Authors: Adam Li <adam2392@gmail.com>## License: BSD 3 clausefromwarningsimportwarnimportnumpyasnpfromscipy.sparseimportissparsefromsklearn.baseimportClusterMixin,TransformerMixinfromsklearn.clusterimportAgglomerativeClusteringfromsklearn.ensemble._forestimport(MAX_INT,_generate_unsampled_indices,_get_n_samples_bootstrap,_parallel_build_trees,)fromsklearn.metricsimportcalinski_harabasz_scorefromsklearn.utils.parallelimportParallel,delayedfromsklearn.utils.validationimport_check_sample_weight,check_is_fitted,check_random_statefrom.._lib.sklearn.ensemble._forestimportBaseForestfrom.._lib.sklearn.tree._treeimportDTYPEfrom..treeimportUnsupervisedDecisionTree,UnsupervisedObliqueDecisionTreefrom..tree._neighborsimportSimMatrixMixinclassForestCluster(SimMatrixMixin,TransformerMixin,ClusterMixin,BaseForest):"""Unsupervised forest base class."""def__init__(self,estimator,n_estimators=100,*,estimator_params=tuple(),bootstrap=False,oob_score=False,n_jobs=None,random_state=None,verbose=0,warm_start=False,max_samples=None,)->None:super().__init__(estimator=estimator,n_estimators=n_estimators,estimator_params=estimator_params,bootstrap=bootstrap,oob_score=oob_score,n_jobs=n_jobs,random_state=random_state,verbose=verbose,warm_start=warm_start,max_samples=max_samples,)deffit(self,X,y=None,sample_weight=None):""" Fit estimator. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Use ``dtype=np.float32`` for maximum efficiency. Sparse matrices are also supported, use sparse ``csc_matrix`` for maximum efficiency. y : Ignored Not used, present for API consistency by convention. sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. Returns ------- self : object Returns the instance itself. """self._validate_params()# Validate or convert input dataX=self._validate_data(X,dtype=DTYPE,# accept_sparse="csc",)ifsample_weightisnotNone:sample_weight=_check_sample_weight(sample_weight,X)ifissparse(X):# Pre-sort indices to avoid that each individual tree of the# ensemble sorts the indices.X.sort_indices()ifnotself.bootstrapandself.max_samplesisnotNone:raiseValueError("`max_sample` cannot be set if `bootstrap=False`. 
""Either switch to `bootstrap=True` or set ""`max_sample=None`.")elifself.bootstrap:n_samples_bootstrap=_get_n_samples_bootstrap(n_samples=X.shape[0],max_samples=self.max_samples)else:n_samples_bootstrap=Noneself._validate_estimator()ifnotself.bootstrapandself.oob_score:raiseValueError("Out of bag estimation only available if bootstrap=True")random_state=check_random_state(self.random_state)ifnotself.warm_startornothasattr(self,"estimators_"):# Free allocated memory, if anyself.estimators_=[]n_more_estimators=self.n_estimators-len(self.estimators_)ifn_more_estimators<0:raiseValueError("n_estimators=%d must be larger or equal to ""len(estimators_)=%d when warm_start==True"%(self.n_estimators,len(self.estimators_)))elifn_more_estimators==0:warn("Warm-start fitting without increasing n_estimators does not ""fit new trees.")else:ifself.warm_startandlen(self.estimators_)>0:# We draw from the random state to get the random state we# would have got if we hadn't used a warm_start.random_state.randint(MAX_INT,size=len(self.estimators_))trees=[self._make_estimator(append=False,random_state=random_state)foriinrange(n_more_estimators)]# Parallel loop: we prefer the threading backend as the Cython code# for fitting the trees is internally releasing the Python GIL# making threading more efficient than multiprocessing in# that case. However, for joblib 0.12+ we respect any# parallel_backend contexts set at a higher level,# since correctness does not rely on using threads.trees=Parallel(n_jobs=self.n_jobs,verbose=self.verbose,prefer="threads",)(delayed(_parallel_build_trees)(t,self.bootstrap,X,y,sample_weight,i,len(trees),verbose=self.verbose,class_weight=self.class_weight,n_samples_bootstrap=n_samples_bootstrap,)fori,tinenumerate(trees))# Collect newly grown treesself.estimators_.extend(trees)ifself.oob_score:ifcallable(self.oob_score):self._set_oob_score_and_attributes(X,scoring_function=self.oob_score)else:self._set_oob_score_and_attributes(X)# now compute the similarity/dissimilarity matrix and set itsim_mat=self.compute_similarity_matrix(X)# compute the labels and set itself.labels_=self._assign_labels(sim_mat)returnselfdefpredict(self,X):"""Predict clusters for X. The predicted class of an input sample is a vote by the trees in the forest, weighted by their probability estimates. That is, the predicted class is the one with highest mean probability estimate across the trees. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """X=self._validate_X_predict(X)similarity_matrix=self.transform(X)# compute the labels and set itreturnself._assign_labels(similarity_matrix)deftransform(self,X):"""Transform X to a cluster-distance space. In the new space, each dimension is the distance to the cluster centers. Note that even if X is sparse, the array returned by `transform` will typically be dense. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) New data to transform. Returns ------- X_new : ndarray of shape (n_samples, n_samples) X transformed in the new space. """check_is_fitted(self)# now compute the affinity matrix and set itsimilarity_matrix=self.compute_similarity_matrix(X)returnsimilarity_matrixdef_assign_labels(self,similarity_matrix):"""Assign cluster labels given X. 

    @staticmethod
    def _get_oob_predictions(tree, X):
        """Compute the OOB transformations for an individual tree.

        Parameters
        ----------
        tree : UnsupervisedDecisionTree object
            A single unsupervised decision tree model.
        X : ndarray of shape (n_samples, n_features)
            The OOB samples.

        Returns
        -------
        tree_prox_matrix : ndarray of shape (n_samples, n_samples)
            The OOB associated proximity matrix.
        """
        # transform X into the tree's affinity matrix
        tree_prox_matrix = tree.compute_similarity_matrix_forest(X)
        return tree_prox_matrix

    def _compute_oob_predictions(self, X, y=None):
        """Compute the OOB transformations.

        This only uses the OOB samples per tree to compute the unnormalized
        proximity matrix. These submatrices are then aggregated into the whole
        proximity matrix and normalized based on how many times each sample
        showed up in an OOB tree.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.
        y : ndarray of shape (n_samples, n_outputs)
            Not used.

        Returns
        -------
        oob_pred : ndarray of shape (n_samples, n_samples)
            The OOB proximity matrix.
        """
        # Prediction requires X to be in CSR format
        if issparse(X):
            X = X.tocsr()

        n_samples = X.shape[0]

        # for clustering, n_classes_ does not exist and we create an empty
        # axis to be consistent with the classification case and make
        # the array operations compatible with the 2 settings
        oob_pred_shape = (n_samples, n_samples)

        oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64)
        n_oob_pred = np.zeros((n_samples, n_samples), dtype=np.int64)

        n_samples_bootstrap = _get_n_samples_bootstrap(
            n_samples,
            self.max_samples,
        )
        for estimator in self.estimators_:
            unsampled_indices = _generate_unsampled_indices(
                estimator.random_state,
                n_samples,
                n_samples_bootstrap,
            )
            tree_prox_matrix = self._get_oob_predictions(estimator, X[unsampled_indices, :])
            oob_pred[np.ix_(unsampled_indices, unsampled_indices)] += tree_prox_matrix
            n_oob_pred[np.ix_(unsampled_indices, unsampled_indices)] += 1

        if (n_oob_pred == 0).any():
            warn(
                "Some inputs do not have OOB scores. This probably means "
                "too few trees were used to compute any reliable OOB "
                "estimates.",
                UserWarning,
            )
            n_oob_pred[n_oob_pred == 0] = 1

        # normalize by the number of times each OOB sample proximity matrix was computed
        oob_pred /= n_oob_pred
        return oob_pred
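
    # ``np.ix_`` above scatters each tree's OOB proximity sub-matrix back into
    # the full (n_samples, n_samples) accumulator. A toy illustration of the
    # indexing semantics:
    #
    #     >>> M = np.zeros((4, 4))
    #     >>> idx = np.array([0, 2])
    #     >>> M[np.ix_(idx, idx)] += np.ones((2, 2))
    #     >>> M[0, 2]  # rows and columns 0 and 2 were updated
    #     1.0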
"""self.oob_decision_function_=self._compute_oob_predictions(X)ifscoring_functionisNone:scoring_function=calinski_harabasz_score# assign labelspredict_labels=self._assign_labels(self.oob_decision_function_)self.oob_labels_=predict_labelsself.oob_score_=scoring_function(X,predict_labels)


class UnsupervisedRandomForest(ForestCluster):
    """Unsupervised random forest.

    An unsupervised random forest is inherently a clustering algorithm that
    also simultaneously computes an adaptive affinity matrix that is based on
    the 0-1 tree distance (i.e., whether samples fall within the same leaf).

    Parameters
    ----------
    n_estimators : int, optional
        Number of trees to fit, by default 100.

    criterion : {"twomeans", "fastbic"}, default="twomeans"
        The function to measure the quality of a split. Supported criteria are
        "twomeans" for maximizing the variance and "fastbic" for maximizing
        the Bayesian Information Criterion (BIC), see
        :ref:`tree_mathematical_formulation`.

    max_depth : int, default=None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int, float, or {"sqrt"}, default="sqrt"
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

        In unsupervised trees, it is recommended by
        :footcite:`Meghana2019_geodesicrf` to use the sqrt of two times the
        number of samples in the dataset.

    min_samples_leaf : int or float, default=1
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

    min_weight_fraction_leaf : float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : {"sqrt", "log2", None}, int or float, default="sqrt"
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `max(1, int(max_features * n_features_in_))` features are considered
          at each split.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    max_leaf_nodes : int, default=None
        Grow trees with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, default=0.0
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.

        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.
        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

    bootstrap : bool, optional
        Whether to bootstrap, by default False.

    oob_score : bool or callable, default=False
        Whether to use out-of-bag samples to estimate the generalization
        score. By default, :func:`~sklearn.metrics.calinski_harabasz_score`
        is used. Provide a callable with signature
        `metric(X, predicted_labels)` to use a custom metric. Only available
        if `bootstrap=True`. Other supported functions from scikit-learn are
        :func:`sklearn.metrics.silhouette_score`,
        :func:`sklearn.metrics.calinski_harabasz_score`, and
        :func:`sklearn.metrics.davies_bouldin_score`.

    n_jobs : int, optional
        Number of CPUs to use in `joblib` parallelization for constructing
        trees, by default None.

    random_state : int, optional
        Random seed, by default None.

    verbose : int, optional
        Verbosity, by default 0.

    warm_start : bool, optional
        When True, reuse the trees from the previous call to ``fit`` and add
        more estimators to the ensemble; otherwise fit a whole new forest.
        By default False.

    max_samples : int or float, default=None
        If bootstrap is True, the number of samples to draw from X
        to train each base estimator.

        - If None (default), then draw `X.shape[0]` samples.
        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples. Thus,
          `max_samples` should be in the interval `(0.0, 1.0]`.

    clustering_func : callable
        Scikit-learn compatible clustering function to take the affinity
        matrix and return cluster labels. By default,
        :class:`sklearn.cluster.AgglomerativeClustering`.

    clustering_func_args : dict
        Clustering function class keyword arguments. Passed to
        `clustering_func`.

    Attributes
    ----------
    estimator_ : UnsupervisedDecisionTree
        The child estimator template used to create the collection of
        fitted sub-estimators.

    estimators_ : list of UnsupervisedDecisionTree
        The collection of fitted sub-estimators.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    feature_importances_ : ndarray of shape (n_features,)
        The impurity-based feature importances.
        The higher, the more important the feature.
        The importance of a feature is computed as the (normalized)
        total reduction of the criterion brought by that feature. It is also
        known as the Gini importance.

        Warning: impurity-based feature importances can be misleading for
        high cardinality features (many unique values). See
        :func:`sklearn.inspection.permutation_importance` as an alternative.

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    similarity_matrix_ : ndarray of shape (n_samples, n_samples)
        Stores the affinity/similarity matrix used in fit. Note this matrix
        is computed from within-bag and OOB samples.

    dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
        Stores the dissimilarity matrix used in fit. Note this matrix is
        computed from within-bag and OOB samples.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.
        This attribute exists only when ``oob_score`` is True.

    oob_decision_function_ : ndarray of shape (n_samples, n_samples)
        Affinity matrix computed with only out-of-bag estimate on the training
        set. If n_estimators is small it might be possible that a data point
        was never left out during the bootstrap. In this case,
        `oob_decision_function_` might contain NaN. This attribute exists
        only when ``oob_score`` is True.

    References
    ----------
    .. footbibliography::
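
    Examples
    --------
    A minimal usage sketch; the exact labels depend on the clustering function
    (by default :class:`sklearn.cluster.AgglomerativeClustering`):

    >>> import numpy as np
    >>> from sktree.ensemble._unsupervised_forest import UnsupervisedRandomForest
    >>> X = np.random.RandomState(0).normal(size=(20, 4))
    >>> est = UnsupervisedRandomForest(n_estimators=10, random_state=0)
    >>> est.fit_predict(X).shape
    (20,)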
    """

    def __init__(
        self,
        n_estimators=100,
        *,
        criterion="twomeans",
        max_depth=None,
        min_samples_split="sqrt",
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features="sqrt",
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        bootstrap=False,
        oob_score=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
        warm_start=False,
        max_samples=None,
        clustering_func=None,
        clustering_func_args=None,
    ) -> None:
        super().__init__(
            estimator=UnsupervisedDecisionTree(),  # type: ignore
            n_estimators=n_estimators,
            estimator_params=(
                "criterion",
                "max_depth",
                "min_samples_split",
                "min_samples_leaf",
                "min_weight_fraction_leaf",
                "max_features",
                "max_leaf_nodes",
                "min_impurity_decrease",
                "random_state",
            ),
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            max_samples=max_samples,
        )
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.clustering_func = clustering_func
        self.clustering_func_args = clustering_func_args
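

# The fitted forest also exposes ``similarity_matrix_`` and
# ``dissimilarity_matrix_`` (documented above); a hedged sketch feeding the
# dissimilarity into a precomputed-distance clusterer:
#
#     from sklearn.cluster import AgglomerativeClustering
#
#     forest = UnsupervisedRandomForest(n_estimators=100).fit(X)
#     agg = AgglomerativeClustering(metric="precomputed", linkage="average")
#     labels = agg.fit_predict(forest.dissimilarity_matrix_)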


class UnsupervisedObliqueRandomForest(ForestCluster):
    """Unsupervised oblique random forest.

    An unsupervised oblique random forest is inherently a clustering algorithm
    that also simultaneously computes an adaptive affinity matrix that is
    based on the 0-1 tree distance (i.e., whether samples fall within the same
    leaf).

    Parameters
    ----------
    n_estimators : int, optional
        Number of trees to fit, by default 100.

    criterion : {"twomeans", "fastbic"}, default="twomeans"
        The function to measure the quality of a split. Supported criteria are
        "twomeans" for maximizing the variance and "fastbic" for maximizing
        the Bayesian Information Criterion (BIC), see
        :ref:`tree_mathematical_formulation`.

    max_depth : int, default=None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int, float, or {"sqrt"}, default="sqrt"
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

        In unsupervised trees, it is recommended by
        :footcite:`Meghana2019_geodesicrf` to use the sqrt of two times the
        number of samples in the dataset.

    min_samples_leaf : int or float, default=1
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

    min_weight_fraction_leaf : float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : {"sqrt", "log2", None}, int or float, default="sqrt"
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `max(1, int(max_features * n_features_in_))` features are considered
          at each split.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    max_leaf_nodes : int, default=None
        Grow trees with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, default=0.0
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.

        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.
        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

    bootstrap : bool, optional
        Whether to bootstrap, by default False.

    oob_score : bool or callable, default=False
        Whether to use out-of-bag samples to estimate the generalization
        score. By default, :func:`~sklearn.metrics.calinski_harabasz_score`
        is used. Provide a callable with signature
        `metric(X, predicted_labels)` to use a custom metric. Only available
        if `bootstrap=True`. Other supported functions from scikit-learn are
        :func:`sklearn.metrics.silhouette_score`,
        :func:`sklearn.metrics.calinski_harabasz_score`, and
        :func:`sklearn.metrics.davies_bouldin_score`.

    n_jobs : int, optional
        Number of CPUs to use in `joblib` parallelization for constructing
        trees, by default None.

    random_state : int, optional
        Random seed, by default None.

    verbose : int, optional
        Verbosity, by default 0.

    warm_start : bool, optional
        When True, reuse the trees from the previous call to ``fit`` and add
        more estimators to the ensemble; otherwise fit a whole new forest.
        By default False.

    max_samples : int or float, default=None
        If bootstrap is True, the number of samples to draw from X
        to train each base estimator.

        - If None (default), then draw `X.shape[0]` samples.
        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples. Thus,
          `max_samples` should be in the interval `(0.0, 1.0]`.

    feature_combinations : float, default=1.5
        The number of features to combine on average at each split
        of the decision trees.

    clustering_func : callable
        Scikit-learn compatible clustering function to take the affinity
        matrix and return cluster labels. By default,
        :class:`sklearn.cluster.AgglomerativeClustering`.

    clustering_func_args : dict
        Clustering function class keyword arguments. Passed to
        `clustering_func`.

    Attributes
    ----------
    estimator_ : UnsupervisedObliqueDecisionTree
        The child estimator template used to create the collection of
        fitted sub-estimators.

    estimators_ : list of UnsupervisedObliqueDecisionTree
        The collection of fitted sub-estimators.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    feature_importances_ : ndarray of shape (n_features,)
        The impurity-based feature importances.
        The higher, the more important the feature.
        The importance of a feature is computed as the (normalized)
        total reduction of the criterion brought by that feature. It is also
        known as the Gini importance.

        Warning: impurity-based feature importances can be misleading for
        high cardinality features (many unique values). See
        :func:`sklearn.inspection.permutation_importance` as an alternative.

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    similarity_matrix_ : ndarray of shape (n_samples, n_samples)
        Stores the affinity/similarity matrix used in fit. Note this matrix
        is computed from within-bag and OOB samples.

    dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
        Stores the dissimilarity matrix used in fit. Note this matrix is
        computed from within-bag and OOB samples.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.
        This attribute exists only when ``oob_score`` is True.

    oob_decision_function_ : ndarray of shape (n_samples, n_samples)
        Affinity matrix computed with only out-of-bag estimate on the training
        set. If n_estimators is small it might be possible that a data point
        was never left out during the bootstrap. In this case,
        `oob_decision_function_` might contain NaN. This attribute exists
        only when ``oob_score`` is True.

    References
    ----------
    .. footbibliography::
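
    Examples
    --------
    A minimal usage sketch; ``feature_combinations`` controls how many
    features are linearly combined, on average, in each candidate oblique
    split:

    >>> import numpy as np
    >>> from sktree.ensemble._unsupervised_forest import UnsupervisedObliqueRandomForest
    >>> X = np.random.RandomState(0).normal(size=(20, 4))
    >>> est = UnsupervisedObliqueRandomForest(
    ...     n_estimators=10, feature_combinations=2.0, random_state=0
    ... )
    >>> est.fit_predict(X).shape
    (20,)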
    """

    tree_type = "oblique"

    def __init__(
        self,
        n_estimators=100,
        *,
        criterion="twomeans",
        max_depth=None,
        min_samples_split="sqrt",
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features="sqrt",
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        bootstrap=False,
        oob_score=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
        warm_start=False,
        max_samples=None,
        feature_combinations=1.5,
        clustering_func=None,
        clustering_func_args=None,
    ) -> None:
        super().__init__(
            estimator=UnsupervisedObliqueDecisionTree(),  # type: ignore
            n_estimators=n_estimators,
            estimator_params=(
                "criterion",
                "max_depth",
                "min_samples_split",
                "min_samples_leaf",
                "min_weight_fraction_leaf",
                "max_features",
                "max_leaf_nodes",
                "min_impurity_decrease",
                "feature_combinations",
                "random_state",
            ),
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            max_samples=max_samples,
        )
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.feature_combinations = feature_combinations
        self.clustering_func = clustering_func
        self.clustering_func_args = clustering_func_args