Source code for sktree.tree._neighbors
import numpy as np
def compute_forest_similarity_matrix(forest, X):
    """Compute the similarity matrix of samples in X using a trained forest.

    The model is applied to ``X`` to obtain, for every tree, the leaf each
    sample falls into. The similarity of two samples is the fraction of trees
    in which they end up in the same leaf.

    Parameters
    ----------
    forest : BaseForest or BaseDecisionTree
        The fitted forest, or a single fitted tree. An object with an
        ``estimator_`` attribute is treated as a forest whose ``apply``
        returns one column of leaf indices per tree; anything else is treated
        as a single tree whose ``apply`` returns one leaf index per sample.
    X : array-like of shape (n_samples, n_features)
        The input data.

    Returns
    -------
    aff_matrix : ndarray of shape (n_samples, n_samples)
        The estimated similarity (affinity) matrix, with entries in [0, 1].
    """
    X_leaves = np.asarray(forest.apply(X))
    if not hasattr(forest, "estimator_"):
        # A single tree yields one leaf index per sample; add a tree axis so
        # both cases share the same (n_samples, n_trees) layout.
        X_leaves = X_leaves[:, np.newaxis]

    # Take the tree count from the leaves actually returned rather than from
    # the ``n_estimators`` hyper-parameter, which can differ from the number
    # of fitted trees (e.g. after ``set_params`` without refitting).
    n_samples, n_est = X_leaves.shape

    # Accumulate co-occurrence counts in place to avoid allocating a new
    # (n_samples, n_samples) array for every tree.
    aff_matrix = np.zeros((n_samples, n_samples), dtype=float)
    for tree_leaves in X_leaves.T:
        aff_matrix += np.equal.outer(tree_leaves, tree_leaves)

    # normalize by the number of trees
    aff_matrix /= n_est
    return aff_matrix
def _compute_distance_matrix(aff_matrix):
    """Turn a similarity (affinity) matrix into distances as ``1 - similarity``."""
    return 1.0 - aff_matrix
# ported from https://github.com/neurodata/hyppo/blob/main/hyppo/independence/_utils.py
class SimMatrixMixin:
    """Mixin that adds pairwise-similarity computation to an estimator.

    The host object itself is handed to ``compute_forest_similarity_matrix``
    as the model, so it is expected to be a tree/forest estimator exposing
    ``apply`` (presumably already fitted -- confirm against callers).
    """

    def compute_similarity_matrix(self, X):
        """Return the pairwise similarity of the samples in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        sim_matrix : array-like of shape (n_samples, n_samples)
            The similarity matrix among the samples.
        """
        sim_matrix = compute_forest_similarity_matrix(self, X)
        return sim_matrix