Source code for sktree.neighbors

import numbers
from copy import copy

import numpy as np
from sklearn.base import BaseEstimator, MetaEstimatorMixin
from sklearn.exceptions import NotFittedError
from sklearn.neighbors import NearestNeighbors
from sklearn.utils.validation import check_is_fitted

from sktree.tree._neighbors import _compute_distance_matrix, compute_forest_similarity_matrix


[docs]class NearestNeighborsMetaEstimator(BaseEstimator, MetaEstimatorMixin): """Meta-estimator for nearest neighbors. Uses a decision-tree, or forest model to compute distances between samples and then uses the sklearn's nearest-neighbors API to compute neighbors. Parameters ---------- estimator : BaseDecisionTree, BaseForest The estimator to use for computing distances. n_neighbors : int, optional Number of neighbors to use by default for kneighbors queries, by default 5. radius : float, optional Range of parameter space to use by default for radius_neighbors queries, by default 1.0. algorithm : str, optional Algorithm used to compute the nearest-neighbors, by default 'auto'. See :class:`sklearn.neighbors.NearestNeighbors` for details. n_jobs : int, optional The number of parallel jobs to run for neighbors, by default None. """ def __init__(self, estimator, n_neighbors=5, radius=1.0, algorithm="auto", n_jobs=None): self.estimator = estimator self.n_neighbors = n_neighbors self.algorithm = algorithm self.radius = radius self.n_jobs = n_jobs
[docs] def fit(self, X, y=None): """Fit the nearest neighbors estimator from the training dataset. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. y : array-like of shape (n_samples,) or (n_samples, n_outputs) The target values, by default None. Returns ------- self : object Fitted estimator. """ X, y = self._validate_data(X, y, accept_sparse="csc") self.estimator_ = copy(self.estimator) try: check_is_fitted(self.estimator_) except NotFittedError: self.estimator_.fit(X, y) self._fit(X, self.n_neighbors) return self
def _fit(self, X, n_neighbors): self.neigh_est_ = NearestNeighbors( n_neighbors=n_neighbors, algorithm=self.algorithm, metric="precomputed", n_jobs=self.n_jobs, ) # compute the distance matrix aff_matrix = compute_forest_similarity_matrix(self.estimator_, X) dists = _compute_distance_matrix(aff_matrix) # fit the nearest-neighbors estimator self.neigh_est_.fit(dists)
[docs] def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Find the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. Parameters ---------- X : {array-like, sparse matrix}, shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed', default=None Not used, present for API consistency by convention. n_neighbors : int, default=None Number of neighbors required for each sample. The default is the value passed to the constructor. return_distance : bool, default=True Whether or not to return the distances. Returns ------- neigh_dist : ndarray of shape (n_queries, n_neighbors) Array representing the lengths to points, only present if return_distance=True. neigh_ind : ndarray of shape (n_queries, n_neighbors) Indices of the nearest points in the population matrix. """ check_is_fitted(self) if n_neighbors is None: n_neighbors = self.n_neighbors elif n_neighbors <= 0: raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) elif not isinstance(n_neighbors, numbers.Integral): raise TypeError( "n_neighbors does not take %s value, enter integer value" % type(n_neighbors) ) if X is not None: self._fit(X, n_neighbors) return self.neigh_est_.kneighbors(n_neighbors=n_neighbors, return_distance=return_distance)
[docs] def radius_neighbors(self, X=None, radius=None, return_distance=True, sort_results=False): """Find the neighbors within a given radius of a point or points. Return the indices and distances of each point from the dataset lying in a ball with size ``radius`` around the points of the query array. Points lying on the boundary are included in the results. The result points are *not* necessarily sorted by distance to their query point. Parameters ---------- X : {array-like, sparse matrix} of (n_samples, n_features), default=None The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. radius : float, or array-like of shape (n_samples,) default=None Limiting distance of neighbors to return. The default is the value passed to the constructor. If an array-like of shape (n_samples), then will query for each sample point with a different radius. return_distance : bool, default=True Whether or not to return the distances. sort_results : bool, default=False If True, the distances and indices will be sorted by increasing distances before being returned. If False, the results may not be sorted. If `return_distance=False`, setting `sort_results=True` will result in an error. .. versionadded:: 0.22 Returns ------- neigh_dist : ndarray of shape (n_samples,) of arrays Array representing the distances to each point, only present if `return_distance=True`. The distance values are computed according to the ``metric`` constructor parameter. neigh_ind : ndarray of shape (n_samples,) of arrays An array of arrays of indices of the approximate nearest points from the population matrix that lie within a ball of size ``radius`` around the query points. Notes ----- Because the number of neighbors of each point is not necessarily equal, the results for multiple query points cannot be fit in a standard data array. For efficiency, `radius_neighbors` returns arrays of objects, where each object is a 1D array of indices or distances. """ check_is_fitted(self) if X is not None: n_samples = X.shape[0] else: n_samples = self.neigh_est_.n_samples_fit_ if isinstance(radius, numbers.Number): radius = [radius] * n_samples # now construct nearest neighbor indices and distances within radius nn_ind_data = np.zeros((n_samples,), dtype=object) nn_dist_data = np.zeros((n_samples,), dtype=object) for idx in range(n_samples): nn = self.neigh_est_.radius_neighbors( X=X, radius=radius[idx], return_distance=return_distance, sort_results=sort_results ) if return_distance: nn_ind_data[idx] = nn[0][idx] nn_dist_data[idx] = nn[1][idx] else: nn_ind_data[idx] = nn if return_distance: return nn_dist_data, nn_ind_data return nn_ind_data