Source code for sktree.neighbors

import numbers
from copy import copy

import numpy as np
from sklearn.base import BaseEstimator, MetaEstimatorMixin
from sklearn.exceptions import NotFittedError
from sklearn.neighbors import NearestNeighbors
from sklearn.utils.validation import check_is_fitted

from sktree.tree._neighbors import _compute_distance_matrix, compute_forest_similarity_matrix


[docs]class NearestNeighborsMetaEstimator(BaseEstimator, MetaEstimatorMixin):
    """Meta-estimator for nearest neighbors.

    Uses a decision-tree, or forest model to compute distances between samples
    and then uses the sklearn's nearest-neighbors API to compute neighbors.

    Parameters
    ----------
    estimator : BaseDecisionTree, BaseForest
        The estimator to use for computing distances.
    n_neighbors : int, optional
        Number of neighbors to use by default for kneighbors queries, by default 5.
    radius : float, optional
        Range of parameter space to use by default for radius_neighbors queries, by default 1.0.
    algorithm : str, optional
        Algorithm used to compute the nearest-neighbors, by default 'auto'.
        See :class:`sklearn.neighbors.NearestNeighbors` for details.
    n_jobs : int, optional
        The number of parallel jobs to run for neighbors, by default None.
    """

    def __init__(self, estimator, n_neighbors=5, radius=1.0, algorithm="auto", n_jobs=None):
        self.estimator = estimator
        self.n_neighbors = n_neighbors
        self.algorithm = algorithm
        self.radius = radius
        self.n_jobs = n_jobs

[docs]    def fit(self, X, y=None):
        """Fit the nearest neighbors estimator from the training dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values, by default None.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X, y = self._validate_data(X, y, accept_sparse="csc")

        self.estimator_ = copy(self.estimator)
        try:
            check_is_fitted(self.estimator_)
        except NotFittedError:
            self.estimator_.fit(X, y)

        self._fit(X, self.n_neighbors)
        return self

    def _fit(self, X, n_neighbors):
        self.neigh_est_ = NearestNeighbors(
            n_neighbors=n_neighbors,
            algorithm=self.algorithm,
            metric="precomputed",
            n_jobs=self.n_jobs,
        )

        # compute the distance matrix
        aff_matrix = compute_forest_similarity_matrix(self.estimator_, X)
        dists = _compute_distance_matrix(aff_matrix)

        # fit the nearest-neighbors estimator
        self.neigh_est_.fit(dists)

[docs]    def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
        """Find the K-neighbors of a point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_queries, n_features), \
            or (n_queries, n_indexed) if metric == 'precomputed', default=None
            Not used, present for API consistency by convention.

        n_neighbors : int, default=None
            Number of neighbors required for each sample. The default is the
            value passed to the constructor.

        return_distance : bool, default=True
            Whether or not to return the distances.

        Returns
        -------
        neigh_dist : ndarray of shape (n_queries, n_neighbors)
            Array representing the lengths to points, only present if
            return_distance=True.

        neigh_ind : ndarray of shape (n_queries, n_neighbors)
            Indices of the nearest points in the population matrix.
        """
        check_is_fitted(self)

        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        elif n_neighbors <= 0:
            raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors)
        elif not isinstance(n_neighbors, numbers.Integral):
            raise TypeError(
                "n_neighbors does not take %s value, enter integer value" % type(n_neighbors)
            )

        if X is not None:
            self._fit(X, n_neighbors)

        return self.neigh_est_.kneighbors(n_neighbors=n_neighbors, return_distance=return_distance)

[docs]    def radius_neighbors(self, X=None, radius=None, return_distance=True, sort_results=False):
        """Find the neighbors within a given radius of a point or points.

        Return the indices and distances of each point from the dataset
        lying in a ball with size ``radius`` around the points of the query
        array. Points lying on the boundary are included in the results.

        The result points are *not* necessarily sorted by distance to their
        query point.

        Parameters
        ----------
        X : {array-like, sparse matrix} of (n_samples, n_features), default=None
            The query point or points.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.

        radius : float, or array-like of shape (n_samples,) default=None
            Limiting distance of neighbors to return. The default is the value
            passed to the constructor. If an array-like of shape (n_samples),
            then will query for each sample point with a different radius.

        return_distance : bool, default=True
            Whether or not to return the distances.

        sort_results : bool, default=False
            If True, the distances and indices will be sorted by increasing
            distances before being returned. If False, the results may not
            be sorted. If `return_distance=False`, setting `sort_results=True`
            will result in an error.

            .. versionadded:: 0.22

        Returns
        -------
        neigh_dist : ndarray of shape (n_samples,) of arrays
            Array representing the distances to each point, only present if
            `return_distance=True`. The distance values are computed according
            to the ``metric`` constructor parameter.

        neigh_ind : ndarray of shape (n_samples,) of arrays
            An array of arrays of indices of the approximate nearest points
            from the population matrix that lie within a ball of size
            ``radius`` around the query points.

        Notes
        -----
        Because the number of neighbors of each point is not necessarily
        equal, the results for multiple query points cannot be fit in a
        standard data array.
        For efficiency, `radius_neighbors` returns arrays of objects, where
        each object is a 1D array of indices or distances.
        """
        check_is_fitted(self)

        if X is not None:
            n_samples = X.shape[0]
        else:
            n_samples = self.neigh_est_.n_samples_fit_

        if isinstance(radius, numbers.Number):
            radius = [radius] * n_samples

        # now construct nearest neighbor indices and distances within radius
        nn_ind_data = np.zeros((n_samples,), dtype=object)
        nn_dist_data = np.zeros((n_samples,), dtype=object)
        for idx in range(n_samples):
            nn = self.neigh_est_.radius_neighbors(
                X=X, radius=radius[idx], return_distance=return_distance, sort_results=sort_results
            )

            if return_distance:
                nn_ind_data[idx] = nn[0][idx]
                nn_dist_data[idx] = nn[1][idx]
            else:
                nn_ind_data[idx] = nn

        if return_distance:
            return nn_dist_data, nn_ind_data
        return nn_ind_data