Source code for sktree.stats.forestht

import threading
from collections import namedtuple
from typing import Callable

import numpy as np
from joblib import Parallel, delayed
from numpy.typing import ArrayLike
from sklearn.base import clone
from sklearn.ensemble._base import _partition_estimators
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.utils.multiclass import type_of_target

from .._lib.sklearn.ensemble._forest import ForestClassifier
from ..tree._classes import DTYPE
from .permuteforest import PermutationHonestForestClassifier
from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS, _compute_null_distribution_coleman


def _parallel_predict_proba(predict_proba, X, indices_test):
    """
    This is a utility function for joblib's Parallel.

    It can't go locally in ForestClassifier or ForestRegressor, because joblib
    complains that it cannot pickle it when placed there.
    """
    # each tree predicts proba with a list of output (n_samples, n_classes[i])
    prediction = predict_proba(X[indices_test, :], check_input=False)
    return prediction


def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock):
    """
    This is a utility function for joblib's Parallel.
    It can't go locally in ForestClassifier or ForestRegressor, because joblib
    complains that it cannot pickle it when placed there.
    """
    # each tree predicts proba with a list of output (n_samples, n_classes[i])
    prediction = predict_proba(X, check_input=False)

    indices = np.zeros(X.shape[0], dtype=bool)
    indices[test_idx] = True
    with lock:
        out[idx, test_idx, :] = prediction[test_idx, :]
    return prediction
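

# A minimal illustrative sketch (not part of the module API) of the
# accumulation pattern used above: each tree writes only into its own row of
# a NaN-filled ``(n_trees, n_samples, n_classes)`` array, so a later
# ``np.nanmean(..., axis=0)`` averages each sample's posterior over only the
# trees for which that sample was out-of-bag::
#
#     out = np.full((2, 4, 2), np.nan)
#     out[0, [0, 2], :] = 0.5  # tree 0 is evaluated on OOB samples {0, 2}
#     out[1, [1, 2], :] = 1.0  # tree 1 is evaluated on OOB samples {1, 2}
#     np.nanmean(out, axis=0)  # sample 2 averages both trees; sample 3 is NaN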


ForestTestResult = namedtuple(
    "ForestTestResult",
    ["observe_test_stat", "permuted_stat", "observe_stat", "pvalue", "null_dist"],
)
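
# Example (illustrative only): results are read by field name, e.g.::
#
#     res = ForestTestResult(0.1, 0.4, 0.5, 0.04, np.zeros(10_000))
#     res.pvalue             # -> 0.04
#     res.observe_test_stat  # -> 0.1 (observe_stat - permuted_stat)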


def build_coleman_forest(
    est,
    perm_est,
    X,
    y,
    covariate_index=None,
    metric="s@98",
    n_repeats=10_000,
    verbose=False,
    seed=None,
    return_posteriors=True,
    **metric_kwargs,
):
    """Build a hypothesis testing forest using a two-forest approach.

    The two-forest approach stems from the Coleman et al. 2022 paper, where two
    forests are trained: one on the original dataset, and one on the permuted
    dataset. The dataset is either permuted once, or independently for each
    tree in the permuted forest. The original test statistic is computed by
    comparing the metric on both forests ``(metric_forest -
    metric_perm_forest)``. For full details, see :footcite:`coleman2022scalable`.

    Parameters
    ----------
    est : Forest
        The type of forest to use. Must be constructed with ``bootstrap=True``.
    perm_est : Forest
        The forest to use for the permuted dataset.
    X : ArrayLike of shape (n_samples, n_features)
        Data.
    y : ArrayLike of shape (n_samples, n_outputs)
        Binary target, so ``n_outputs`` should be at most 1.
    covariate_index : ArrayLike of shape (n_covariates,), optional
        The index array of covariates to shuffle, by default None, which
        defaults to all covariates.
    metric : str, optional
        The metric to compute, by default "s@98", for sensitivity at
        98% specificity.
    n_repeats : int, optional
        Number of times to bootstrap sample the two forests to construct the
        null distribution, by default 10000. The construction of the null
        forests will be parallelized according to the ``n_jobs`` argument of
        the ``est`` forest.
    verbose : bool, optional
        Verbosity, by default False.
    seed : int, optional
        Random seed, by default None.
    return_posteriors : bool, optional
        Whether or not to return the posteriors, by default True.
    **metric_kwargs : dict, optional
        Additional keyword arguments to pass to the metric function.

    Returns
    -------
    observe_test_stat : float
        The observed test statistic, computed by taking ``observe_stat`` and
        subtracting ``permute_stat``.
    pvalue : float
        The p-value of the test statistic.
    orig_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
        The predicted posterior probabilities for each estimator on their
        out of bag samples.
    perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
        The predicted posterior probabilities for each of the permuted
        estimators on their out of bag samples.
    null_dist : ArrayLike of shape (n_repeats,)
        The null statistic differences from permuted forests.

    References
    ----------
    .. footbibliography::
    """
    metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]

    # build two sets of forests
    est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)

    if not isinstance(perm_est, PermutationHonestForestClassifier):
        raise RuntimeError(
            f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
        )
    perm_est, perm_forest_proba = build_oob_forest(
        perm_est, X, y, verbose=verbose, covariate_index=covariate_index
    )

    # get the number of jobs
    n_jobs = est.n_jobs

    if y.ndim == 1:
        y = y.reshape(-1, 1)
    metric_star, metric_star_pi = _compute_null_distribution_coleman(
        y,
        orig_forest_proba,
        perm_forest_proba,
        metric,
        n_repeats=n_repeats,
        seed=seed,
        n_jobs=n_jobs,
        **metric_kwargs,
    )

    y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
    y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
    observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
    permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)

    # metric - metric^\pi is the observed test statistic, which under the
    # null is normally distributed around 0
    observe_test_stat = observe_stat - permute_stat

    # metric^\pi_j - metric_j, which is centered at 0
    null_dist = metric_star_pi - metric_star

    # compute pvalue
    if metric in POSITIVE_METRICS:
        pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats)
    else:
        pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats)

    forest_result = ForestTestResult(
        observe_test_stat, permute_stat, observe_stat, pvalue, null_dist
    )
    if return_posteriors:
        return forest_result, orig_forest_proba, perm_forest_proba, est, perm_est
    else:
        return forest_result
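
# Example usage (a minimal sketch, not from the library docs; it assumes
# ``HonestForestClassifier`` is importable from ``sktree`` and that ``X, y``
# is a binary classification dataset)::
#
#     from sktree import HonestForestClassifier
#     from sktree.stats import PermutationHonestForestClassifier
#
#     est = HonestForestClassifier(n_estimators=100, bootstrap=True)
#     perm_est = PermutationHonestForestClassifier(n_estimators=100, bootstrap=True)
#     result, orig_proba, perm_proba, est, perm_est = build_coleman_forest(
#         est, perm_est, X, y, metric="s@98", n_repeats=10_000, seed=0
#     )
#     print(result.pvalue)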
def build_permutation_forest(
    est,
    perm_est,
    X,
    y,
    covariate_index=None,
    metric="s@98",
    n_repeats=500,
    verbose=False,
    seed=None,
    return_posteriors=True,
    **metric_kwargs,
):
    """Build a hypothesis testing forest using a permutation-forest approach.

    The permutation-forest approach stems from standard permutation testing,
    where each forest is trained on a new permutation of the dataset. The
    original test statistic is computed on the original data. Then the pvalue
    is computed by comparing the original test statistic to the null
    distribution of test statistics computed from the permuted forests.

    Parameters
    ----------
    est : Forest
        The type of forest to use. Must be constructed with ``bootstrap=True``.
    perm_est : Forest
        The forest to use for the permuted dataset. Should be
        ``PermutationHonestForestClassifier``.
    X : ArrayLike of shape (n_samples, n_features)
        Data.
    y : ArrayLike of shape (n_samples, n_outputs)
        Binary target, so ``n_outputs`` should be at most 1.
    covariate_index : ArrayLike of shape (n_covariates,), optional
        The index array of covariates to shuffle, by default None, which
        defaults to all covariates.
    metric : str, optional
        The metric to compute, by default "s@98", for sensitivity at
        98% specificity.
    n_repeats : int, optional
        Number of times to fit a forest on a permuted dataset to construct
        the null distribution, by default 500. The construction of each null
        forest will be parallelized according to the ``n_jobs`` argument of
        the ``est`` forest.
    verbose : bool, optional
        Verbosity, by default False.
    seed : int, optional
        Random seed, by default None.
    return_posteriors : bool, optional
        Whether or not to return the posteriors, by default True.
    **metric_kwargs : dict, optional
        Additional keyword arguments to pass to the metric function.

    Returns
    -------
    observe_test_stat : float
        The observed test statistic, computed on the original data.
    pvalue : float
        The p-value of the test statistic.
    orig_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
        The predicted posterior probabilities for each estimator on their
        out of bag samples.
    perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
        The predicted posterior probabilities for each of the permuted
        estimators on their out of bag samples.

    References
    ----------
    .. footbibliography::
    """
    rng = np.random.default_rng(seed)
    metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]
    if covariate_index is None:
        covariate_index = np.arange(X.shape[1], dtype=int)

    if not isinstance(perm_est, PermutationHonestForestClassifier):
        raise RuntimeError(
            f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
        )

    # train the original forest on unpermuted data
    est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)
    y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
    observe_test_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)

    # sample indices used to permute the rows of the shuffled covariates
    index_arr = np.arange(X.shape[0], dtype=int).reshape(-1, 1)

    # train many null forests
    X_perm = X.copy()
    null_dist = []
    for _ in range(n_repeats):
        rng.shuffle(index_arr)
        perm_X_cov = X_perm[index_arr, covariate_index]
        X_perm[:, covariate_index] = perm_X_cov

        # perm_est = clone(perm_est)
        perm_est.set_params(random_state=rng.integers(0, np.iinfo(np.int32).max))

        perm_est, perm_forest_proba = build_oob_forest(
            perm_est, X_perm, y, verbose=verbose, covariate_index=covariate_index
        )

        y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
        permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)
        null_dist.append(permute_stat)
    null_dist = np.asarray(null_dist)

    # compute the pvalue; note the comparison is the opposite of the Coleman
    # approach, since here we test whether the null distribution yields a test
    # statistic at least as extreme as the observed one
    if metric in POSITIVE_METRICS:
        pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats)
    else:
        pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats)

    forest_result = ForestTestResult(observe_test_stat, permute_stat, None, pvalue, null_dist)
    if return_posteriors:
        return forest_result, orig_forest_proba, perm_forest_proba
    else:
        return forest_result
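
# Example usage (a minimal sketch, same assumptions as the sketch after
# ``build_coleman_forest``). Since every repeat refits an entire forest on a
# freshly permuted dataset, ``n_repeats`` is typically far smaller than in
# the Coleman approach::
#
#     result, orig_proba, perm_proba = build_permutation_forest(
#         est, perm_est, X, y, metric="s@98", n_repeats=100, seed=0
#     )
#     print(result.pvalue)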
def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
    """Build a hypothesis testing forest using oob samples.

    Parameters
    ----------
    est : Forest
        The type of forest to use. Must be constructed with ``bootstrap=True``.
        The forest should have either the ``oob_samples_`` or
        ``estimators_samples_`` property defined, which will be used to
        compute the out of bag samples per tree.
    X : ArrayLike of shape (n_samples, n_features)
        Data.
    y : ArrayLike of shape (n_samples, n_outputs)
        Binary target, so ``n_outputs`` should be at most 1.
    verbose : bool, optional
        Verbosity, by default False.
    **est_kwargs : dict, optional
        Additional keyword arguments to pass to the forest estimator
        ``fit`` function.

    Returns
    -------
    est : Forest
        Fitted forest.
    all_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
        The predicted posterior probabilities for each estimator on their
        out of bag samples.
    """
    assert est.bootstrap
    assert type_of_target(y) == "binary"
    est = clone(est)

    # build forest
    est.fit(X, y.ravel(), **est_kwargs)

    # now evaluate
    X = est._validate_X_predict(X)

    # if we trained a binning tree, then we should re-bin the data
    # XXX: this is inefficient and should be improved to be in line with what
    # the Histogram Gradient Boosting Tree does, where the binning thresholds
    # are passed into the tree itself, thus allowing us to set the node feature
    # value thresholds within the tree itself.
    if est.max_bins is not None:
        X = est._bin_data(X, is_training_data=False).astype(DTYPE)

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs)

    # a lock guards concurrent writes into the shared posterior array
    lock = threading.Lock()

    # accumulate the predictions across all trees; entries for samples that
    # are in-bag for a given tree remain NaN
    all_proba = np.full(
        (len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
    )
    if hasattr(est, "oob_samples_"):
        Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
            delayed(_parallel_predict_proba_oob)(
                e.predict_proba, X, all_proba, idx, test_idx, lock
            )
            for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
        )
    else:
        inbag_samples = est.estimators_samples_
        all_samples = np.arange(X.shape[0])
        oob_samples_list = [
            np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
        ]
        Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
            delayed(_parallel_predict_proba_oob)(
                e.predict_proba, X, all_proba, idx, test_idx, lock
            )
            for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list))
        )

    return est, all_proba
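
# Example usage (a minimal sketch; the forest must be constructed with
# ``bootstrap=True`` so that each tree has out-of-bag samples)::
#
#     fitted_est, oob_proba = build_oob_forest(est, X, y)
#     # ensemble posterior per sample, averaged over only the trees for
#     # which that sample was out-of-bag
#     y_proba = np.nanmean(oob_proba, axis=0)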
def build_cv_forest(
    est,
    X,
    y,
    cv=5,
    test_size=0.2,
    verbose=False,
    return_indices=False,
    seed=None,
):
    """Build a hypothesis testing forest using cross-validation.

    Parameters
    ----------
    est : Forest
        The type of forest to use. Must be constructed with ``bootstrap=True``.
    X : ArrayLike of shape (n_samples, n_features)
        Data.
    y : ArrayLike of shape (n_samples, n_outputs)
        Binary target, so ``n_outputs`` should be at most 1.
    cv : int, optional
        Number of folds to use for cross-validation, by default 5.
    test_size : float, optional
        Proportion of samples per tree to use for the test set, by default 0.2.
    verbose : bool, optional
        Verbosity, by default False.
    return_indices : bool, optional
        Whether or not to return the train and test indices, by default False.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    est_list : list of Forest
        The fitted forest for each split.
    all_proba_list : list of ArrayLike of shape (n_estimators, n_samples, n_outputs)
        The predicted posterior probabilities for each estimator on their
        out of bag samples. Length of list is equal to the number of splits.
    train_idx_list : list of ArrayLike of shape (n_samples,)
        The training indices for each split.
    test_idx_list : list of ArrayLike of shape (n_samples,)
        The testing indices for each split.
    """
    X = X.astype(np.float32)
    if cv is not None:
        cv = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
        n_splits = cv.get_n_splits()
        train_idx_list, test_idx_list = [], []
        for train_idx, test_idx in cv.split(X, y):
            train_idx_list.append(train_idx)
            test_idx_list.append(test_idx)
    else:
        n_samples_idx = np.arange(len(y))
        train_idx, test_idx = train_test_split(
            n_samples_idx, test_size=test_size, random_state=seed, shuffle=True, stratify=y
        )
        train_idx_list = [train_idx]
        test_idx_list = [test_idx]
        n_splits = 1

    est_list = []
    all_proba_list = []
    for isplit in range(n_splits):
        # clone the estimator to remove all fitted attributes
        est = clone(est)

        X_train, y_train = X[train_idx_list[isplit], :], y[train_idx_list[isplit]]
        # X_test = X[test_idx_list[isplit], :]

        # build forest
        est.fit(X_train, y_train)

        # now evaluate
        X = est._validate_X_predict(X)

        # if we trained a binning tree, then we should re-bin the data
        # XXX: this is inefficient and should be improved to be in line with what
        # the Histogram Gradient Boosting Tree does, where the binning thresholds
        # are passed into the tree itself, thus allowing us to set the node feature
        # value thresholds within the tree itself.
        if est.max_bins is not None:
            X = est._bin_data(X, is_training_data=False).astype(DTYPE)

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs)

        # collect each tree's predictions on this split's held-out samples
        all_proba = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_parallel_predict_proba)(e.predict_proba, X, test_idx_list[isplit])
            for e in est.estimators_
        )
        posterior_arr = np.full((est.n_estimators, X.shape[0], est.n_classes_), np.nan)
        for itree, proba in enumerate(all_proba):
            posterior_arr[itree, test_idx_list[isplit], ...] = proba.reshape(-1, est.n_classes_)

        all_proba_list.append(posterior_arr)
        est_list.append(est)

    if return_indices:
        return est_list, all_proba_list, train_idx_list, test_idx_list

    return est_list, all_proba_list
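
# Example usage (a minimal sketch): with ``cv=5``, each entry of the returned
# posterior list is non-NaN only on that fold's held-out test indices::
#
#     est_list, proba_list, train_idx_list, test_idx_list = build_cv_forest(
#         est, X, y, cv=5, return_indices=True, seed=0
#     )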