Source code for sktree.datasets.hyppo

import warnings

import numpy as np
from scipy.integrate import nquad
from scipy.stats import entropy, multivariate_normal


def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None):
    """Simulate classification data from a quadratic model.

    This is a form of the simulation used in :footcite:`panda2018learning`.

    Parameters
    ----------
    n_samples : int
        The number of samples to generate.
    n_features : int
        The number of dimensions in the dataset.
    noise : bool, optional
        Whether or not to add noise, by default False.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    x : array-like, shape (2 * n_samples, n_features)
        Data array.
    v : array-like, shape (2 * n_samples,)
        Target array of 1's and 0's.

    References
    ----------
    .. footbibliography::
    """
    rng = np.random.default_rng(seed)

    x = rng.standard_normal(size=(n_samples, n_features))
    coeffs = np.array([np.exp(-0.0325 * (i + 24)) for i in range(n_features)])
    eps = rng.standard_normal(size=(n_samples, n_features))

    x_coeffs = x * coeffs
    y = x_coeffs**2 + noise * eps

    # generate the classification labels
    n1 = x.shape[0]
    n2 = y.shape[0]
    v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
    x = np.vstack((x, y))
    return x, v
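# Illustrative usage sketch (not part of the library source); the parameter values
# below are arbitrary.
#
#   X, y = make_quadratic_classification(n_samples=50, n_features=5, noise=True, seed=0)
#   # X stacks the linear and quadratic samples, so it has 2 * 50 = 100 rows;
#   # y holds 50 zeros followed by 50 ones.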
# Dictionary of simulations from Marron and Wand 1992
# keys: names of each simulation corresponding to the class MarronWandSims
# values: probabilities associated with the mixture of Gaussians
MARRON_WAND_SIMS = {
    "gaussian": [1],
    "skewed_unimodal": [1 / 5, 1 / 5, 3 / 5],
    "strongly_skewed": [1 / 8] * 8,
    "kurtotic_unimodal": [2 / 3, 1 / 3],
    "outlier": [1 / 10, 9 / 10],
    "bimodal": [1 / 2] * 2,
    "separated_bimodal": [1 / 2] * 2,
    "skewed_bimodal": [3 / 4, 1 / 4],
    "trimodal": [9 / 20, 9 / 20, 1 / 10],
    "claw": [1 / 2, *[1 / 10] * 5],
    "double_claw": [49 / 100, 49 / 100, *[1 / 350] * 7],
    "asymmetric_claw": [1 / 2, *[2 ** (1 - i) / 31 for i in range(-2, 3)]],
    "asymmetric_double_claw": [*[46 / 100] * 2, *[1 / 300] * 3, *[7 / 300] * 3],
    "smooth_comb": [2 ** (5 - i) / 63 for i in range(6)],
    "discrete_comb": [*[2 / 7] * 3, *[1 / 21] * 3],
}
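# Illustrative check (not part of the library source): every entry above is a mixture
# weight vector, so each one sums to 1.
#
#   assert all(np.isclose(sum(p), 1.0) for p in MARRON_WAND_SIMS.values())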
def make_marron_wand_classification(
    n_samples,
    n_dim=4096,
    n_informative=256,
    simulation: str = "gaussian",
    rho: int = 0,
    band_type: str = "ma",
    return_params: bool = False,
    scaling_factor: float = 1.0,
    seed=None,
):
    """Generate Marron-Wand binary classification dataset.

    The simulation is similar to that of :func:`sktree.datasets.make_trunk_classification`
    where the first class is generated from a multivariate Gaussian with a mean vector of
    zeros. The second class is generated from a mixture of Gaussians with mean vectors
    specified by the Marron-Wand simulations, but as the dimensionality increases, the
    second class distribution approaches the first class distribution by a factor of
    :math:`1 / sqrt(d)`.

    Full details for the Marron-Wand simulations can be found in :footcite:`marron1992exact`.

    Instead of the identity covariance matrix, one can implement a banded covariance matrix
    that follows :footcite:`Bickel_2008`.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate. Must be an even number, else the total number of
        samples generated will be ``n_samples - 1``.
    n_dim : int, optional
        The dimensionality of the dataset, by default 4096.
    n_informative : int, optional
        The informative dimensions. All others for ``n_dim - n_informative``
        are Gaussian noise. Default is 256.
    simulation : str, optional
        Which simulation to run. Must be one of the following Marron-Wand simulations:
        'gaussian', 'skewed_unimodal', 'strongly_skewed', 'kurtotic_unimodal', 'outlier',
        'bimodal', 'separated_bimodal', 'skewed_bimodal', 'trimodal', 'claw', 'double_claw',
        'asymmetric_claw', 'asymmetric_double_claw', 'smooth_comb', 'discrete_comb'.
        When calling the Marron-Wand simulations, only the covariance parameters are
        considered (`rho` and `band_type`). Means are taken from :footcite:`marron1992exact`.
        By default 'gaussian'.
    rho : float, optional
        The covariance value of the bands. By default 0, indicating that an identity matrix
        is used.
    band_type : str
        The band type to use. For details, see Example 1 and 2 in :footcite:`Bickel_2008`.
        Either 'ma', or 'ar'.
    return_params : bool, optional
        Whether or not to return the distribution parameters of the classes' normal
        distributions.
    scaling_factor : float, optional
        The scaling factor for the covariance matrix. By default 1.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_dim), dtype=np.float64
        Trunk dataset as a dense array.
    y : np.ndarray of shape (n_samples,), dtype=np.intp
        Labels of the dataset.
    G : np.ndarray of shape (n_samples, n_dim), dtype=np.float64
        The mixture of Gaussians for the Marron-Wand simulations.
        Returned if ``return_params`` is True.
    w : np.ndarray of shape (n_dim,), dtype=np.float64
        The weight vector for the Marron-Wand simulations.
        Returned if ``return_params`` is True.

    Notes
    -----
    **Marron-Wand Simulations**: The Marron-Wand simulations generate two classes of data
    with the setup specified in the paper.

    Covariance: The covariance matrix among different dimensions is controlled by the
    ``rho`` parameter and the ``band_type`` parameter. The ``band_type`` parameter controls
    the type of band to use, while the ``rho`` parameter controls the specific scaling factor
    for the covariance matrix while going from one dimension to the next.

    For each dimension in the first distribution, there is a mean of :math:`1 / sqrt(d)`,
    where ``d`` is the dimensionality. The covariance is the identity matrix. The second
    distribution has a mean vector that is the negative of the first. As ``d`` increases,
    the two distributions become closer and closer. Full details for the trunk simulation
    can be found in :footcite:`trunk1982`.

    References
    ----------
    .. footbibliography::
    """
    if n_dim < n_informative:
        n_informative = n_dim
        warnings.warn(
            f"Number of informative dimensions {n_informative} must be less than number "
            f"of dimensions, {n_dim}. Setting n_informative to n_dim.",
            RuntimeWarning,
        )
    if simulation not in MARRON_WAND_SIMS.keys():
        raise ValueError(f"Simulation must be one of: {list(MARRON_WAND_SIMS.keys())}")

    rng = np.random.default_rng(seed=seed)

    if rho != 0:
        if band_type == "ma":
            cov = _moving_avg_cov(n_informative, rho)
        elif band_type == "ar":
            cov = _autoregressive_cov(n_informative, rho)
        else:
            raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
    else:
        cov = np.identity(n_informative)

    # allow arbitrary uniform scaling of the covariance matrix
    cov = scaling_factor * cov

    # speed up sampling from large multivariate normals by using the faster Cholesky
    # factorization instead of the slower SVD
    if n_informative > 1000:
        mvg_sampling_method = "cholesky"
    else:
        mvg_sampling_method = "svd"

    mixture_idx = rng.choice(
        len(MARRON_WAND_SIMS[simulation]),  # type: ignore
        size=n_samples // 2,
        replace=True,
        p=MARRON_WAND_SIMS[simulation],
    )
    # the parameters used for each Gaussian in the mixture for each Marron Wand simulation
    norm_params = MarronWandSims(n_dim=n_informative, cov=cov)(simulation)
    G = np.fromiter(
        (
            rng_children.multivariate_normal(*(norm_params[i]), size=1, method=mvg_sampling_method)
            for i, rng_children in zip(mixture_idx, rng.spawn(n_samples // 2))
        ),
        dtype=np.dtype((float, n_informative)),
    )

    # as the dimensionality of the simulations increases, we add more and more noise
    # to the data using the w parameter
    w_vec = np.array([1.0 / np.sqrt(i) for i in range(1, n_informative + 1)])

    # create new generator instance to ensure reproducibility with multiple runs with the same seed
    rng_F = np.random.default_rng(seed=seed).spawn(2)
    X = np.vstack(
        (
            rng_F[0].multivariate_normal(
                np.zeros(n_informative), cov, n_samples // 2, method=mvg_sampling_method
            ),
            (1 - w_vec)
            * rng_F[1].multivariate_normal(
                np.zeros(n_informative), cov, n_samples // 2, method=mvg_sampling_method
            )
            + w_vec * G.reshape(n_samples // 2, n_informative),
        )
    )

    if n_dim > n_informative:
        # create new generator instance to ensure reproducibility with multiple runs with the same seed
        rng_noise = np.random.default_rng(seed=seed)
        X = np.hstack(
            (
                X,
                np.hstack(
                    [
                        rng_children.normal(loc=0, scale=1, size=(X.shape[0], 1))
                        for rng_children in rng_noise.spawn(n_dim - n_informative)
                    ]
                ),
            )
        )

    y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

    if return_params:
        return [X, y, *list(zip(*norm_params)), G, w_vec]
    return X, y
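# Illustrative usage sketch (not part of the library source); small, arbitrary
# parameter values chosen so the example runs quickly.
#
#   X, y = make_marron_wand_classification(
#       n_samples=100, n_dim=10, n_informative=10, simulation="bimodal", seed=0
#   )
#   # X has shape (100, 10); y holds 50 zeros (plain Gaussian class) followed by
#   # 50 ones (Marron-Wand "bimodal" mixture class).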
def make_trunk_mixture_classification(
    n_samples,
    n_dim=4096,
    n_informative=256,
    mu_0: int = 0,
    mu_1: int = 1,
    rho: int = 0,
    band_type: str = "ma",
    return_params: bool = False,
    mix: float = 0.5,
    scaling_factor: float = 1.0,
    seed=None,
):
    """Generate trunk mixture binary classification dataset.

    The first class is generated from a multivariate Gaussian with a mean vector of zeros.
    The second class is generated from a mixture of Gaussians with mean vectors specified by
    ``mu_0`` and ``mu_1``. The mixture is specified by the ``mix`` parameter, which is the
    probability of the first Gaussian in the mixture.

    For each dimension in the first distribution, there is a mean of :math:`1 / sqrt(d)`,
    where ``d`` is the dimensionality. The covariance is the identity matrix.
    The second distribution has a mean vector that is the negative of the first.
    As ``d`` increases, the two distributions become closer and closer.

    Full details for the trunk simulation can be found in :footcite:`trunk1982`.

    Instead of the identity covariance matrix, one can implement a banded covariance matrix
    that follows :footcite:`Bickel_2008`.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate. Must be an even number, else the total number of
        samples generated will be ``n_samples - 1``.
    n_dim : int, optional
        The dimensionality of the dataset, by default 4096.
    n_informative : int, optional
        The informative dimensions. All others for ``n_dim - n_informative``
        are Gaussian noise. Default is 256.
    mu_0 : int, optional
        The mean of the first distribution. By default 0. The mean of the distribution will
        decrease by a factor of ``sqrt(i)`` for each dimension ``i``.
    mu_1 : int, optional
        The mean of the second distribution. By default 1. The mean of the distribution will
        decrease by a factor of ``sqrt(i)`` for each dimension ``i``.
    rho : float, optional
        The covariance value of the bands. By default 0, indicating that an identity matrix
        is used.
    band_type : str
        The band type to use. For details, see Example 1 and 2 in :footcite:`Bickel_2008`.
        Either 'ma', or 'ar'.
    return_params : bool, optional
        Whether or not to return the distribution parameters of the classes' normal
        distributions.
    mix : float, optional
        The probability of the first Gaussian in the mixture of Gaussians for the second
        class (the ``trunk-mix`` simulation). By default 0.5.
    scaling_factor : float, optional
        The scaling factor for the covariance matrix. By default 1.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_dim), dtype=np.float64
        Trunk dataset as a dense array.
    y : np.ndarray of shape (n_samples,), dtype=np.intp
        Labels of the dataset.
    means : list of ArrayLike of shape (n_dim,), dtype=np.float64
        The mean vector for each class starting with class 0.
        Returned if ``return_params`` is True.
    covs : list of ArrayLike of shape (n_dim, n_dim), dtype=np.float64
        The covariance for each class. Returned if ``return_params`` is True.
    X_mixture : np.ndarray of shape (n_samples, n_dim), dtype=np.float64
        The mixture of Gaussians. Returned if ``return_params`` is True.

    Notes
    -----
    **Trunk**: The trunk simulation decreases the signal-to-noise ratio as the dimensionality
    increases. This is implemented by decreasing the mean of the distribution by a factor of
    ``sqrt(i)`` for each dimension ``i``. Thus for instance if the means of distribution one
    and two are 1 and -1 respectively, the means for the first dimension will be 1 and -1,
    for the second dimension will be 1/sqrt(2) and -1/sqrt(2), and so on.

    **Trunk Mix**: The trunk mix simulation generates two classes of data with the same
    covariance matrix. The first class (label 0) is generated from a multivariate Gaussian
    with a mean vector of zeros and the second class is generated from a mixture of Gaussians
    with mean vectors specified by ``mu_0`` and ``mu_1``. The mixture is specified by the
    ``mix`` parameter, which is the probability of the first Gaussian in the mixture.

    Covariance: The covariance matrix among different dimensions is controlled by the ``rho``
    parameter and the ``band_type`` parameter. The ``band_type`` parameter controls the type
    of band to use, while the ``rho`` parameter controls the specific scaling factor for the
    covariance matrix while going from one dimension to the next.

    References
    ----------
    .. footbibliography::
    """
    if n_dim < n_informative:
        n_informative = n_dim
        warnings.warn(
            f"Number of informative dimensions {n_informative} must be less than number "
            f"of dimensions, {n_dim}. Setting n_informative to n_dim.",
            RuntimeWarning,
        )
    if mix < 0 or mix > 1:  # type: ignore
        raise ValueError("Mix must be between 0 and 1.")

    rng = np.random.default_rng(seed=seed)

    mu_1_vec = np.array([mu_1 / np.sqrt(i) for i in range(1, n_informative + 1)])
    mu_0_vec = np.array([mu_0 / np.sqrt(i) for i in range(1, n_informative + 1)])

    if rho != 0:
        if band_type == "ma":
            cov = _moving_avg_cov(n_informative, rho)
        elif band_type == "ar":
            cov = _autoregressive_cov(n_informative, rho)
        else:
            raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
    else:
        cov = np.identity(n_informative)

    # Note: When variance is 1, trunk-mix does not look bimodal at low dimensions.
    # Setting it to (2/3)**2 is consistent with the Marron and Wand bimodal simulation.
    cov = scaling_factor * cov

    # speed up sampling from large multivariate normals by using the faster Cholesky
    # factorization instead of the slower SVD
    if n_informative > 1000:
        method = "cholesky"
    else:
        method = "svd"

    mixture_idx = rng.choice(2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix])  # type: ignore
    norm_params = [[mu_0_vec, cov], [mu_1_vec, cov]]
    dim_sample = len(norm_params[0][0])  # Dimensionality of the samples
    X_mixture = np.empty((n_samples // 2, dim_sample))  # Pre-allocate array for samples
    for idx, (i, rng_child) in enumerate(zip(mixture_idx, rng.spawn(n_samples // 2))):
        mean, cov = norm_params[i]
        X_mixture[idx, :] = rng_child.multivariate_normal(mean, cov, size=1, method=method)

    # create new generator instance to ensure reproducibility with multiple runs with the same seed
    rng_F = np.random.default_rng(seed=seed)
    X = np.vstack(
        (
            rng_F.multivariate_normal(np.zeros(n_informative), cov, n_samples // 2, method=method),
            X_mixture.reshape(n_samples // 2, n_informative),
        )
    )

    if n_dim > n_informative:
        # create new generator instance to ensure reproducibility with multiple runs with the same seed
        rng_noise = np.random.default_rng(seed=seed)
        X = np.hstack(
            (
                X,
                np.hstack(
                    [
                        rng_children.normal(loc=0, scale=1, size=(X.shape[0], 1))
                        for rng_children in rng_noise.spawn(n_dim - n_informative)
                    ]
                ),
            )
        )

    y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

    if return_params:
        return [X, y, *list(zip(*norm_params)), X_mixture]
    return X, y
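# Illustrative usage sketch (not part of the library source); small, arbitrary
# parameter values chosen so the example runs quickly.
#
#   X, y = make_trunk_mixture_classification(
#       n_samples=100, n_dim=10, n_informative=10, mu_0=0, mu_1=5, mix=0.75, seed=0
#   )
#   # X has shape (100, 10); rows 0-49 come from the zero-mean Gaussian (label 0) and
#   # rows 50-99 from the two-component trunk mixture (label 1).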
def make_trunk_classification(
    n_samples,
    n_dim=4096,
    n_informative=256,
    mu_0: int = 0,
    mu_1: int = 1,
    rho: int = 0,
    band_type: str = "ma",
    return_params: bool = False,
    scaling_factor: float = 1.0,
    seed=None,
):
    """Generate trunk binary classification dataset.

    For each dimension in the first distribution, there is a mean of :math:`1 / sqrt(d)`,
    where ``d`` is the dimensionality. The covariance is the identity matrix.
    The second distribution has a mean vector that is the negative of the first.
    As ``d`` increases, the two distributions become closer and closer.

    Full details for the trunk simulation can be found in :footcite:`trunk1982`.

    Instead of the identity covariance matrix, one can implement a banded covariance matrix
    that follows :footcite:`Bickel_2008`.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate. Must be an even number, else the total number of
        samples generated will be ``n_samples - 1``.
    n_dim : int, optional
        The dimensionality of the dataset, by default 4096.
    n_informative : int, optional
        The informative dimensions. All others for ``n_dim - n_informative``
        are Gaussian noise. Default is 256.
    mu_0 : int, optional
        The mean of the first distribution. By default 0. The mean of the distribution will
        decrease by a factor of ``sqrt(i)`` for each dimension ``i``.
    mu_1 : int, optional
        The mean of the second distribution. By default 1. The mean of the distribution will
        decrease by a factor of ``sqrt(i)`` for each dimension ``i``.
    rho : float, optional
        The covariance value of the bands. By default 0, indicating that an identity matrix
        is used.
    band_type : str
        The band type to use. For details, see Example 1 and 2 in :footcite:`Bickel_2008`.
        Either 'ma', or 'ar'.
    return_params : bool, optional
        Whether or not to return the distribution parameters of the classes' normal
        distributions. Default False.
    scaling_factor : float, optional
        The scaling factor for the covariance matrix. By default 1.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_dim), dtype=np.float64
        Trunk dataset as a dense array.
    y : np.ndarray of shape (n_samples,), dtype=np.intp
        Labels of the dataset.
    means : list of ArrayLike of shape (n_dim,), dtype=np.float64
        The mean vector for each class starting with class 0.
        Returned if ``return_params`` is True.
    covs : list of ArrayLike of shape (n_dim, n_dim), dtype=np.float64
        The covariance for each class. Returned if ``return_params`` is True.

    Notes
    -----
    **Trunk**: The trunk simulation decreases the signal-to-noise ratio as the dimensionality
    increases. This is implemented by decreasing the mean of the distribution by a factor of
    ``sqrt(i)`` for each dimension ``i``. Thus for instance if the means of distribution one
    and two are 1 and -1 respectively, the means for the first dimension will be 1 and -1,
    for the second dimension will be 1/sqrt(2) and -1/sqrt(2), and so on.

    **Trunk Overlap**: The trunk overlap simulation generates two classes of data with the
    same covariance matrix and mean vector of zeros.

    Covariance: The covariance matrix among different dimensions is controlled by the ``rho``
    parameter and the ``band_type`` parameter. The ``band_type`` parameter controls the type
    of band to use, while the ``rho`` parameter controls the specific scaling factor for the
    covariance matrix while going from one dimension to the next.

    References
    ----------
    .. footbibliography::
    """
    if n_dim < n_informative:
        n_informative = n_dim
        warnings.warn(
            f"Number of informative dimensions {n_informative} must be less than number "
            f"of dimensions, {n_dim}. Setting n_informative to n_dim.",
            RuntimeWarning,
        )

    rng = np.random.default_rng(seed=seed)

    mu_1_vec = np.array([mu_1 / np.sqrt(i) for i in range(1, n_informative + 1)])
    mu_0_vec = np.array([mu_0 / np.sqrt(i) for i in range(1, n_informative + 1)])

    if rho != 0:
        if band_type == "ma":
            cov = _moving_avg_cov(n_informative, rho)
        elif band_type == "ar":
            cov = _autoregressive_cov(n_informative, rho)
        else:
            raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
    else:
        cov = np.identity(n_informative)

    # allow arbitrary uniform scaling of the covariance matrix
    cov = scaling_factor * cov

    # speed up sampling from large multivariate normals by using the faster Cholesky
    # factorization instead of the slower SVD
    if n_informative > 1000:
        method = "cholesky"
    else:
        method = "svd"

    X = np.vstack(
        [
            rng_children.multivariate_normal(mu_vec, cov, n_samples // 2, method=method)
            for rng_children, mu_vec in zip(rng.spawn(2), [mu_0_vec, mu_1_vec])
        ]
    )

    if n_dim > n_informative:
        X = np.hstack(
            (
                X,
                np.hstack(
                    [
                        rng_children.normal(loc=0, scale=1, size=(X.shape[0], 1))
                        for rng_children in rng.spawn(n_dim - n_informative)
                    ]
                ),
            )
        )

    y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

    if return_params:
        return [X, y, [mu_0_vec, mu_1_vec], [cov, cov]]
    return X, y
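# Illustrative usage sketch (not part of the library source); 5 informative and
# 5 noise dimensions, arbitrary values chosen so the example runs quickly.
#
#   X, y, means, covs = make_trunk_classification(
#       n_samples=100, n_dim=10, n_informative=5, mu_0=1, mu_1=-1, return_params=True, seed=0
#   )
#   # X has shape (100, 10); means[0][i] == 1 / sqrt(i + 1) and means[1] == -means[0].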
class MarronWandSims:
    def __init__(self, n_dim=1, cov=1):
        self.n_dim = n_dim
        self.cov = cov

    def __call__(self, simulation):
        sims = self._my_method_generator()
        if simulation in sims.keys():
            return sims[simulation]()
        else:
            raise ValueError(f"simulation is not one of these: {sims.keys()}")

    def _my_method_generator(self):
        return {
            method: getattr(self, method) for method in dir(self) if not method.startswith("__")
        }

    def gaussian(self):
        return [[np.zeros(self.n_dim), self.cov]]

    def skewed_unimodal(self):
        return [
            [np.zeros(self.n_dim), self.cov],
            [np.full(self.n_dim, 1 / 2), self.cov * (2 / 3) ** 2],
            [np.full(self.n_dim, 13 / 12), self.cov * (5 / 9) ** 2],
        ]

    def strongly_skewed(self):
        return [
            [np.full(self.n_dim, 3 * ((2 / 3) ** l_mix - 1)), self.cov * (2 / 3) ** (2 * l_mix)]
            for l_mix in range(8)
        ]

    def kurtotic_unimodal(self):
        return [[np.zeros(self.n_dim), self.cov], [np.zeros(self.n_dim), self.cov * (1 / 10) ** 2]]

    def outlier(self):
        return [[np.zeros(self.n_dim), self.cov], [np.zeros(self.n_dim), self.cov * (1 / 10) ** 2]]

    def bimodal(self):
        return [
            [-np.ones(self.n_dim), self.cov * (2 / 3) ** 2],
            [np.ones(self.n_dim), self.cov * (2 / 3) ** 2],
        ]

    def separated_bimodal(self):
        return [
            [-np.full(self.n_dim, 3 / 2), self.cov * (1 / 2) ** 2],
            [np.full(self.n_dim, 3 / 2), self.cov * (1 / 2) ** 2],
        ]

    def skewed_bimodal(self):
        return [
            [np.zeros(self.n_dim), self.cov],
            [np.full(self.n_dim, 3 / 2), self.cov * (1 / 3) ** 2],
        ]

    def trimodal(self):
        return [
            [np.full(self.n_dim, -6 / 5), self.cov * (3 / 5) ** 2],
            [np.full(self.n_dim, 6 / 5), self.cov * (3 / 5) ** 2],
            [np.zeros(self.n_dim), self.cov * (1 / 4) ** 2],
        ]

    def claw(self):
        return [
            [np.zeros(self.n_dim), self.cov],
            *[
                [np.full(self.n_dim, (l_mix / 2) - 1), self.cov * (1 / 10) ** 2]
                for l_mix in range(5)
            ],
        ]

    def double_claw(self):
        return [
            [-np.ones(self.n_dim), self.cov * (2 / 3) ** 2],
            [np.ones(self.n_dim), self.cov * (2 / 3) ** 2],
            *[
                [np.full(self.n_dim, (l_mix - 3) / 2), self.cov * (1 / 100) ** 2]
                for l_mix in range(7)
            ],
        ]

    def asymmetric_claw(self):
        return [
            [np.zeros(self.n_dim), self.cov],
            *[
                [np.full(self.n_dim, l_mix + 1 / 2), self.cov * (1 / ((2**l_mix) * 10)) ** 2]
                for l_mix in range(-2, 3)
            ],
        ]

    def asymmetric_double_claw(self):
        return [
            *[[np.full(self.n_dim, 2 * l_mix - 1), self.cov * (2 / 3) ** 2] for l_mix in range(2)],
            *[
                [-np.full(self.n_dim, l_mix / 2), self.cov * (1 / 100) ** 2]
                for l_mix in range(1, 4)
            ],
            *[[np.full(self.n_dim, l_mix / 2), self.cov * (7 / 100) ** 2] for l_mix in range(1, 4)],
        ]

    def smooth_comb(self):
        return [
            [
                np.full(self.n_dim, (65 - 96 * ((1 / 2) ** l_mix)) / 21),
                self.cov * (32 / 63) ** 2 / (2 ** (2 * l_mix)),
            ]
            for l_mix in range(6)
        ]

    def discrete_comb(self):
        return [
            *[
                [np.full(self.n_dim, (12 * l_mix - 15) / 7), self.cov * (2 / 7) ** 2]
                for l_mix in range(3)
            ],
            *[
                [np.full(self.n_dim, (2 * l_mix) / 7), self.cov * (1 / 21) ** 2]
                for l_mix in range(8, 11)
            ],
        ]


def _moving_avg_cov(n_dim, rho):
    # Create a meshgrid of indices
    i, j = np.meshgrid(np.arange(1, n_dim + 1), np.arange(1, n_dim + 1), indexing="ij")

    # Calculate the covariance matrix using the corrected formula
    cov_matrix = rho ** np.abs(i - j)

    # Apply the banding condition
    cov_matrix[abs(i - j) > 1] = 0
    return cov_matrix


def _autoregressive_cov(n_dim, rho):
    # Create a meshgrid of indices
    i, j = np.meshgrid(np.arange(1, n_dim + 1), np.arange(1, n_dim + 1), indexing="ij")

    # Calculate the covariance matrix using the corrected formula
    cov_matrix = rho ** np.abs(i - j)

    return cov_matrix
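# Illustrative sketch (not part of the library source): the two banded covariance
# helpers for three dimensions and rho = 0.5. The "ma" band keeps only adjacent
# dimensions correlated, while the "ar" band decays geometrically with distance.
#
#   _moving_avg_cov(3, 0.5)      # [[1, 0.5, 0], [0.5, 1, 0.5], [0, 0.5, 1]]
#   _autoregressive_cov(3, 0.5)  # [[1, 0.5, 0.25], [0.5, 1, 0.5], [0.25, 0.5, 1]]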
def approximate_clf_mutual_information(
    means, covs, class_probs=[0.5, 0.5], base=np.exp(1), seed=None
):
    """Approximate MI for multivariate Gaussian for a classification setting.

    Parameters
    ----------
    means : list of ArrayLike of shape (n_dim,)
        A list of means to sample from for each class.
    covs : list of ArrayLike of shape (n_dim, n_dim)
        A list of covariances to sample from for each class.
    class_probs : list, optional
        List of class probabilities, by default [0.5, 0.5] for
        balanced binary classification.
    base : float, optional
        The bit base to use, by default np.exp(1) for natural logarithm.
    seed : int, optional
        Random seed for the multivariate normal, by default None.

    Returns
    -------
    I_XY : float
        Estimated mutual information.
    H_X : float
        Estimated entropy of X, the mixture of multivariate Gaussians.
    H_XY : float
        The conditional entropy of X given Y.
    int_err : float
        The integration error for ``H_X``.
    """
    # this implicitly assumes that the signal of interest is between -10 and 10
    scale = 10
    n_dims = [cov.shape[1] for cov in covs]
    lims = [[-scale, scale]] * max(n_dims)

    # Integrand for the entropy of X, the mixture of Gaussians.
    def func(*args):
        x = np.array(args)
        p = 0
        for k in range(len(means)):
            p += class_probs[k] * multivariate_normal.pdf(x, means[k], covs[k])
        return -p * np.log(p) / np.log(base)

    # numerically integrate H(X)
    # opts = dict(limit=1000)
    H_X, int_err = nquad(func, lims)

    # Compute MI.
    H_XY = 0
    for k in range(len(means)):
        H_XY += (
            class_probs[k]
            * (n_dims[k] * np.log(2 * np.pi) + np.log(np.linalg.det(covs[k])) + n_dims[k])
            / (2 * np.log(base))
        )
    I_XY = H_X - H_XY
    return I_XY, H_X, H_XY, int_err
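# Illustrative usage sketch (not part of the library source): mutual information between
# a balanced binary label and two unit-variance 1-d Gaussians one standard deviation apart.
#
#   means = [np.array([0.0]), np.array([1.0])]
#   covs = [np.eye(1), np.eye(1)]
#   I_XY, H_X, H_XY, int_err = approximate_clf_mutual_information(means, covs)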
def approximate_clf_mutual_information_with_monte_carlo(
    means, covs, n_samples=100_000, class_probs=[0.5, 0.5], base=np.exp(1), seed=None
):
    """Approximate MI for multivariate Gaussian for a classification setting.

    Parameters
    ----------
    means : list of ArrayLike of shape (n_dim,)
        A list of means to sample from for each class.
    covs : list of ArrayLike of shape (n_dim, n_dim)
        A list of covariances to sample from for each class.
    n_samples : int
        The total number of simulation samples, by default 100_000.
    class_probs : list, optional
        List of class probabilities, by default [0.5, 0.5] for
        balanced binary classification.
    base : float, optional
        The bit base to use, by default np.exp(1) for natural logarithm.
    seed : int, optional
        Random seed for the multivariate normal, by default None.

    Returns
    -------
    I_XY : float
        Estimated mutual information.
    H_Y : float
        Estimated entropy of Y, the class labels.
    H_Y_on_X : float
        The conditional entropy of Y given X.
    """
    rng = np.random.default_rng(seed=seed)
    P_Y = class_probs

    # Generate samples
    pdf_class = []
    X = []
    for i in range(len(means)):
        pdf_class.append(multivariate_normal(means[i], covs[i], allow_singular=True))
        X.append(
            rng.multivariate_normal(means[i], covs[i], size=int(n_samples * P_Y[i])).reshape(-1, 1)
        )
    X = np.vstack(X)

    # Calculate P(X) by law of total probability
    P_X_l = []
    P_X_on_Y = []
    for i in range(len(means)):
        P_X_on_Y.append(pdf_class[i].pdf(X))
        P_X_l.append(P_X_on_Y[-1] * P_Y[i])
    P_X = sum(P_X_l)

    # Calculate P(Y|X) by Bayes' theorem
    P_Y_on_X = []
    for i in range(len(means)):
        P_Y_on_X.append((P_X_on_Y[i] * P_Y[i] / P_X).reshape(-1, 1))
    P_Y_on_X = np.hstack(P_Y_on_X)
    P_Y_on_X = P_Y_on_X[~np.isnan(P_Y_on_X)].reshape(-1, 2)

    # Calculate the entropy of Y by class counts
    H_Y = entropy(P_Y, base=base)
    # Calculate the conditional entropy of Y on X
    H_Y_on_X = np.mean(entropy(P_Y_on_X, base=base, axis=1))

    MI = H_Y - H_Y_on_X
    return MI, H_Y, H_Y_on_X
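# Illustrative usage sketch (not part of the library source): the Monte Carlo estimate
# for the same two 1-d Gaussians as above, using a modest sample budget.
#
#   means = [np.array([0.0]), np.array([1.0])]
#   covs = [np.eye(1), np.eye(1)]
#   I_XY, H_Y, H_Y_on_X = approximate_clf_mutual_information_with_monte_carlo(
#       means, covs, n_samples=10_000, seed=0
#   )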
def _compute_mi_bounds(means, covs, class_probs):
    # compute bounds using https://arxiv.org/pdf/2101.11670.pdf
    prior_y = np.array(class_probs)
    H_Y = entropy(prior_y, base=np.exp(1))

    # number of mixtures
    N = len(class_probs)

    cov = covs[0]

    H_YX_lb = 0.0
    H_YX_ub = 0.0
    for idx in range(N):
        num = 0.0
        denom = 0.0
        mean_i = means[idx]

        for jdx in range(N):
            mean_j = means[jdx]
            c_alpha, kl_div = _compute_c_alpha_and_kl(mean_i, mean_j, cov, alpha=0.5)
            num += class_probs[jdx] * np.exp(-c_alpha)

            for kdx in range(N):
                mean_k = means[kdx]
                c_alpha, kl_div = _compute_c_alpha_and_kl(mean_i, mean_k, cov, alpha=0.5)
                denom += class_probs[jdx] * np.exp(-kl_div)

        H_YX_lb += class_probs[idx] * np.log(num / denom)
        H_YX_ub += class_probs[idx] * np.log(denom / num)

    I_lb = H_Y - H_YX_lb
    I_ub = H_Y - H_YX_ub
    return I_lb, I_ub


def _compute_c_alpha_and_kl(mean_i, mean_j, cov, alpha=0.5):
    mean_ij = mean_i - mean_j
    lambda_ij = mean_ij.T @ np.linalg.inv(cov) @ mean_ij
    return alpha * (1.0 - alpha) * lambda_ij / 2.0, lambda_ij / 2
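# Illustrative usage sketch (not part of the library source): calling the private bound
# helper for the same pair of 1-d Gaussians, which returns lower and upper bounds on
# I(X; Y) following the reference cited above.
#
#   means = [np.array([0.0]), np.array([1.0])]
#   covs = [np.eye(1), np.eye(1)]
#   I_lb, I_ub = _compute_mi_bounds(means, covs, class_probs=[0.5, 0.5])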