import numpy as np
from scipy.stats import multivariate_normal, entropy
from scipy.integrate import nquad
def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None):
"""Simulate classification data from a quadratic model.
This is a form of the simulation used in :footcite:`panda2018learning`.
Parameters
----------
n_samples : int
The number of samples to generate.
n_features : int
The number of dimensions in the dataset.
noise : bool, optional
Whether or not to add noise, by default False.
seed : int, optional
Random seed, by default None.
Returns
-------
x : array-like, shape (2 * n_samples, n_features)
Data array.
v : array-like, shape (2 * n_samples, 1)
Target array of 1's and 0's.
References
----------
.. footbibliography::
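Examples
--------
Generate a small dataset and inspect the shapes of the returned arrays:

>>> x, v = make_quadratic_classification(100, 5, noise=True, seed=0)
>>> x.shape
(200, 5)
>>> v.shape
(200, 1)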
"""
rng = np.random.default_rng(seed)
x = rng.standard_normal(size=(n_samples, n_features))
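# Scale each dimension by an exponentially decaying coefficient, then square
# the scaled features to form the second (quadratic) class; `noise * eps`
# contributes Gaussian noise only when noise=True.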
coeffs = np.array([np.exp(-0.0325 * (i + 24)) for i in range(n_features)])
eps = rng.standard_normal(size=(n_samples, n_features))
x_coeffs = x * coeffs
y = x_coeffs**2 + noise * eps
# generate the classification labels
n1 = x.shape[0]
n2 = y.shape[0]
v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
x = np.vstack((x, y))
return x, v
def make_trunk_classification(
n_samples,
n_dim=10,
n_informative=10,
m_factor: int = -1,
rho: float = 0,
band_type: str = "ma",
return_params: bool = False,
mix: float = 0,
seed=None,
):
"""Generate trunk dataset.
For each informative dimension :math:`d` (indexed from 1), the first distribution
has a mean of :math:`1 / \sqrt{d}`. The covariance is the identity matrix.
The second distribution's mean vector is the first scaled by ``m_factor``
(the negative of the first by default).
As :math:`d` increases, the means of the two distributions become closer and closer.
See full details in :footcite:`trunk1982`.
Instead of the identity covariance matrix, one can implement a banded covariance matrix
that follows :footcite:`Bickel_2008`.
Parameters
----------
n_samples : int
The number of samples to generate.
n_dim : int, optional
The dimensionality of the dataset, by default 10.
n_informative : int, optional
The number of informative dimensions, by default 10. The remaining
``n_dim - n_informative`` dimensions are filled with uniform noise.
m_factor : int, optional
The multiplicative factor to apply to the mean-vector of the first
distribution to obtain the mean-vector of the second distribution.
By default -1.
rho : float, optional
The covariance value of the bands. By default 0, indicating that an identity covariance matrix is used.
band_type : str, optional
The band type to use, either 'ma' or 'ar', by default 'ma'. For details,
see Examples 1 and 2 in :footcite:`Bickel_2008`.
return_params : bool, optional
Whether or not to return the distribution parameters of the classes' normal distributions, by default False.
mix : float, optional
The fraction with which to mix the two Gaussians for the second class. Must be a value between 0 and 1, by default 0 (no mixing).
seed : int, optional
Random seed, by default None.
Returns
-------
X : np.ndarray of shape (n_samples, n_dim), dtype=np.float64
Trunk dataset as a dense array.
y : np.ndarray of shape (n_samples,), dtype=np.intp
Labels of the dataset.
means : list of ArrayLike of shape (n_dim,), dtype=np.float64
The mean vector for each class starting with class 0.
Returned if ``return_params`` is True.
covs : list of ArrayLike of shape (n_dim, n_dim), dtype=np.float64
The covariance for each class. Returned if ``return_params`` is True.
References
----------
.. footbibliography::
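Examples
--------
Generate a two-class dataset with five informative and five noise dimensions:

>>> X, y = make_trunk_classification(100, n_dim=10, n_informative=5, seed=0)
>>> X.shape
(100, 10)
>>> y.shape
(100,)

Pass ``return_params=True`` to also obtain the class means and covariances:

>>> X, y, means, covs = make_trunk_classification(
...     100, n_dim=5, n_informative=5, return_params=True, seed=0
... )
>>> len(means), len(covs)
(2, 2)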
"""
if n_dim < n_informative:
raise ValueError(
f"Number of informative dimensions {n_informative} must be less than number "
f"of dimensions, {n_dim}"
)
rng = np.random.default_rng(seed=seed)
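# Informative means follow the Trunk construction: 1 / sqrt(d) for d = 1, ..., n_informative.
# The other class's mean is the same vector scaled by m_factor (negated by default).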
mu_1 = np.array([1 / np.sqrt(i) for i in range(1, n_informative + 1)])
mu_0 = m_factor * mu_1
if rho != 0:
if band_type == "ma":
cov = _moving_avg_cov(n_informative, rho)
elif band_type == "ar":
cov = _autoregressive_cov(n_informative, rho)
else:
raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
else:
cov = np.identity(n_informative)
if mix < 0 or mix > 1:
raise ValueError("Mix must be between 0 and 1.")
if n_informative > 1000:
method = "cholesky"
else:
method = "svd"
if mix == 0:
X = np.vstack(
(
rng.multivariate_normal(mu_0, cov, n_samples // 2, method=method),
rng.multivariate_normal(mu_1, cov, n_samples // 2, method=method),
)
)
else:
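# When mixing, the first class is drawn from a zero-mean Gaussian, while each
# sample of the second class comes from the mu_0 Gaussian with probability
# `mix` and from the mu_1 Gaussian otherwise.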
mixture_idx = rng.choice(
[0, 1], n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix]
)
X_mixture = np.zeros((n_samples // 2, len(mu_1)))
for idx in range(n_samples // 2):
if mixture_idx[idx] == 1:
X_sample = rng.multivariate_normal(mu_1, cov, 1, method=method)
else:
X_sample = rng.multivariate_normal(mu_0, cov, 1, method=method)
X_mixture[idx, :] = X_sample
X = np.vstack(
(
rng.multivariate_normal(
np.zeros(n_informative), cov, n_samples // 2, method=method
),
X_mixture,
)
)
if n_dim > n_informative:
X = np.hstack((X, rng.uniform(low=0, high=1, size=(X.shape[0], n_dim - n_informative))))
y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))
if return_params:
return X, y, [mu_0, mu_1], [cov, cov]
return X, y
def _moving_avg_cov(n_dim, rho):
# Create a meshgrid of indices
i, j = np.meshgrid(np.arange(1, n_dim + 1), np.arange(1, n_dim + 1), indexing="ij")
# Covariance decays as rho^|i - j|
cov_matrix = rho ** np.abs(i - j)
# Band the matrix: zero out entries beyond the first off-diagonal
cov_matrix[abs(i - j) > 1] = 0
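# For example, with n_dim=4 and rho=0.5 the banded covariance is
#   [[1. , 0.5, 0. , 0. ],
#    [0.5, 1. , 0.5, 0. ],
#    [0. , 0.5, 1. , 0.5],
#    [0. , 0. , 0.5, 1. ]]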
return cov_matrix
def _autoregressive_cov(n_dim, rho):
# Create a meshgrid of indices
i, j = np.meshgrid(np.arange(1, n_dim + 1), np.arange(1, n_dim + 1), indexing="ij")
# Covariance decays as rho^|i - j| with no banding (autoregressive structure)
cov_matrix = rho ** np.abs(i - j)
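# For example, with n_dim=4 and rho=0.5 the autoregressive covariance is
#   [[1.   , 0.5  , 0.25 , 0.125],
#    [0.5  , 1.   , 0.5  , 0.25 ],
#    [0.25 , 0.5  , 1.   , 0.5  ],
#    [0.125, 0.25 , 0.5  , 1.   ]]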
return cov_matrix
def _compute_mi_bounds(means, covs, class_probs):
# compute bounds using https://arxiv.org/pdf/2101.11670.pdf
prior_y = np.array(class_probs)
H_Y = entropy(prior_y, base=np.exp(1))
# number of mixtures
N = len(class_probs)
cov = covs[0]
H_YX_lb = 0.0
H_YX_ub = 0.0
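# For each component i, `num` accumulates prior-weighted exp(-Chernoff-alpha)
# terms and `denom` accumulates prior-weighted exp(-KL) terms over all
# components; their prior-weighted log-ratios bound the conditional entropy H(Y|X).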
for idx in range(N):
num = 0.0
denom = 0.0
mean_i = means[idx]
for jdx in range(N):
mean_j = means[jdx]
c_alpha, kl_div = _compute_c_alpha_and_kl(mean_i, mean_j, cov, alpha=0.5)
num += class_probs[jdx] * np.exp(-c_alpha)
for kdx in range(N):
mean_k = means[kdx]
c_alpha, kl_div = _compute_c_alpha_and_kl(mean_i, mean_k, cov, alpha=0.5)
denom += class_probs[kdx] * np.exp(-kl_div)
H_YX_lb += class_probs[idx] * np.log(num / denom)
H_YX_ub += class_probs[idx] * np.log(denom / num)
I_lb = H_Y - H_YX_lb
I_ub = H_Y - H_YX_ub
return I_lb, I_ub
def _compute_c_alpha_and_kl(mean_i, mean_j, cov, alpha=0.5):
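# For two Gaussians sharing a covariance matrix, the KL divergence has the
# closed form lambda_ij / 2, where lambda_ij is the squared Mahalanobis
# distance between the means, and the Chernoff-alpha exponent is
# alpha * (1 - alpha) * lambda_ij / 2.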
mean_ij = mean_i - mean_j
lambda_ij = mean_ij.T @ np.linalg.inv(cov) @ mean_ij
return alpha * (1.0 - alpha) * lambda_ij / 2.0, lambda_ij / 2