import numpy as np
import scipy.linalg
import scipy.special
import scipy.stats
[docs]
def simulate_helix(
radius_a=0,
radius_b=1,
obs_noise_func=None,
nature_noise_func=None,
alpha=0.005,
n_samples=1000,
return_mi_lb=False,
random_seed=None,
):
"""Simulate data from a helix.
Parameters
----------
radius_a : int, optional
The value of the smallest radius, by default 0.0.
radius_b : int, optional
The value of the largest radius, by default 1.0
obs_noise_func : Callable, optional
By default None, which defaults to a Uniform distribution from
(-0.005, 0.005). If passed in, then must be a callable that when
called returns a random number denoting the noise.
nature_noise_func : callable, optional
By defauult None, which will add no noise. The nature noise func
is just an independent noise term added to ``P`` before it is
passed to the generation of the X, Y, and Z terms.
alpha : float, optional
The value of the noise, by default 0.005.
n_samples : int, optional
Number of samples to generate, by default 1000.
return_mi_lb : bool, optional
Whether to return the mutual information lower bound, by default False.
random_seed : int, optional
The random seed.
Returns
-------
P : array-like of shape (n_samples,)
The sampled P.
X : array-like of shape (n_samples,)
The X dimension.
Y : array-like of shape (n_samples,)
The X dimension.
Z : array-like of shape (n_samples,)
The X dimension.
lb : float
The mutual information lower bound.
Notes
-----
Data is generated as follows: We first sample a radius that
defines the helix, :math:`R \\approx Unif(radius_a, radius_b)`.
Afterwards, we generate one sample as follows::
P = 5\\pi + 3\\pi R
X = (P + \\epsilon_1) cos(P + \\epsilon_1) / 8\\pi + N_1
Y = (P + \\epsilon_2) sin(P + \\epsilon_2) / 8\\pi + N_2
Z = (P + \\epsilon_3) / 8\\pi + N_3
where :math:`N_1,N_2,N_3` are noise variables that are independently
sampled for each sample point. And
:math:`\\epsilon_1, \\epsilon_2, \\epsilon_3` are "nature noise" terms
which are off by default. This process is repeated ``n_samples`` times.
Note, that this forms the graphical model::
R \\rightarrow P
P \\rightarrow X
P \\rightarrow Y
P \\rightarrow Z
such that P is a confounder among X, Y and Z. This implies that X, Y and Z
are conditionally dependent on P, whereas
"""
rng = np.random.default_rng(random_seed)
Radii = np.zeros((n_samples,))
P_arr = np.zeros((n_samples,))
X_arr = np.zeros((n_samples,))
Y_arr = np.zeros((n_samples,))
Z_arr = np.zeros((n_samples,))
if obs_noise_func is None:
obs_noise_func = lambda: rng.uniform(-alpha, alpha)
if nature_noise_func is None:
nature_noise_func = lambda: 0.0
for idx in range(n_samples):
Radii[idx] = rng.uniform(radius_a, radius_b)
P_arr[idx] = 5 * np.pi + 3 * np.pi * Radii[idx]
X_arr[idx] = (P_arr[idx] + nature_noise_func()) * np.cos(
P_arr[idx] + nature_noise_func()
) / (8 * np.pi) + obs_noise_func()
Y_arr[idx] = (P_arr[idx] + nature_noise_func()) * np.sin(
P_arr[idx] + nature_noise_func()
) / (8 * np.pi) + obs_noise_func()
Z_arr[idx] = (P_arr[idx] + nature_noise_func()) / (8 * np.pi) + obs_noise_func()
if return_mi_lb:
lb = alpha / 2 - np.log(alpha)
return P_arr, X_arr, Y_arr, Z_arr, lb
return P_arr, X_arr, Y_arr, Z_arr
[docs]
def simulate_sphere(
radius=1, noise_func=None, alpha=0.005, n_samples=1000, return_mi_lb=False, random_seed=None
):
"""Simulate samples generated on a sphere.
Parameters
----------
radius : int, optional
The radius of the sphere, by default 1.
noise_func : callable, optional
The noise function to call to add to samples, by default None,
which defaults to sampling from the uniform distribution [-alpha, alpha].
alpha : float, optional
The value of the noise, by default 0.005.
n_samples : int, optional
Number of samples to generate, by default 1000.
return_mi_lb : bool, optional
Whether to return the mutual information lower bound, by default False.
random_seed : int, optional
Random seed, by default None.
Returns
-------
latitude : float
Latitude.
longitude : float
Longitude.
Y1 : array-like of shape (n_samples,)
The X coordinate.
Y2 : array-like of shape (n_samples,)
The Y coordinate.
Y3 : array-like of shape (n_samples,)
The Z coordinate.
lb : float
The mutual information lower bound.
"""
rng = np.random.default_rng(random_seed)
if noise_func is None:
noise_func = lambda: rng.uniform(-alpha, alpha)
latitude = np.zeros((n_samples,))
longitude = np.zeros((n_samples,))
Y1 = np.zeros((n_samples,))
Y2 = np.zeros((n_samples,))
Y3 = np.zeros((n_samples,))
# sample latitude and longitude
for idx in range(n_samples):
# sample latitude and longitude
latitude[idx] = rng.uniform(0, 2 * np.pi)
longitude[idx] = np.arccos(1 - 2 * rng.uniform(0, 1))
Y1[idx] = np.sin(longitude[idx]) * np.cos(latitude[idx]) * radius + noise_func()
Y2[idx] = np.sin(longitude[idx]) * np.sin(latitude[idx]) * radius + noise_func()
Y3[idx] = np.cos(longitude[idx]) * radius + noise_func()
if return_mi_lb:
lb = alpha / 2 - np.log(alpha)
return latitude, longitude, Y1, Y2, Y3, lb
return latitude, longitude, Y1, Y2, Y3
[docs]
def simulate_multivariate_gaussian(mean=None, cov=None, d=2, n_samples=1000, seed=1234):
"""Multivariate gaussian simulation for testing entropy and MI estimators.
Simulates samples from a "known" multivariate gaussian distribution
and then passes those samples, along with the true analytical MI/CMI.
Parameters
----------
mean : array-like of shape (n_features,)
The optional mean array. If None (default), a random standard normal vector is drawn.
cov : array-like of shape (n_features, n_features)
The covariance array. If None (default), a random standard normal 2D array is drawn.
It is then converted to a PD matrix.
d : int
The dimensionality of the multivariate gaussian. By default 2.
n_samples : int
The number of samples to generate. By default 1000.
seed : int
The random seed to feed to :func:`numpy.random.default_rng`.
Returns
-------
data : array-like of shape (n_samples, n_features)
The generated data from the distribution.
mean : array-like of shape (n_features,)
The mean vector of the distribution.
cov : array-like of shape (n_features, n_features)
The covariance matrix of the distribution.
"""
rng = np.random.default_rng(seed)
if mean is None:
mean = rng.normal(size=(d,))
if cov is None:
# generate random covariance matrix and enure it is symmetric and positive-definite
cov = rng.normal(size=(d, d))
cov = 0.5 * (cov.dot(cov.T))
if not np.all(np.linalg.eigvals(cov) > 0):
raise RuntimeError("Passed in covariance matrix should be positive definite")
if not scipy.linalg.issymmetric(cov):
raise RuntimeError(f"Passed in covariance matrix {cov} should be symmetric")
if len(mean) != d or len(cov) != d:
raise RuntimeError(f"Dimensionality of mean and covariance matrix should match {d}")
data = rng.multivariate_normal(mean=mean, cov=cov, size=(n_samples))
return data, mean, cov