Source code for sktree.experimental.simulate

import numpy as np
import scipy.linalg
import scipy.special
import scipy.stats


[docs]def simulate_helix( radius_a=0, radius_b=1, obs_noise_func=None, nature_noise_func=None, alpha=0.005, n_samples=1000, return_mi_lb=False, random_seed=None, ): """Simulate data from a helix. Parameters ---------- radius_a : int, optional The value of the smallest radius, by default 0.0. radius_b : int, optional The value of the largest radius, by default 1.0 obs_noise_func : Callable, optional By default None, which defaults to a Uniform distribution from (-0.005, 0.005). If passed in, then must be a callable that when called returns a random number denoting the noise. nature_noise_func : callable, optional By defauult None, which will add no noise. The nature noise func is just an independent noise term added to ``P`` before it is passed to the generation of the X, Y, and Z terms. alpha : float, optional The value of the noise, by default 0.005. n_samples : int, optional Number of samples to generate, by default 1000. return_mi_lb : bool, optional Whether to return the mutual information lower bound, by default False. random_seed : int, optional The random seed. Returns ------- P : array-like of shape (n_samples,) The sampled P. X : array-like of shape (n_samples,) The X dimension. Y : array-like of shape (n_samples,) The X dimension. Z : array-like of shape (n_samples,) The X dimension. lb : float The mutual information lower bound. Notes ----- Data is generated as follows: We first sample a radius that defines the helix, :math:`R \\approx Unif(radius_a, radius_b)`. Afterwards, we generate one sample as follows:: P = 5\\pi + 3\\pi R X = (P + \\epsilon_1) cos(P + \\epsilon_1) / 8\\pi + N_1 Y = (P + \\epsilon_2) sin(P + \\epsilon_2) / 8\\pi + N_2 Z = (P + \\epsilon_3) / 8\\pi + N_3 where :math:`N_1,N_2,N_3` are noise variables that are independently sampled for each sample point. And :math:`\\epsilon_1, \\epsilon_2, \\epsilon_3` are "nature noise" terms which are off by default. This process is repeated ``n_samples`` times. Note, that this forms the graphical model:: R \\rightarrow P P \\rightarrow X P \\rightarrow Y P \\rightarrow Z such that P is a confounder among X, Y and Z. This implies that X, Y and Z are conditionally dependent on P, whereas """ rng = np.random.default_rng(random_seed) Radii = np.zeros((n_samples,)) P_arr = np.zeros((n_samples,)) X_arr = np.zeros((n_samples,)) Y_arr = np.zeros((n_samples,)) Z_arr = np.zeros((n_samples,)) if obs_noise_func is None: obs_noise_func = lambda: rng.uniform(-alpha, alpha) if nature_noise_func is None: nature_noise_func = lambda: 0.0 for idx in range(n_samples): Radii[idx] = rng.uniform(radius_a, radius_b) P_arr[idx] = 5 * np.pi + 3 * np.pi * Radii[idx] X_arr[idx] = (P_arr[idx] + nature_noise_func()) * np.cos( P_arr[idx] + nature_noise_func() ) / (8 * np.pi) + obs_noise_func() Y_arr[idx] = (P_arr[idx] + nature_noise_func()) * np.sin( P_arr[idx] + nature_noise_func() ) / (8 * np.pi) + obs_noise_func() Z_arr[idx] = (P_arr[idx] + nature_noise_func()) / (8 * np.pi) + obs_noise_func() if return_mi_lb: lb = alpha / 2 - np.log(alpha) return P_arr, X_arr, Y_arr, Z_arr, lb return P_arr, X_arr, Y_arr, Z_arr
[docs]def simulate_sphere( radius=1, noise_func=None, alpha=0.005, n_samples=1000, return_mi_lb=False, random_seed=None ): """Simulate samples generated on a sphere. Parameters ---------- radius : int, optional The radius of the sphere, by default 1. noise_func : callable, optional The noise function to call to add to samples, by default None, which defaults to sampling from the uniform distribution [-alpha, alpha]. alpha : float, optional The value of the noise, by default 0.005. n_samples : int, optional Number of samples to generate, by default 1000. return_mi_lb : bool, optional Whether to return the mutual information lower bound, by default False. random_seed : int, optional Random seed, by default None. Returns ------- latitude : float Latitude. longitude : float Longitude. Y1 : array-like of shape (n_samples,) The X coordinate. Y2 : array-like of shape (n_samples,) The Y coordinate. Y3 : array-like of shape (n_samples,) The Z coordinate. lb : float The mutual information lower bound. """ rng = np.random.default_rng(random_seed) if noise_func is None: noise_func = lambda: rng.uniform(-alpha, alpha) latitude = np.zeros((n_samples,)) longitude = np.zeros((n_samples,)) Y1 = np.zeros((n_samples,)) Y2 = np.zeros((n_samples,)) Y3 = np.zeros((n_samples,)) # sample latitude and longitude for idx in range(n_samples): # sample latitude and longitude latitude[idx] = rng.uniform(0, 2 * np.pi) longitude[idx] = np.arccos(1 - 2 * rng.uniform(0, 1)) Y1[idx] = np.sin(longitude[idx]) * np.cos(latitude[idx]) * radius + noise_func() Y2[idx] = np.sin(longitude[idx]) * np.sin(latitude[idx]) * radius + noise_func() Y3[idx] = np.cos(longitude[idx]) * radius + noise_func() if return_mi_lb: lb = alpha / 2 - np.log(alpha) return latitude, longitude, Y1, Y2, Y3, lb return latitude, longitude, Y1, Y2, Y3
[docs]def simulate_multivariate_gaussian(mean=None, cov=None, d=2, n_samples=1000, seed=1234): """Multivariate gaussian simulation for testing entropy and MI estimators. Simulates samples from a "known" multivariate gaussian distribution and then passes those samples, along with the true analytical MI/CMI. Parameters ---------- mean : array-like of shape (n_features,) The optional mean array. If None (default), a random standard normal vector is drawn. cov : array-like of shape (n_features, n_features) The covariance array. If None (default), a random standard normal 2D array is drawn. It is then converted to a PD matrix. d : int The dimensionality of the multivariate gaussian. By default 2. n_samples : int The number of samples to generate. By default 1000. seed : int The random seed to feed to :func:`numpy.random.default_rng`. Returns ------- data : array-like of shape (n_samples, n_features) The generated data from the distribution. mean : array-like of shape (n_features,) The mean vector of the distribution. cov : array-like of shape (n_features, n_features) The covariance matrix of the distribution. """ rng = np.random.default_rng(seed) if mean is None: mean = rng.normal(size=(d,)) if cov is None: # generate random covariance matrix and enure it is symmetric and positive-definite cov = rng.normal(size=(d, d)) cov = 0.5 * (cov.dot(cov.T)) if not np.all(np.linalg.eigvals(cov) > 0): raise RuntimeError("Passed in covariance matrix should be positive definite") if not scipy.linalg.issymmetric(cov): raise RuntimeError(f"Passed in covariance matrix {cov} should be symmetric") if len(mean) != d or len(cov) != d: raise RuntimeError(f"Dimensionality of mean and covariance matrix should match {d}") data = rng.multivariate_normal(mean=mean, cov=cov, size=(n_samples)) return data, mean, cov