import warnings

import numpy as np
from scipy.integrate import nquad
from scipy.stats import entropy, multivariate_normal


def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None):
    """Simulate classification data from a quadratic model.

    This is a form of the simulation used in :footcite:`panda2018learning`.

    Parameters
    ----------
    n_samples : int
        The number of samples to generate.
    n_features : int
        The number of dimensions in the dataset.
    noise : bool, optional
        Whether or not to add noise, by default False.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    x : array-like, shape (2 * n_samples, n_features)
        Data array.
    v : array-like, shape (2 * n_samples, 1)
        Target array of 1's and 0's.

    References
    ----------
    .. footbibliography::
    """
    rng = np.random.default_rng(seed)

    x = rng.standard_normal(size=(n_samples, n_features))
    coeffs = np.array([np.exp(-0.0325 * (i + 24)) for i in range(n_features)])
    eps = rng.standard_normal(size=(n_samples, n_features))

    x_coeffs = x * coeffs
    y = x_coeffs**2 + noise * eps

    # generate the classification labels
    n1 = x.shape[0]
    n2 = y.shape[0]
    v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
    x = np.vstack((x, y))
    return x, v
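
# Illustrative usage of ``make_quadratic_classification`` (a minimal sketch;
# the argument values are arbitrary, not recommendations):
#
#     >>> x, v = make_quadratic_classification(100, 5, noise=True, seed=0)
#     >>> x.shape, v.shape
#     ((200, 5), (200, 1))
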
# Dictionary of simulations from Marron and Wand 1992.
# keys: names of each simulation corresponding to the class MarronWandSims
# values: probabilities associated with the mixture of Gaussians
MARRON_WAND_SIMS = {
    "gaussian": [1],
    "skewed_unimodal": [1 / 5, 1 / 5, 3 / 5],
    "strongly_skewed": [1 / 8] * 8,
    "kurtotic_unimodal": [2 / 3, 1 / 3],
    "outlier": [1 / 10, 9 / 10],
    "bimodal": [1 / 2] * 2,
    "separated_bimodal": [1 / 2] * 2,
    "skewed_bimodal": [3 / 4, 1 / 4],
    "trimodal": [9 / 20, 9 / 20, 1 / 10],
    "claw": [1 / 2, *[1 / 10] * 5],
    "double_claw": [49 / 100, 49 / 100, *[1 / 350] * 7],
    "asymmetric_claw": [1 / 2, *[2 ** (1 - i) / 31 for i in range(-2, 3)]],
    "asymmetric_double_claw": [*[46 / 100] * 2, *[1 / 300] * 3, *[7 / 300] * 3],
    "smooth_comb": [2 ** (5 - i) / 63 for i in range(6)],
    "discrete_comb": [*[2 / 7] * 3, *[1 / 21] * 3],
}
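
# Each value in MARRON_WAND_SIMS is a discrete probability vector over mixture
# components, so every entry should sum to 1. A quick sanity check (an
# illustrative sketch, not part of the library API):
#
#     >>> all(abs(sum(p) - 1) < 1e-12 for p in MARRON_WAND_SIMS.values())
#     True
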
def make_marron_wand_classification(
    n_samples,
    n_dim=4096,
    n_informative=256,
    simulation: str = "gaussian",
    rho: float = 0,
    band_type: str = "ma",
    return_params: bool = False,
    scaling_factor: float = 1.0,
    seed=None,
):
    r"""Generate Marron-Wand binary classification dataset.

    The simulation is similar to that of
    :func:`sktree.datasets.make_trunk_classification`, where the first class
    is generated from a multivariate Gaussian with mean vector of 0's. The
    second class is generated from a mixture of Gaussians with mean vectors
    specified by the Marron-Wand simulations, but as the dimensionality
    increases, the second class distribution approaches the first class
    distribution by a factor of :math:`1 / \sqrt{d}`.

    Full details for the Marron-Wand simulations can be found in
    :footcite:`marron1992exact`.

    Instead of the identity covariance matrix, one can implement a banded
    covariance matrix that follows :footcite:`Bickel_2008`.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate. Must be an even number, else the total
        number of samples generated will be ``n_samples - 1``.
    n_dim : int, optional
        The dimensionality of the dataset, by default 4096.
    n_informative : int, optional
        The number of informative dimensions. The remaining
        ``n_dim - n_informative`` dimensions are Gaussian noise. Default is 256.
    simulation : str, optional
        Which simulation to run. Must be one of the following Marron-Wand
        simulations: 'gaussian', 'skewed_unimodal', 'strongly_skewed',
        'kurtotic_unimodal', 'outlier', 'bimodal', 'separated_bimodal',
        'skewed_bimodal', 'trimodal', 'claw', 'double_claw',
        'asymmetric_claw', 'asymmetric_double_claw', 'smooth_comb',
        'discrete_comb'. When calling the Marron-Wand simulations, only the
        covariance parameters are considered (``rho`` and ``band_type``).
        Means are taken from :footcite:`marron1992exact`.
        By default 'gaussian'.
    rho : float, optional
        The covariance value of the bands. By default 0, indicating an
        identity matrix is used.
    band_type : str
        The band type to use. For details, see Example 1 and 2 in
        :footcite:`Bickel_2008`. Either 'ma', or 'ar'.
    return_params : bool, optional
        Whether or not to return the distribution parameters of the classes'
        normal distributions.
    scaling_factor : float, optional
        The scaling factor for the covariance matrix. By default 1.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_dim), dtype=np.float64
        Trunk dataset as a dense array.
    y : np.ndarray of shape (n_samples,), dtype=np.intp
        Labels of the dataset.
    means : tuple of ArrayLike of shape (n_informative,), dtype=np.float64
        The mean vector of each Gaussian in the mixture.
        Returned if ``return_params`` is True.
    covs : tuple of ArrayLike of shape (n_informative, n_informative), dtype=np.float64
        The covariance of each Gaussian in the mixture.
        Returned if ``return_params`` is True.
    G : np.ndarray of shape (n_samples // 2, n_informative), dtype=np.float64
        The mixture of Gaussians for the Marron-Wand simulations.
        Returned if ``return_params`` is True.
    w : np.ndarray of shape (n_informative,), dtype=np.float64
        The weight vector for the Marron-Wand simulations.
        Returned if ``return_params`` is True.

    Notes
    -----
    **Marron-Wand Simulations**: The Marron-Wand simulations generate two
    classes of data with the setup specified in the paper.

    Covariance: The covariance matrix among different dimensions is controlled
    by the ``rho`` parameter and the ``band_type`` parameter. The
    ``band_type`` parameter controls the type of band to use, while the
    ``rho`` parameter controls the specific scaling factor for the covariance
    matrix while going from one dimension to the next.

    The first class is sampled from a multivariate Gaussian with mean vector
    of zeros. The second class blends a zero-mean Gaussian with the
    Marron-Wand mixture using a per-dimension weight of :math:`1 / \sqrt{i}`
    for dimension :math:`i`, so the two class distributions become closer and
    closer as the dimensionality increases.

    References
    ----------
    .. footbibliography::
    """
    if n_dim < n_informative:
        n_informative = n_dim
        warnings.warn(
            f"Number of informative dimensions {n_informative} must be less than number "
            f"of dimensions, {n_dim}. Setting n_informative to n_dim.",
            RuntimeWarning,
        )

    if simulation not in MARRON_WAND_SIMS.keys():
        raise ValueError(f"Simulation must be one of {list(MARRON_WAND_SIMS.keys())}.")

    rng = np.random.default_rng(seed=seed)

    if rho != 0:
        if band_type == "ma":
            cov = _moving_avg_cov(n_informative, rho)
        elif band_type == "ar":
            cov = _autoregressive_cov(n_informative, rho)
        else:
            raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
    else:
        cov = np.identity(n_informative)

    # allow arbitrary uniform scaling of the covariance matrix
    cov = scaling_factor * cov

    # for large multivariate normal sampling problems, Cholesky factorization
    # is faster than the default SVD factorization
    if n_informative > 1000:
        mvg_sampling_method = "cholesky"
    else:
        mvg_sampling_method = "svd"

    mixture_idx = rng.choice(
        len(MARRON_WAND_SIMS[simulation]),  # type: ignore
        size=n_samples // 2,
        replace=True,
        p=MARRON_WAND_SIMS[simulation],
    )
    # the parameters used for each Gaussian in the mixture for each Marron-Wand simulation
    norm_params = MarronWandSims(n_dim=n_informative, cov=cov)(simulation)
    G = np.fromiter(
        (
            rng_child.multivariate_normal(*(norm_params[i]), size=1, method=mvg_sampling_method)
            for i, rng_child in zip(mixture_idx, rng.spawn(n_samples // 2))
        ),
        dtype=np.dtype((float, n_informative)),
    )

    # as the dimensionality of the simulations increases, we add more and more
    # noise to the data using the w parameter
    w_vec = np.array([1.0 / np.sqrt(i) for i in range(1, n_informative + 1)])

    # create new generator instances to ensure reproducibility across multiple
    # runs with the same seed
    rng_F = np.random.default_rng(seed=seed).spawn(2)

    X = np.vstack(
        (
            rng_F[0].multivariate_normal(
                np.zeros(n_informative), cov, n_samples // 2, method=mvg_sampling_method
            ),
            (1 - w_vec)
            * rng_F[1].multivariate_normal(
                np.zeros(n_informative), cov, n_samples // 2, method=mvg_sampling_method
            )
            + w_vec * G.reshape(n_samples // 2, n_informative),
        )
    )

    if n_dim > n_informative:
        # create new generator instance to ensure reproducibility across
        # multiple runs with the same seed
        rng_noise = np.random.default_rng(seed=seed)
        X = np.hstack(
            (
                X,
                np.hstack(
                    [
                        rng_child.normal(loc=0, scale=1, size=(X.shape[0], 1))
                        for rng_child in rng_noise.spawn(n_dim - n_informative)
                    ]
                ),
            )
        )

    y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

    if return_params:
        return [X, y, *list(zip(*norm_params)), G, w_vec]
    return X, y
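
# Illustrative usage of ``make_marron_wand_classification`` (a minimal
# sketch; the argument values are arbitrary):
#
#     >>> X, y = make_marron_wand_classification(
#     ...     n_samples=100, n_dim=10, n_informative=10, simulation="bimodal", seed=0
#     ... )
#     >>> X.shape, y.shape
#     ((100, 10), (100,))
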
def make_trunk_mixture_classification(
    n_samples,
    n_dim=4096,
    n_informative=256,
    mu_0: int = 0,
    mu_1: int = 1,
    rho: float = 0,
    band_type: str = "ma",
    return_params: bool = False,
    mix: float = 0.5,
    scaling_factor: float = 1.0,
    seed=None,
):
    r"""Generate trunk mixture binary classification dataset.

    The first class is generated from a multivariate Gaussian with mean
    vector of 0's. The second class is generated from a mixture of Gaussians
    with mean vectors specified by ``mu_0`` and ``mu_1``. The mixture is
    specified by the ``mix`` parameter, which is the probability of the first
    Gaussian in the mixture.

    The mean of each informative dimension decreases by a factor of
    :math:`1 / \sqrt{i}` for dimension :math:`i`, so as the dimensionality
    increases, the distributions become closer and closer. Full details for
    the trunk simulation can be found in :footcite:`trunk1982`.

    Instead of the identity covariance matrix, one can implement a banded
    covariance matrix that follows :footcite:`Bickel_2008`.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate. Must be an even number, else the total
        number of samples generated will be ``n_samples - 1``.
    n_dim : int, optional
        The dimensionality of the dataset, by default 4096.
    n_informative : int, optional
        The number of informative dimensions. The remaining
        ``n_dim - n_informative`` dimensions are Gaussian noise. Default is 256.
    mu_0 : int, optional
        The mean of the first distribution. By default 0. The mean of the
        distribution will decrease by a factor of ``sqrt(i)`` for each
        dimension ``i``.
    mu_1 : int, optional
        The mean of the second distribution. By default 1. The mean of the
        distribution will decrease by a factor of ``sqrt(i)`` for each
        dimension ``i``.
    rho : float, optional
        The covariance value of the bands. By default 0, indicating an
        identity matrix is used.
    band_type : str
        The band type to use. For details, see Example 1 and 2 in
        :footcite:`Bickel_2008`. Either 'ma', or 'ar'.
    return_params : bool, optional
        Whether or not to return the distribution parameters of the classes'
        normal distributions.
    mix : float, optional
        The probability of the first Gaussian in the mixture for the
        ``trunk-mix`` simulation. By default 0.5.
    scaling_factor : float, optional
        The scaling factor for the covariance matrix. By default 1.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_dim), dtype=np.float64
        Trunk dataset as a dense array.
    y : np.ndarray of shape (n_samples,), dtype=np.intp
        Labels of the dataset.
    means : list of ArrayLike of shape (n_dim,), dtype=np.float64
        The mean vector for each class starting with class 0.
        Returned if ``return_params`` is True.
    covs : list of ArrayLike of shape (n_dim, n_dim), dtype=np.float64
        The covariance for each class. Returned if ``return_params`` is True.
    X_mixture : np.ndarray of shape (n_samples, n_dim), dtype=np.float64
        The mixture of Gaussians. Returned if ``return_params`` is True.

    Notes
    -----
    **Trunk**: The trunk simulation decreases the signal-to-noise ratio as the
    dimensionality increases. This is implemented by decreasing the mean of
    the distribution by a factor of ``sqrt(i)`` for each dimension ``i``.
    Thus for instance if the means of distribution one and two are 1 and -1
    respectively, the means for the first dimension will be 1 and -1, for the
    second dimension will be 1/sqrt(2) and -1/sqrt(2), and so on.

    **Trunk Mix**: The trunk mix simulation generates two classes of data
    with the same covariance matrix. The first class (label 0) is generated
    from a multivariate Gaussian with mean vector of zeros and the second
    class is generated from a mixture of Gaussians with mean vectors
    specified by ``mu_0`` and ``mu_1``. The mixture is specified by the
    ``mix`` parameter, which is the probability of the first Gaussian in the
    mixture.

    Covariance: The covariance matrix among different dimensions is
    controlled by the ``rho`` parameter and the ``band_type`` parameter. The
    ``band_type`` parameter controls the type of band to use, while the
    ``rho`` parameter controls the specific scaling factor for the covariance
    matrix while going from one dimension to the next.

    References
    ----------
    .. footbibliography::
    """
    if n_dim < n_informative:
        n_informative = n_dim
        warnings.warn(
            f"Number of informative dimensions {n_informative} must be less than number "
            f"of dimensions, {n_dim}. Setting n_informative to n_dim.",
            RuntimeWarning,
        )
    if mix < 0 or mix > 1:  # type: ignore
        raise ValueError("Mix must be between 0 and 1.")

    rng = np.random.default_rng(seed=seed)

    mu_1_vec = np.array([mu_1 / np.sqrt(i) for i in range(1, n_informative + 1)])
    mu_0_vec = np.array([mu_0 / np.sqrt(i) for i in range(1, n_informative + 1)])

    if rho != 0:
        if band_type == "ma":
            cov = _moving_avg_cov(n_informative, rho)
        elif band_type == "ar":
            cov = _autoregressive_cov(n_informative, rho)
        else:
            raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
    else:
        cov = np.identity(n_informative)

    # Note: when the variance is 1, trunk-mix does not look bimodal at low
    # dimensions. Set ``scaling_factor`` to (2/3)**2 for consistency with the
    # Marron-Wand bimodal simulation.
    cov = scaling_factor * cov

    # for large multivariate normal sampling problems, Cholesky factorization
    # is faster than the default SVD factorization
    if n_informative > 1000:
        method = "cholesky"
    else:
        method = "svd"

    mixture_idx = rng.choice(2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix])  # type: ignore
    norm_params = [[mu_0_vec, cov], [mu_1_vec, cov]]
    dim_sample = len(norm_params[0][0])  # dimensionality of the samples
    X_mixture = np.empty((n_samples // 2, dim_sample))  # pre-allocate array for samples
    for idx, (i, rng_child) in enumerate(zip(mixture_idx, rng.spawn(n_samples // 2))):
        mean, mix_cov = norm_params[i]
        X_mixture[idx, :] = rng_child.multivariate_normal(mean, mix_cov, size=1, method=method)

    # create new generator instance to ensure reproducibility across multiple
    # runs with the same seed
    rng_F = np.random.default_rng(seed=seed)
    X = np.vstack(
        (
            rng_F.multivariate_normal(np.zeros(n_informative), cov, n_samples // 2, method=method),
            X_mixture.reshape(n_samples // 2, n_informative),
        )
    )

    if n_dim > n_informative:
        # create new generator instance to ensure reproducibility across
        # multiple runs with the same seed
        rng_noise = np.random.default_rng(seed=seed)
        X = np.hstack(
            (
                X,
                np.hstack(
                    [
                        rng_child.normal(loc=0, scale=1, size=(X.shape[0], 1))
                        for rng_child in rng_noise.spawn(n_dim - n_informative)
                    ]
                ),
            )
        )

    y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

    if return_params:
        return [X, y, *list(zip(*norm_params)), X_mixture]
    return X, y
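
# Illustrative usage of ``make_trunk_mixture_classification`` (a minimal
# sketch; the argument values are arbitrary). The last 5 of the 10 dimensions
# here are pure Gaussian noise:
#
#     >>> X, y = make_trunk_mixture_classification(
#     ...     n_samples=100, n_dim=10, n_informative=5, mu_1=5, mix=0.75, seed=0
#     ... )
#     >>> X.shape, y.shape
#     ((100, 10), (100,))
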
def make_trunk_classification(
    n_samples,
    n_dim=4096,
    n_informative=256,
    mu_0: int = 0,
    mu_1: int = 1,
    rho: float = 0,
    band_type: str = "ma",
    return_params: bool = False,
    scaling_factor: float = 1.0,
    seed=None,
):
    r"""Generate trunk binary classification dataset.

    For each dimension :math:`i` in the first distribution, there is a mean
    of :math:`1 / \sqrt{i}`. The covariance is the identity matrix. The
    second distribution has a mean vector that is the negative of the first.
    As the dimensionality increases, the two distributions become closer and
    closer. Full details for the trunk simulation can be found in
    :footcite:`trunk1982`.

    Instead of the identity covariance matrix, one can implement a banded
    covariance matrix that follows :footcite:`Bickel_2008`.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate. Must be an even number, else the total
        number of samples generated will be ``n_samples - 1``.
    n_dim : int, optional
        The dimensionality of the dataset, by default 4096.
    n_informative : int, optional
        The number of informative dimensions. The remaining
        ``n_dim - n_informative`` dimensions are Gaussian noise. Default is 256.
    mu_0 : int, optional
        The mean of the first distribution. By default 0. The mean of the
        distribution will decrease by a factor of ``sqrt(i)`` for each
        dimension ``i``.
    mu_1 : int, optional
        The mean of the second distribution. By default 1. The mean of the
        distribution will decrease by a factor of ``sqrt(i)`` for each
        dimension ``i``.
    rho : float, optional
        The covariance value of the bands. By default 0, indicating an
        identity matrix is used.
    band_type : str
        The band type to use. For details, see Example 1 and 2 in
        :footcite:`Bickel_2008`. Either 'ma', or 'ar'.
    return_params : bool, optional
        Whether or not to return the distribution parameters of the classes'
        normal distributions. By default False.
    scaling_factor : float, optional
        The scaling factor for the covariance matrix. By default 1.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_dim), dtype=np.float64
        Trunk dataset as a dense array.
    y : np.ndarray of shape (n_samples,), dtype=np.intp
        Labels of the dataset.
    means : list of ArrayLike of shape (n_dim,), dtype=np.float64
        The mean vector for each class starting with class 0.
        Returned if ``return_params`` is True.
    covs : list of ArrayLike of shape (n_dim, n_dim), dtype=np.float64
        The covariance for each class. Returned if ``return_params`` is True.

    Notes
    -----
    **Trunk**: The trunk simulation decreases the signal-to-noise ratio as the
    dimensionality increases. This is implemented by decreasing the mean of
    the distribution by a factor of ``sqrt(i)`` for each dimension ``i``.
    Thus for instance if the means of distribution one and two are 1 and -1
    respectively, the means for the first dimension will be 1 and -1, for the
    second dimension will be 1/sqrt(2) and -1/sqrt(2), and so on.

    **Trunk Overlap**: The trunk overlap simulation generates two classes of
    data with the same covariance matrix and mean vector of zeros.

    Covariance: The covariance matrix among different dimensions is
    controlled by the ``rho`` parameter and the ``band_type`` parameter. The
    ``band_type`` parameter controls the type of band to use, while the
    ``rho`` parameter controls the specific scaling factor for the covariance
    matrix while going from one dimension to the next.

    References
    ----------
    .. footbibliography::
    """
    if n_dim < n_informative:
        n_informative = n_dim
        warnings.warn(
            f"Number of informative dimensions {n_informative} must be less than number "
            f"of dimensions, {n_dim}. Setting n_informative to n_dim.",
            RuntimeWarning,
        )

    rng = np.random.default_rng(seed=seed)

    mu_1_vec = np.array([mu_1 / np.sqrt(i) for i in range(1, n_informative + 1)])
    mu_0_vec = np.array([mu_0 / np.sqrt(i) for i in range(1, n_informative + 1)])

    if rho != 0:
        if band_type == "ma":
            cov = _moving_avg_cov(n_informative, rho)
        elif band_type == "ar":
            cov = _autoregressive_cov(n_informative, rho)
        else:
            raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
    else:
        cov = np.identity(n_informative)

    # allow arbitrary uniform scaling of the covariance matrix
    cov = scaling_factor * cov

    # for large multivariate normal sampling problems, Cholesky factorization
    # is faster than the default SVD factorization
    if n_informative > 1000:
        method = "cholesky"
    else:
        method = "svd"

    X = np.vstack(
        [
            rng_child.multivariate_normal(mu_vec, cov, n_samples // 2, method=method)
            for rng_child, mu_vec in zip(rng.spawn(2), [mu_0_vec, mu_1_vec])
        ]
    )

    if n_dim > n_informative:
        X = np.hstack(
            (
                X,
                np.hstack(
                    [
                        rng_child.normal(loc=0, scale=1, size=(X.shape[0], 1))
                        for rng_child in rng.spawn(n_dim - n_informative)
                    ]
                ),
            )
        )

    y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

    if return_params:
        return [X, y, [mu_0_vec, mu_1_vec], [cov, cov]]
    return X, y
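
# Illustrative usage of ``make_trunk_classification`` with the canonical
# trunk means of -1 and 1 (a minimal sketch; the argument values are
# arbitrary):
#
#     >>> X, y = make_trunk_classification(
#     ...     n_samples=100, n_dim=10, n_informative=10, mu_0=-1, mu_1=1, seed=0
#     ... )
#     >>> X.shape, y.shape
#     ((100, 10), (100,))
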
class MarronWandSims:
    """Generate the mean/covariance parameters of each Gaussian in the
    Marron-Wand mixtures :footcite:`marron1992exact`."""

    def __init__(self, n_dim=1, cov=1):
        self.n_dim = n_dim
        self.cov = cov

    def __call__(self, simulation):
        sims = self._my_method_generator()
        if simulation in sims.keys():
            return sims[simulation]()
        else:
            raise ValueError(f"simulation is not one of these: {sims.keys()}")

    def _my_method_generator(self):
        return {
            method: getattr(self, method) for method in dir(self) if not method.startswith("__")
        }

    def gaussian(self):
        return [[np.zeros(self.n_dim), self.cov]]

    def skewed_unimodal(self):
        return [
            [np.zeros(self.n_dim), self.cov],
            [np.full(self.n_dim, 1 / 2), self.cov * (2 / 3) ** 2],
            [np.full(self.n_dim, 13 / 12), self.cov * (5 / 9) ** 2],
        ]

    def strongly_skewed(self):
        return [
            [np.full(self.n_dim, 3 * ((2 / 3) ** l_mix - 1)), self.cov * (2 / 3) ** (2 * l_mix)]
            for l_mix in range(8)
        ]

    def kurtotic_unimodal(self):
        return [[np.zeros(self.n_dim), self.cov], [np.zeros(self.n_dim), self.cov * (1 / 10) ** 2]]

    def outlier(self):
        # same components as ``kurtotic_unimodal``; only the mixture weights differ
        return [[np.zeros(self.n_dim), self.cov], [np.zeros(self.n_dim), self.cov * (1 / 10) ** 2]]

    def bimodal(self):
        return [
            [-np.ones(self.n_dim), self.cov * (2 / 3) ** 2],
            [np.ones(self.n_dim), self.cov * (2 / 3) ** 2],
        ]

    def separated_bimodal(self):
        return [
            [-np.full(self.n_dim, 3 / 2), self.cov * (1 / 2) ** 2],
            [np.full(self.n_dim, 3 / 2), self.cov * (1 / 2) ** 2],
        ]

    def skewed_bimodal(self):
        return [
            [np.zeros(self.n_dim), self.cov],
            [np.full(self.n_dim, 3 / 2), self.cov * (1 / 3) ** 2],
        ]

    def trimodal(self):
        return [
            [np.full(self.n_dim, -6 / 5), self.cov * (3 / 5) ** 2],
            [np.full(self.n_dim, 6 / 5), self.cov * (3 / 5) ** 2],
            [np.zeros(self.n_dim), self.cov * (1 / 4) ** 2],
        ]

    def claw(self):
        return [
            [np.zeros(self.n_dim), self.cov],
            *[
                [np.full(self.n_dim, (l_mix / 2) - 1), self.cov * (1 / 10) ** 2]
                for l_mix in range(5)
            ],
        ]

    def double_claw(self):
        return [
            [-np.ones(self.n_dim), self.cov * (2 / 3) ** 2],
            [np.ones(self.n_dim), self.cov * (2 / 3) ** 2],
            *[
                [np.full(self.n_dim, (l_mix - 3) / 2), self.cov * (1 / 100) ** 2]
                for l_mix in range(7)
            ],
        ]

    def asymmetric_claw(self):
        return [
            [np.zeros(self.n_dim), self.cov],
            *[
                [np.full(self.n_dim, l_mix + 1 / 2), self.cov * (1 / ((2**l_mix) * 10)) ** 2]
                for l_mix in range(-2, 3)
            ],
        ]

    def asymmetric_double_claw(self):
        return [
            *[
                [np.full(self.n_dim, 2 * l_mix - 1), self.cov * (2 / 3) ** 2]
                for l_mix in range(2)
            ],
            *[
                [-np.full(self.n_dim, l_mix / 2), self.cov * (1 / 100) ** 2]
                for l_mix in range(1, 4)
            ],
            *[
                [np.full(self.n_dim, l_mix / 2), self.cov * (7 / 100) ** 2]
                for l_mix in range(1, 4)
            ],
        ]

    def smooth_comb(self):
        return [
            [
                np.full(self.n_dim, (65 - 96 * ((1 / 2) ** l_mix)) / 21),
                self.cov * (32 / 63) ** 2 / (2 ** (2 * l_mix)),
            ]
            for l_mix in range(6)
        ]

    def discrete_comb(self):
        return [
            *[
                [np.full(self.n_dim, (12 * l_mix - 15) / 7), self.cov * (2 / 7) ** 2]
                for l_mix in range(3)
            ],
            *[
                [np.full(self.n_dim, (2 * l_mix) / 7), self.cov * (1 / 21) ** 2]
                for l_mix in range(8, 11)
            ],
        ]


def _moving_avg_cov(n_dim, rho):
    # Create a meshgrid of 1-based indices
    i, j = np.meshgrid(np.arange(1, n_dim + 1), np.arange(1, n_dim + 1), indexing="ij")

    # banded moving-average covariance: rho ** |i - j| ...
    cov_matrix = rho ** np.abs(i - j)

    # ... zeroed outside the first off-diagonal band
    cov_matrix[np.abs(i - j) > 1] = 0
    return cov_matrix


def _autoregressive_cov(n_dim, rho):
    # Create a meshgrid of 1-based indices
    i, j = np.meshgrid(np.arange(1, n_dim + 1), np.arange(1, n_dim + 1), indexing="ij")

    # autoregressive covariance decays geometrically: rho ** |i - j|
    cov_matrix = rho ** np.abs(i - j)
    return cov_matrix
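
# The two banded constructions differ in how correlation decays across
# dimensions: ``_moving_avg_cov`` keeps only the first off-diagonal band,
# while ``_autoregressive_cov`` decays geometrically as rho ** |i - j|.
# A small sketch with n_dim=4 and rho=0.5 (the values shown are exact):
#
#     >>> _moving_avg_cov(4, 0.5)
#     array([[1. , 0.5, 0. , 0. ],
#            [0.5, 1. , 0.5, 0. ],
#            [0. , 0.5, 1. , 0.5],
#            [0. , 0. , 0.5, 1. ]])
#     >>> _autoregressive_cov(4, 0.5)
#     array([[1.   , 0.5  , 0.25 , 0.125],
#            [0.5  , 1.   , 0.5  , 0.25 ],
#            [0.25 , 0.5  , 1.   , 0.5  ],
#            [0.125, 0.25 , 0.5  , 1.   ]])
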
def approximate_clf_mutual_information(
    means, covs, class_probs=[0.5, 0.5], base=np.exp(1), seed=None
):
    """Approximate MI for multivariate Gaussian for a classification setting.

    Parameters
    ----------
    means : list of ArrayLike of shape (n_dim,)
        A list of means to sample from for each class.
    covs : list of ArrayLike of shape (n_dim, n_dim)
        A list of covariances to sample from for each class.
    class_probs : list, optional
        List of class probabilities, by default [0.5, 0.5] for
        balanced binary classification.
    base : float, optional
        The bit base to use, by default np.exp(1) for natural logarithm.
    seed : int, optional
        Random seed for the multivariate normal, by default None.

    Returns
    -------
    I_XY : float
        Estimated mutual information.
    H_X : float
        Estimated entropy of X, the mixture of multivariate Gaussians.
    H_XY : float
        The conditional entropy of X given Y.
    int_err : float
        The integration error for ``H_X``.
    """
    # this implicitly assumes that the signal of interest is between -10 and 10
    scale = 10
    n_dims = [cov.shape[1] for cov in covs]
    lims = [[-scale, scale]] * max(n_dims)

    # integrand for the entropy of X: -p(x) log p(x), where p(x) is the
    # mixture density over the classes
    def func(*args):
        x = np.array(args)
        p = 0
        for k in range(len(means)):
            p += class_probs[k] * multivariate_normal.pdf(x, means[k], covs[k])
        return -p * np.log(p) / np.log(base)

    # numerically integrate H(X)
    # opts = dict(limit=1000)
    H_X, int_err = nquad(func, lims)

    # compute H(X | Y) from the closed-form entropy of each Gaussian
    H_XY = 0
    for k in range(len(means)):
        H_XY += (
            class_probs[k]
            * (n_dims[k] * np.log(2 * np.pi) + np.log(np.linalg.det(covs[k])) + n_dims[k])
            / (2 * np.log(base))
        )

    I_XY = H_X - H_XY
    return I_XY, H_X, H_XY, int_err
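
# Illustrative sketch: two well-separated 1-D Gaussians with equal priors
# carry about 1 bit of information about the class label (``base=2`` chosen
# here for bits; the result is approximate up to integration error):
#
#     >>> means = [np.array([-5.0]), np.array([5.0])]
#     >>> covs = [np.eye(1), np.eye(1)]
#     >>> I_XY, H_X, H_XY, err = approximate_clf_mutual_information(means, covs, base=2)
#     >>> round(float(I_XY), 2)
#     1.0
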
def approximate_clf_mutual_information_with_monte_carlo(
    means, covs, n_samples=100_000, class_probs=[0.5, 0.5], base=np.exp(1), seed=None
):
    """Approximate MI for multivariate Gaussian for a classification setting.

    Parameters
    ----------
    means : list of ArrayLike of shape (n_dim,)
        A list of means to sample from for each class.
    covs : list of ArrayLike of shape (n_dim, n_dim)
        A list of covariances to sample from for each class.
    n_samples : int
        The total number of simulation samples.
    class_probs : list, optional
        List of class probabilities, by default [0.5, 0.5] for
        balanced binary classification.
    base : float, optional
        The bit base to use, by default np.exp(1) for natural logarithm.
    seed : int, optional
        Random seed for the multivariate normal, by default None.

    Returns
    -------
    I_XY : float
        Estimated mutual information.
    H_Y : float
        Estimated entropy of Y, the class labels.
    H_Y_on_X : float
        The conditional entropy of Y given X.
    """
    rng = np.random.default_rng(seed=seed)

    P_Y = class_probs

    # Generate samples from each class-conditional Gaussian
    pdf_class = []
    X = []
    for i in range(len(means)):
        pdf_class.append(multivariate_normal(means[i], covs[i], allow_singular=True))
        X.append(
            rng.multivariate_normal(means[i], covs[i], size=int(n_samples * P_Y[i])).reshape(
                -1, len(means[i])
            )
        )
    X = np.vstack(X)

    # Calculate P(X) by the law of total probability
    P_X_l = []
    P_X_on_Y = []
    for i in range(len(means)):
        P_X_on_Y.append(pdf_class[i].pdf(X))
        P_X_l.append(P_X_on_Y[-1] * P_Y[i])
    P_X = sum(P_X_l)

    # Calculate P(Y|X) by Bayes' theorem
    P_Y_on_X = []
    for i in range(len(means)):
        P_Y_on_X.append((P_X_on_Y[i] * P_Y[i] / P_X).reshape(-1, 1))
    P_Y_on_X = np.hstack(P_Y_on_X)
    P_Y_on_X = P_Y_on_X[~np.isnan(P_Y_on_X)].reshape(-1, 2)

    # Calculate the entropy of Y from the class priors
    H_Y = entropy(P_Y, base=base)

    # Calculate the conditional entropy of Y given X
    H_Y_on_X = np.mean(entropy(P_Y_on_X, base=base, axis=1))

    MI = H_Y - H_Y_on_X
    return MI, H_Y, H_Y_on_X
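
# Illustrative sketch of the Monte Carlo variant on the same well-separated
# 1-D problem; the estimate should agree with the numerical-integration
# version above up to sampling noise:
#
#     >>> means = [np.array([-5.0]), np.array([5.0])]
#     >>> covs = [np.eye(1), np.eye(1)]
#     >>> MI, H_Y, H_Y_on_X = approximate_clf_mutual_information_with_monte_carlo(
#     ...     means, covs, n_samples=50_000, base=2, seed=0
#     ... )
#     >>> round(float(MI), 2)
#     1.0
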
def _compute_mi_bounds(means, covs, class_probs):
    # compute bounds using https://arxiv.org/pdf/2101.11670.pdf
    prior_y = np.array(class_probs)
    H_Y = entropy(prior_y, base=np.exp(1))

    # number of mixtures
    N = len(class_probs)

    cov = covs[0]
    H_YX_lb = 0.0
    H_YX_ub = 0.0
    for idx in range(N):
        num = 0.0
        denom = 0.0
        mean_i = means[idx]

        for jdx in range(N):
            mean_j = means[jdx]
            c_alpha, kl_div = _compute_c_alpha_and_kl(mean_i, mean_j, cov, alpha=0.5)
            num += class_probs[jdx] * np.exp(-c_alpha)

        for kdx in range(N):
            mean_k = means[kdx]
            c_alpha, kl_div = _compute_c_alpha_and_kl(mean_i, mean_k, cov, alpha=0.5)
            denom += class_probs[kdx] * np.exp(-kl_div)

        H_YX_lb += class_probs[idx] * np.log(num / denom)
        H_YX_ub += class_probs[idx] * np.log(denom / num)

    I_lb = H_Y - H_YX_lb
    I_ub = H_Y - H_YX_ub
    return I_lb, I_ub


def _compute_c_alpha_and_kl(mean_i, mean_j, cov, alpha=0.5):
    mean_ij = mean_i - mean_j
    lambda_ij = mean_ij.T @ np.linalg.inv(cov) @ mean_ij
    return alpha * (1.0 - alpha) * lambda_ij / 2.0, lambda_ij / 2
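
# Illustrative sketch of the private bound helpers (an equal-covariance
# Gaussian mixture, as the implementation assumes via ``covs[0]``):
#
#     >>> means = [np.array([-1.0]), np.array([1.0])]
#     >>> covs = [np.eye(1), np.eye(1)]
#     >>> I_lb, I_ub = _compute_mi_bounds(means, covs, class_probs=[0.5, 0.5])
#     >>> bool(I_lb <= I_ub)
#     True
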