Source code for sktree.datasets.hyppo

import numpy as np



[docs]
def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None):
    """Simulate classification data from a quadratic model.

    This is a form of the simulation used in :footcite:`panda2018learning`.

    Parameters
    ----------
    n_samples : int
        The number of samples to generate.
    n_features : int
        The number of dimensions in the dataset.
    noise : bool, optional
        Whether or not to add noise, by default False.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    x : array-like, shape (2 * n_samples, n_features)
        Data array.
    v : array-like, shape (2 * n_samples,)
        Target array of 1's and 0's.

    References
    ----------
    .. footbibliography::
    """
    rng = np.random.default_rng(seed)

    x = rng.standard_normal(size=(n_samples, n_features))
    coeffs = np.array([np.exp(-0.0325 * (i + 24)) for i in range(n_features)])
    eps = rng.standard_normal(size=(n_samples, n_features))

    x_coeffs = x * coeffs
    y = x_coeffs**2 + noise * eps

    # generate the classification labels
    n1 = x.shape[0]
    n2 = y.shape[0]
    v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
    x = np.vstack((x, y))
    return x, v