Brain Volume Analysis

Brain Volume Analysis#

import pandas as pd
import graspologic as gp
import seaborn as sns
import hyppo
import matplotlib.pyplot as plt
import numpy as np
import plotnine as p9

## Read the data

key = pd.read_csv('../data/processed/key.csv')
data = pd.read_csv('../data/processed/mouses-volumes.csv')

data.set_index(key.DWI, inplace=True)

genotypes = ['APOE22', 'APOE33', 'APOE44']

gen_animals = {genotype: None for genotype in genotypes}

for genotype in genotypes:
    gen_animals[genotype] = key.loc[key['Genotype'] == genotype]['DWI'].tolist()

vol_dat = {genotype: [] for genotype in genotypes}
    
for genotype in genotypes:
    vol_dat[genotype] = data.loc[gen_animals[genotype]].to_numpy()[:,1:]
    
## Compute correlations
cor_dat = {genotype: gp.utils.symmetrize(np.corrcoef(dat, rowvar=False)) for (genotype, dat) in vol_dat.items()}

## Plot to make sure nothing is wrong
fig, ax = plt.subplots(1, 3, figsize=(18, 5))

for (i, genotype) in enumerate(cor_dat.keys()):
    gp.plot.heatmap(cor_dat[genotype], title=genotype, ax=ax[i], vmin=-1, vmax=1)

_images/4338db5ee421298f0ef12ad4375df868b5164da019fd34a2ae43f8842a790fba.png

Testing whether group volumes are significantly different#

We compute 3-sample distance correlation to see if there is any differences in the 3 groups. Specifically, we test whether the distribution of brain volumes from at least two genotypes are different.

ksample = hyppo.ksample.KSample("dcorr")

stat, pval = ksample.test(*[dat for (genotype, dat) in vol_dat.items()])

print(pval)

0.006007509501547491

We see significant difference among the three genotypes

Testing for most significantly different regions#

Since we detect difference in at least two of the genotypes using the whole brain volumes, we repeat the procedure on each of the regions.

region_pvals = {reg: None for reg in data.columns[1:]}

for i, reg in enumerate(data.columns[1:]):
    region_vols = [dat[:, i] for (genotype, dat) in vol_dat.items()]
    stat, pval = ksample.test(*region_vols)

    region_pvals[reg] = pval

sorted_pvals = dict(sorted(region_pvals.items(), key=lambda item: item[1]))

pd.DataFrame(list(sorted_pvals.items())[:10], columns = ['Region', 'P-value'])

	Region	P-value
0	ml.1	0.000590
1	DTT	0.000763
2	PnC	0.001241
3	Cb_1_10.1	0.001744
4	PnRt	0.002679
5	Vll.1	0.003626
6	ml	0.004092
7	DTT.1	0.004172
8	SC.1	0.004264
9	ParB	0.005457

Testing for differences in the brain volume covariance#

Using similar procedure as above, we test whether the covariances are different among the genotypes. Prior to the 3-sample test, we use Omnibus embedding on the covariance matrices to obtain latent positions, which we then test for difference in distributions.

omni = gp.embed.OmnibusEmbed()

omni.fit([dat for (_, dat) in cor_dat.items()])

Xhats = omni.latent_left_
print(Xhats.shape)

(3, 332, 4)

ksample = hyppo.ksample.KSample("dcorr")

_, pval = ksample.test(*Xhats)

print(pval)

8.254124554923335e-26

We again see a significant difference in distributions

Testing for differences in pairwise covariances#

We then test for whether the distribution of each pair of genotypes are different.

genotypes = ['APOE22', 'APOE33', 'APOE44']

ksample = hyppo.ksample.KSample("dcorr")


res = []

for i, g in enumerate(genotypes):
    for j, h in enumerate(genotypes):
        if g == h:
            res.append([g, h, 1])
        else:
            _, pval = ksample.test(Xhats[i], Xhats[j])
            res.append([g, h, pval])

pairwise_df = pd.DataFrame(res, columns=["Genotype 1", "Genotype 2", "pvalue"])

pairwise_df["Genotype 1"] = pairwise_df["Genotype 1"].astype('category')
pairwise_df["Genotype 2"] = pairwise_df["Genotype 2"].astype('category')
pairwise_df["log(pvalue)"] = np.log(pairwise_df["pvalue"])

pairwise_df

	Genotype 1	Genotype 2	pvalue	log(pvalue)
0	APOE22	APOE22	1.000000e+00	0.000000
1	APOE22	APOE33	9.932969e-21	-46.058427
2	APOE22	APOE44	1.014474e-17	-39.129576
3	APOE33	APOE22	9.932969e-21	-46.058427
4	APOE33	APOE33	1.000000e+00	0.000000
5	APOE33	APOE44	3.573940e-16	-35.567693
6	APOE44	APOE22	1.014474e-17	-39.129576
7	APOE44	APOE33	3.573940e-16	-35.567693
8	APOE44	APOE44	1.000000e+00	0.000000

plot = (p9.ggplot(p9.aes(x="Genotype 1", y="Genotype 2", fill="log(pvalue)")) +
  p9.geom_tile(pairwise_df))

plot

_images/fcbb584fd71bfdb0038514724a7ac9fb2c495e12de9f1479bb186a8be2ffc2d7.png

<ggplot: (8766045953917)>