import pandas as pd
import graspologic as gp
import seaborn as sns
import hyppo
import matplotlib.pyplot as plt
import numpy as np

import plotly.graph_objects as go
import matplotlib.cm

from sklearn.preprocessing import LabelEncoder

from graspologic.embed import OmnibusEmbed, select_svd

import plotly.io as pio
pio.renderers.default="png"
## Read the data

key = pd.read_csv('../data/processed/key.csv')
data = pd.read_csv('../data/processed/mouses-volumes.csv')
fa = pd.read_csv('../data/processed/mouses-fa.csv')

data.set_index(key.DWI, inplace=True)
fa.set_index(key.DWI, inplace=True)

genotypes = ['APOE22', 'APOE33', 'APOE44']


gen_animals = {genotype: None for genotype in genotypes}

for genotype in genotypes:
    gen_animals[genotype] = key.loc[key['Genotype'] == genotype]['DWI'].tolist()

vol_dat = {genotype: [] for genotype in genotypes}
fa_dat = {genotype: [] for genotype in genotypes}
for genotype in genotypes:
    vol_dat[genotype] = data.loc[gen_animals[genotype]].to_numpy()
    fa_dat[genotype] = fa.loc[gen_animals[genotype]].to_numpy()
    
## Compute correlations
vol_cor = {genotype: gp.utils.symmetrize(np.corrcoef(dat, rowvar=False)) for (genotype, dat) in vol_dat.items()}
fa_cor = {genotype: gp.utils.symmetrize(np.corrcoef(dat, rowvar=False)) for (genotype, dat) in fa_dat.items()}

## Plot to make sure nothing is wrong

fig, ax = plt.subplots(1, 3, figsize=(18, 5))
for (i, genotype) in enumerate(vol_cor.keys()):
    fig.suptitle("Volume Correlations", fontsize=20)
    gp.plot.heatmap(vol_cor[genotype], title=genotype, ax=ax[i], vmin=-1, vmax=1)

fig, ax = plt.subplots(1, 3, figsize=(18, 5))
for (i, genotype) in enumerate(fa_cor.keys()):
    fig.suptitle("FA Correlations",  fontsize=20)
    gp.plot.heatmap(fa_cor[genotype], title=genotype, ax=ax[i], vmin=-1, vmax=1)
    
    

## Node hierarchical label data
node_labels = pd.read_csv("../data/processed/node_label_dictionary.csv")
_images/bec67794fb4f41194173415c8e1a0710c616f58c5ef764a64021b87748706c92.png _images/8fc5ba9c2122af216ce58e139e449d8cc60c65b4dd5f6a46ca371473afa22bd4.png

Test whether brain volume and FA are independent#

If FA and brain volume values are independent, it suggest that FA does not contain any information about brain volume, suggesting that using FA should not yield any different results

pvals = []

for genotype in genotypes:
    dcorr = hyppo.independence.Dcorr()
    stat, pval = dcorr.test(vol_dat[genotype].T, fa_dat[genotype].T)
    pvals.append([genotype, pval])
    
pval_df = pd.DataFrame(pvals, columns=["Genotype", "P-value"])

pval_df
Genotype P-value
0 APOE22 0.000121
1 APOE33 0.000197
2 APOE44 0.000104

These p-values suggest that FA and brain volume are depedent.

Testing whether brain volume and FA correlations are independent#

Similar analysis, but we are evaluating the correlations

pvals = []

for genotype in genotypes:
    dcorr = hyppo.independence.Dcorr(compute_distance=None)
    stat, pval = dcorr.test(vol_cor[genotype].T, fa_cor[genotype].T)
    pvals.append([genotype, pval])
    
pval_df = pd.DataFrame(pvals, columns=["Genotype", "P-value"])

pval_df
Genotype P-value
0 APOE22 0.000117
1 APOE33 0.000002
2 APOE44 0.000253

Again, the p-values suggest that brain volume and FA correlations are not independent

Using omnibus embedding and multiple adjacency spectral embedding to simulataneously embed brain volume and FA#

omni = OmnibusEmbed()

embeddings = [omni.fit_transform([vol_cor[genotype], fa_cor[genotype]])[0] for genotype in genotypes]

embeddings = np.hstack(embeddings)
embeddings.shape
(332, 13)
U, D, V = select_svd(embeddings)
D.shape
(2,)

Hierarchical clustering on the embeddings#

cluster = gp.cluster.DivisiveCluster(max_level=3)

cluster_labels = cluster.fit_predict(U, fcluster=True)

cluster_label_df = pd.DataFrame(cluster_labels, columns=["cluster_level_1", "cluster_level_2", "cluster_level_3"])
def count_groups(label_matrix):
    levels = label_matrix.shape[1] - 1
    d = []

    for level in range(levels):
        upper_cluster_ids = np.unique(label_matrix[:, level])
        
        for upper_cluster_id in upper_cluster_ids:
            lower_cluster_ids, counts = np.unique(
                label_matrix[label_matrix[:, level] == upper_cluster_id][:, level + 1], return_counts=True
            )

            for idx, lower_cluster_id in enumerate(lower_cluster_ids):
                if upper_cluster_id == lower_cluster_id:
                    lower_cluster_id = None
                d.append((upper_cluster_id, lower_cluster_id, counts[idx]))

    d = np.array(d)
    
    source = d[:, 0]
    target = d[:, 1]
    value = d[:, 2]
    
    return source, target, value

def append_apriori_labels(apriori_labels, cluster_matrix):
    encoder = LabelEncoder()
    apriori_labels_encoded = encoder.fit_transform(apriori_labels)
    apriori_labels_encoded = apriori_labels_encoded.reshape(-1, 1)
    
    # Increase the original cluster_matrix labels
    cluster_matrix_ = cluster_matrix + np.max(apriori_labels_encoded) + 1
    
    out = np.hstack([apriori_labels_encoded, cluster_matrix_])
    
    return out, list(encoder.classes_)

hemispheric_clusters, encoded_labels = append_apriori_labels(node_labels.Hemisphere, cluster_labels)

source, target, value = count_groups(hemispheric_clusters)

Visualizing Sankey#

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = encoded_labels + [f"Cluster {i}" for i in range(np.max(hemispheric_clusters))]
    ),
    link = dict(
      source = source,
      target = target,
      value = value
  ))])

fig.update_layout(title_text="Hemispheric Clustering", font_size=10)
fig.show(dpi=300, width=1000, height=600)
_images/7c48ab36749b9e30ecea529866065e7e32ce96d209d5e5b98a05e65beefc7270.png
level_1_clusters, encoded_labels = append_apriori_labels(node_labels.Level_1, cluster_labels)

source, target, value = count_groups(level_1_clusters)
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = encoded_labels + [f"Cluster {i}" for i in range(np.max(level_1_clusters[0]))],
    ),
    link = dict(
      source = source,
      target = target,
      value = value
  ))])

fig.update_layout(title_text="Level 1 Clustering", font_size=10)
fig.show(dpi=300, width=1000, height=600)
_images/9b0d9fb5d87e85d72f3809a943bf13c838d3dd54aed4c6ae0df0be180ada09c5.png
cl = pd.DataFrame(cluster_labels, columns=[f"Cluster_{i}" for i in range(1, 4)])

meta = pd.concat([node_labels, cl], axis=1)

meta.head()

## Plot to make sure nothing is wrong
for l in range(3):
    fig, ax = plt.subplots(
        ncols=3, 
        figsize=(20, 10), 
        #constrained_layout=True, 
        dpi=300,
        gridspec_kw=dict(width_ratios=[1, 1, 1])
    )
    
    for (i, genotype) in enumerate(vol_cor.keys()):
        gp.plot.adjplot(
            vol_cor[genotype], 
            ax=ax[i],
            vmin=-1,
            vmax=1,
            meta=meta,
            group=[f'Cluster_{l+1}'],
        )
        ax[i].set_title(f"{genotype}", pad=90, size=30)
        
    fig.savefig(f"./figures/2022-02-02-multigraph-clustering-level-{l + 1}.png", bbox_inches='tight')
_images/114e14b9804038e7c01a0312365ffe7e6762437023fc4c979d9132ed8976d96c.png _images/7f6bd982f5dcfa6dd9603568e9ee0a4221c958457daf332a1622a012d525ff67.png _images/ac609a1efecd9561d162fdc2d6af40211cceb49c708c6570722ce7d28c32aaec.png

Comparing results to prior clusterings#

mase = gp.embed.MultipleASE()
Vhat = mase.fit_transform([corr for _, corr in vol_cor.items()])
cluster = gp.cluster.DivisiveCluster(max_level=3)

cluster_labels = cluster.fit_predict(Vhat, fcluster=True)

cluster_label_df_ = pd.DataFrame(cluster_labels, columns=["cluster_level_1", "cluster_level_2", "cluster_level_3"])
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(cluster_label_df_.cluster_level_3, cluster_label_df.cluster_level_3)
0.6562743402239853