Replicate Fig 1#

In this notebook we will be replicating figure 1 of the paper Whole-animal connectome and cell-type complement of the three-segmented Platynereis dumerilii larva, which is a network plot of the neurons in the larva’s connectome

import os
import logging
import pandas as pd
import numpy as np
import networkx as nx
from itertools import chain, combinations
from upsetplot import plot
from matplotlib import pyplot as plt
from networkx import from_numpy_array, number_of_nodes, number_of_edges, density
from graspologic.embed import AdjacencySpectralEmbed
from graspologic.layouts import layout_tsne, layout_umap
from graspologic.plot.plot import networkplot
from graspologic.utils import is_fully_connected, largest_connected_component, is_symmetric, symmetrize
from pkg.platy import load_connectome_adj, _get_folder
/Users/kareefullah/Library/Caches/pypoetry/virtualenvs/platy-data-EVeqgmAk-py3.9/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
        skids side class segment   type  group
0     2015233    l     s    head  100.0    NaN
1     1548290    l   NaN       1    NaN    NaN
2     1318919    l     s    head   88.0   15.0
3     2015241    l     s    head  100.0    NaN
4     1646603    r   NaN       3    NaN    NaN
...       ...  ...   ...     ...    ...    ...
2696  1302513    l     s    head    NaN    NaN
2697  1630186    l   NaN       2    NaN    NaN
2698  1441779    r   NaN    head    NaN    NaN
2699  1671147    r     m       1  165.0    NaN
2700  1048573    l     i       3    NaN    NaN

[2701 rows x 6 columns]
      skids side class   segment   type  group
0   2007060   lr     i      head    NaN    NaN
1    139303   lr     s         1  102.0    NaN
2   2007344    l    im      head  166.0    NaN
3   1016172    l    im         1    NaN    NaN
4   1999932    r    sm         1   95.0    NaN
5   1320159   rc     i      head    NaN    NaN
6   1631592   lc   NaN  pygidium    NaN    NaN
7   1418928   lc     s      head  132.0    NaN
8   1320798   lc     s      head    NaN    NaN
9    370978    r    sm  pygidium   54.0    9.0
10    84457   rc   NaN      head    NaN    NaN
11  1462145    l    im         1    NaN    NaN
12   307194   rc     s      head  135.0    NaN
13  1462664   rc   NaN         2    NaN    NaN
14  1594313    r    sm  pygidium   93.0    9.0
15  1332516   rc     s      head   49.0    NaN
16  1366160   rc     i  pygidium    NaN    1.0
17    64258   lr     s         1   36.0    NaN
18  1276971    l    sm         2   80.0    NaN
19    81251   lc     s      head   43.0    NaN
20  1302189    r    im      head   11.0    NaN
21  1294206   lc     i      head    NaN    NaN
folder = _get_folder()

We import the adjacency matrix for the connection of connectome neurons and save its edgelist (note: we are working with a directed graph)

adj = load_connectome_adj()
g = nx.from_pandas_adjacency(adj, create_using=nx.DiGraph)
nx.write_edgelist(g, folder / "weighted_edgelist.csv")

To be able to make a network plot of the connectome neurons, we need to extract the largest connected component of the neurons, as there are a few neurons that have a degree of 0 (no incident edges), which we cannot plot

#numpy representation of original graph
adj_numpy_orig = adj.to_numpy()

#numpy largest connected component
adj_numpy_lcc, new_inds = largest_connected_component(adj_numpy_orig, return_inds=True)
#symmetrized version of numpy_lcc
adj_numpy_sym_lcc = symmetrize(adj_numpy_lcc)

#networkx representation of symmetrized lcc graph
adj_nx_sym_lcc = nx.from_numpy_array(adj_numpy_sym_lcc, create_using=nx.DiGraph)
print(adj_nx_sym_lcc)
DiGraph with 2723 nodes and 21691 edges
#use layout_tsne to get 2d representation of graph
X, node_pos = layout_tsne(adj_nx_sym_lcc, perplexity=100, n_iter=10000)
x_pos = []
y_pos = []
node_comms = []
for pos in node_pos:

    #save the assigned community of the node
    node_comms.append(pos[4])
    x_pos.append(pos[1])
    y_pos.append(pos[2])

node_comms = np.array(node_comms)
x_pos = np.array(x_pos)
y_pos = np.array(y_pos)
degrees = np.sum(adj_numpy_sym_lcc, axis=0)
/Users/kareefullah/Library/Caches/pypoetry/virtualenvs/platy-data-EVeqgmAk-py3.9/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:800: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Users/kareefullah/Library/Caches/pypoetry/virtualenvs/platy-data-EVeqgmAk-py3.9/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:810: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
WARNING:graspologic.layouts.auto:Directed graph converted to undirected graph for community detection
plot = networkplot(
    adjacency=adj_numpy_sym_lcc,
    x=x_pos,
    y=y_pos,
    node_hue=node_comms,
    palette="tab20",
    node_size=degrees,
    node_sizes=(20, 200),
    edge_hue="source",
    edge_alpha=0.5,
    edge_linewidth=0.5,
)
plt.savefig(folder / "tsne_connectome.png")
plt.show()
_images/make_fig1_10_0.png
#use layout_umap to get 2d representation of graph
X, node_pos = layout_umap(adj_nx_sym_lcc)
x_pos = []
y_pos = []
node_comms = []
for pos in node_pos:
    node_comms.append(pos[4])
    x_pos.append(pos[1])
    y_pos.append(pos[2])

node_comms = np.array(node_comms)
x_pos = np.array(x_pos)
y_pos = np.array(y_pos)
degrees = np.sum(adj_numpy_sym_lcc, axis=0)
WARNING:graspologic.layouts.auto:Directed graph converted to undirected graph for community detection
plot = networkplot(
    adjacency=adj_numpy_sym_lcc,
    x=x_pos,
    y=y_pos,
    node_hue=node_comms,
    palette="tab20",
    node_size=degrees,
    node_sizes=(20, 200),
    edge_hue="source",
    edge_alpha=0.5,
    edge_linewidth=0.5,
)
plt.savefig(folder / "umap_connectome.png")
plt.show()
_images/make_fig1_12_0.png

We are now interested in displaying the statistics from the graph we generated to that of the statistics given in table 1 of the paper, and we wish to make sure that our numbers are approximately the same

#given the adjacency matrix and a partition of the nodes return the modularity score
def modularity_from_adjacency(sym_adj, partition, resolution=1):
    if isinstance(partition, dict):
        partition_labels = np.vectorize(partition.__getitem__)(
            np.arange(sym_adj.shape[0])
        )
    else:
        partition_labels = partition
    partition_labels = np.array(partition_labels)
    in_comm_mask = partition_labels[:, None] == partition_labels[None, :]
    degrees = np.squeeze(np.asarray(sym_adj.sum(axis=0)))
    degree_prod_mat = np.outer(degrees, degrees) / sym_adj.sum()
    mod_mat = sym_adj - resolution * degree_prod_mat
    return mod_mat[in_comm_mask].sum() / sym_adj.sum()
#get original networkx graph (not lcc)
adj_nx_orig = nx.from_numpy_array(adj_numpy_orig, create_using=nx.DiGraph)

#stats on original graph
n_nodes_orig = adj_nx_orig.number_of_nodes()
n_edges_orig = adj_nx_orig.number_of_edges()
dens_orig = density(adj_nx_orig)

#cannot calc modularity because there are isolate nodes
mod_score_orig = "N/A"

avg_degree_orig = np.average(np.count_nonzero(adj_numpy_orig, axis=0))
avg_degree_orig
4.234237536656892
#get non-symmetrized lcc networkx graph
adj_nx_lcc = nx.from_numpy_array(adj_numpy_lcc, create_using=nx.DiGraph)

#stats on lcc adj
n_nodes_lcc = adj_nx_lcc.number_of_nodes()
n_edges_lcc = adj_nx_lcc.number_of_edges()
dens_lcc = density(adj_nx_lcc)

#make symmetric when calc mod
sym_adj_lcc = symmetrize(adj_numpy_lcc)
mod_score_lcc = modularity_from_adjacency(sym_adj_lcc, node_comms)

avg_degree_lcc = np.average(np.count_nonzero(adj_numpy_lcc, axis=0))
#paper stats
n_nodes_paper = 2728
n_edges_paper = 11437
dens_paper = 0.0015
mod_score_paper = 0.62
avg_degree_paper = 4.1
stats = {"Number of nodes": [n_nodes_orig, n_nodes_lcc, n_nodes_paper], "Number of edges": [n_edges_orig, n_edges_lcc, n_edges_paper], "Graph density": [dens_orig, dens_lcc, dens_paper], "Modularity": [mod_score_orig, mod_score_lcc, mod_score_paper], "Avg. degree": [avg_degree_orig, avg_degree_lcc, avg_degree_paper]}
stats_df = pd.DataFrame.from_dict(stats, orient="index", columns=["Orig adj", "LCC adj", "Paper"])
stats_df.to_csv(folder / "fig_1_stats.csv")
stats_df
Orig adj LCC adj Paper
Number of nodes 2728 2723.000000 2728.0000
Number of edges 11551 11551.000000 11437.0000
Graph density 0.001553 0.001558 0.0015
Modularity N/A 0.647185 0.6200
Avg. degree 4.234238 4.242012 4.1000

Looks like the values of our statistics very closely match those of the paper!