The formatted source code for this file is here.
And a raw version here.
Previous work by Youngser Park can be found here.

1 Introduction

2 Data

The corresponds to 24 channels x 6 features per synapse, ordered like c0f0,c0f1,c0f2,c0f3,c0f4,c0f5,c1f0,c1f1… etc

f0 = integrated brightness
f1 = local brightness
f2 = distance to Center of Mass
f3 = moment of inertia around synapsin maxima
f4,f5 are features that I forget what they are.. would need to ask brad.
i would throw them out, I did so in my kohonen code (which you have, its in matlab).

and

On Feb 8, 2016, at 2:00 PM, Kristina Micheva kmicheva@stanford.edu wrote:

Excitatory presynaptic: ‘Synap’, ‘Synap’, ‘VGlut1’, ‘VGlut1’, ‘VGlut2’,
Excitatory postsynaptic: ‘psd’, ‘glur2’, ‘nmdar1’, ‘nr2b’, ‘NOS’, ‘Synapo’ (but further away than PSD, gluR2, nmdar1 and nr2b)
Inhibitory presynaptic: ‘gad’, ‘VGAT’, ‘PV’,
Inhibitory postsynaptic: ‘Gephyr’, ‘GABAR1’, ‘GABABR’, ‘NOS’,
At a very small number of inhibitory: ‘Vglut3’ (presynaptic), ‘CR1’(presynaptic),
Other synapses:‘5HT1A’, ‘TH’, ‘VACht’,
Not at synapses: ‘tubuli’, ‘DAPI’.

and

On March 10, 2016, 00:29:04 (UTC), Kristina Micheva kmicheva@stanford.edu wrote:

There are 2 different Synap channels (2 different antibodies were used), so that part is fine. And 2 different VGluT1 channels (same antibody but done at different times) The NOS channel is the same, so count it as one even though it appears twice. It is listed two times because it can be found at both excitatory and inhibitory synapses. This is where your count of 25 comes, even though there are 24 channels. I would also add the 2 Synap channels to the Inhibitory presynaptic category - there is supposed to be synapsin there, but at lower levels compared to excitatory presynaptic category.

Note: The order of the channels are given by line 227 in the kohenen.m file which can be found in the dropbox folder.
Synap and Synap have been augmented to Synap_1 and Synap_2 for clarity.
VGlut1 and VGlut1 have been augmented to VGlut1_t1 and VGlut1_t2 to distinguish between the different times (which are unknown).

feat <- fread("../Data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE)
dim(feat)

# [1] 1119299     144

### Setting a seed and creating an index vector
### to select half of the data
set.seed(2^13)
half1 <- sample(dim(feat)[1],dim(feat)[1]/2)

feat <- feat[half1,]
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
              'psd','glur2','nmdar1','nr2b','gad','VGAT',
              'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
              'NOS','TH','VACht','Synapo','tubuli','DAPI')

## NOS as ex.post
#channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
#                  'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
#                  'in.pre','in.post','in.post','in.post','in.pre.small','other',
#                  'ex.post','other','other','ex.post','none','none')
#
## NOS as in.post
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
                  'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
                  'in.pre','in.post','in.post','in.post','in.pre.small','other',
                  'in.post','other','other','ex.post','none','none')
nchannel <- length(channel)
nfeat <- ncol(feat) / nchannel
fchannel <- as.numeric(factor(channel.type,
    levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
    ))
ford <- order(fchannel)
#ccol <- rainbow(max(fchannel))[fchannel]
Syncol <- c("#197300","#5ed155","#990000","#cc0000","#ff9933","mediumblue","gold")
ccol <- Syncol[fchannel]

exType <- factor(c(rep("ex",11),rep("in",6),rep("other",7)),ordered=TRUE)
exCol<-exType;levels(exCol) <- c("#197300","#990000","mediumblue");
exCol <- as.character(exCol)

fname <- as.vector(sapply(channel,function(x) paste0(x,paste0("F",0:5))))
names(feat) <- fname
fcol <- rep(ccol, each=6)
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)

2.1 Transformations

Each of the channels has an arbitrary independent linear transformation and thus converting to z-scores or quantiles is required.

Selecting only the f0 features, we will use both the feature vector and a transformed version, transforming by adding 1 and taking the log base 10 for each column and scaling.

f0 <- seq(1,ncol(feat),by=nfeat)
fRaw <- subset(feat, select=f0)
#fLog <- apply(X=(fRaw+1),2,log10)
#fLogScaled <- scale(flog, center=TRUE, scale=TRUE)

eps <- 2*.Machine$double.eps
f01 <- apply(X=fRaw, 2, function(x){((x-min(x))/(max(x)-min(x)))})
fRank <- apply(fRaw,2,rank,ties.method='average')
fLog <- apply(f01,2,function(x){ log10(x+eps)})

3 Correlations

3.1 Raw

tmp <- as.numeric(table(fchannel))
cmatRaw <- cor(fRaw)
corrplot(cmatRaw[ford,ford],method="color",tl.col=ccol[ford])
corrRect(tmp,col=Syncol,lwd=4)

**Figure 1: Correlation on raw data, reordered by synapse type.**

3.2 Log

cmatLog <- cor(fLog)
corrplot(cmatLog[ford,ford],method="color",tl.col=ccol[ford])
corrRect(tmp,col=Syncol,lwd=4)

**Figure 2: Correlation on log data, reordered by synapse type.**

3.3 Rank

cmatRank <- cor(fRank)
corrplot(cmatRank[ford,ford],method="color",tl.col=ccol[ford])
corrRect(tmp,col=Syncol,lwd=4)

**Figure 3: Correlation on rank data, reordered by synapse type.**

4 PCA and Scree Plots on the Correlation Matrices

Note that centering and scaling are set to FALSE.

pcaRaw <- prcomp(cmatRaw,scale=FALSE, center=FALSE)
elRaw <- getElbows(pcaRaw$sdev, plot=FALSE)

pcaLog <- prcomp(cmatLog,scale=FALSE, center=FALSE)
elLog <- getElbows(pcaLog$sdev, plot=FALSE)

pcaRank <- prcomp(cmatRank,scale=FALSE, center=FALSE)
elRank <- getElbows(pcaRank$sdev, plot=FALSE)

par(mfrow=c(1,3))
plot(pcaRaw$sdev, type='b'); title("Scree plot: Raw Cor.  Matrix")
points(cbind(elRaw,pcaRaw$sdev[elRaw]), col='red', pch=20)
plot(pcaLog$sdev, type='b'); title("Scree plot: Log Cor. Matrix")
points(cbind(elLog,pcaLog$sdev[elLog]), col='red', pch=20)
plot(pcaRank$sdev, type='b'); title("Scree plot: Rank Cor. Matrix")
points(cbind(elRank,pcaRank$sdev[elRank]), col='red', pch=20)

5 PAMK

Running pamk for \(K = 7\).

5.1 PAMK on Raw data

K <- 7
pamRaw7 <- pamk(pcaRaw$x[,1:elRaw[2]],krange=K,usepam=FALSE,critout=FALSE)

aggRaw <- aggregate(pcaRaw$x[,1:elRaw[2]],by=list(lab=pamRaw7$pamo$clustering),FUN=mean)

aggRaw <- as.matrix(aggRaw[,-1])
rownames(aggRaw) <- clusterFraction(pamRaw7$pamo$clustering)

heatmap.2(as.matrix(aggRaw), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

# NULL

5.2 PAMK on Log Data

pamLog7 <- pamk(pcaLog$x[,1:elLog[2]],krange=K,usepam=FALSE,critout=FALSE)
#plot(pamLog5$crit, type='b')

aggLog <- aggregate(pcaLog$x[,1:elLog[2]],by=list(lab=pamLog7$pamo$clustering),FUN=mean)

aggLog <- as.matrix(aggLog[,-1])
rownames(aggLog) <- clusterFraction(pamLog7$pamo$clustering)

heatmap.2(as.matrix(aggLog), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

# NULL

5.3 PAMK on Ranked Data

pamRank7 <- pamk(pcaRank$x[,1:elRank[2]],krange=K,usepam=FALSE,critout=FALSE)
#plot(pamRank5$crit, type='b')

aggRank <- aggregate(pcaRank$x[,1:elRank[2]],by=list(lab=pamRank7$pamo$clustering),FUN=mean)

aggRank <- as.matrix(aggRank[,-1])
rownames(aggRank) <- clusterFraction(pamRank7$pamo$clustering)

heatmap.2(as.matrix(aggRank), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

# NULL

5.4 ARI

We are considering the channel types to be the “ground truth” here.

truth <- as.numeric(factor(channel.type))

Calculating the ARI for the raw data.

xRaw <- as.numeric(pamRaw7$pamo$clustering)
ariRaw <- foreach(i = 1:1e4, .combine='c') %dopar%{
    y <- sample(xRaw)
    mclust::adjustedRandIndex(truth,y)
    }

xLog <- as.numeric(pamLog7$pamo$clustering)
ariLog <- foreach(i = 1:1e4, .combine='c') %dopar%{
    y <- sample(xLog)
    mclust::adjustedRandIndex(truth,y)
    }

xRank <- as.numeric(pamRank7$pamo$clustering)
ariRank <- foreach(i = 1:1e4, .combine='c') %dopar%{
    y <- sample(xRank)
    mclust::adjustedRandIndex(truth,y)
    }

par(mfrow=c(1,3))
hist(ariRaw)
abline(v = adjustedRandIndex(truth,xRaw), lwd=2, col='red')
hist(ariLog)
abline(v = adjustedRandIndex(truth,xLog), lwd=2, col='red')
hist(ariRank)
abline(v = adjustedRandIndex(truth,xRank), lwd=2, col='red')

adjustedRandIndex(truth, xRaw)

# [1] 0.250379

adjustedRandIndex(truth, xLog)

# [1] 0.1884301

adjustedRandIndex(truth, xRank)

# [1] 0.287806

5.5 ARI vs Embedding dimension.

Here we investigeta how ARI varies with the embedding dimension.

5.5.1 Functions

avd <- function(pcaObj,elbow,truth){
###
### Given a pca object with an elbow as upperbound
### compute the ARI of truth versus the pamk clustering for each
### dimension from 1:elbow
###
require(foreach)

out <- foreach(i = 1:elbow, .combine="rbind") %do% {
    pmk <- pamk(pcaObj$x[,1:i], krange=K, usepam=FALSE, critout=FALSE)
    data.frame(Embedding_Dimension=i,ARI=adjustedRandIndex(truth, pmk$pamo$clustering))
    }
return(out)
}

5.5.2 ARI Plots

avdRaw <- avd(pcaRaw, elRaw[2], truth)
avdLog <- avd(pcaLog, elLog[2], truth)
avdRank <- avd(pcaRank, elRank[2], truth)

avdRaw$group <- "Raw"
avdLog$group <- "Log"
avdRank$group <- "Rank"

avdDat <- data.frame(rbind(avdRaw, avdLog, avdRank))
#par(mfrow=c(1,3))
#plot(avdRaw, type='b', main="Raw Data")
#plot(avdLog, type='b', main="Log Data")
#plot(avdRank, type='b', main="Rank Data")

p1 <- ggplot(data=avdDat, aes(x=Embedding_Dimension, y=ARI,group=group,color=group)) + 
        geom_line(size=1.5) + geom_point(size=3) +
            theme(axis.text=element_text(size=18)) + ggtitle("ARI vs. DIM")
print(p1)

6 Plots of embeddings

First let’s look at the 3-d embeddings and make some decisions from there.

pairs(pcaRaw$x[,1:3],col=exCol,pch=as.numeric(exType)+15,cex=2,main="Raw")

pairs(pcaLog$x[,1:3],col=exCol, pch=as.numeric(exType)+15,cex=2,main="Log")

pairs(pcaRank$x[,1:3],col=exCol,pch=as.numeric(exType)+15,cex=2,main="Rank")

pca <- pcaRaw$x[,1:3]
rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s', col=exCol, size=1, main="Raw")
rgl::rgl.texts(pca[,1],pca[,2],pca[,3], channel[ford], col=exCol, adj=c(0,1.5))
subid <- currentSubscene3d()
rglwidget(elementId="plot3dRaw")

pca <- pcaLog$x
rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s', col=exCol, size=1, main="Log")
rgl::rgl.texts(pca[,1],pca[,2],pca[,3], channel[ford], col=exCol, adj=c(0,1.5))
subid <- currentSubscene3d()
rglwidget(elementId="plot3dLog")

pca <- pcaRank$x[,1:3]
rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s', col=exCol, size=3, main="Rank")
rgl::rgl.texts(pca[,1],pca[,2],pca[,3], channel[ford], col=exCol, adj=c(0,1.5))
subid <- currentSubscene3d()
rglwidget(elementId="plot3dRank")

7 Pick \(K=3\).

7.1 PAMK=3 on Raw data

K <- 3
pamRaw3 <- pamk(pcaRaw$x[,1:elRaw[2]],krange=K,usepam=FALSE,critout=FALSE)

agg3Raw <- aggregate(pcaRaw$x[,1:elRaw[2]],by=list(lab=pamRaw3$pamo$clustering),FUN=mean)

agg3Raw <- as.matrix(agg3Raw[,-1])
rownames(agg3Raw) <- clusterFraction(pamRaw3$pamo$clustering)

heatmap.2(as.matrix(agg3Raw), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

# NULL

7.2 PAMK=3 on Log Data

pamLog3 <- pamk(pcaLog$x[,1:elLog[2]],krange=K,usepam=FALSE,critout=FALSE)
#plot(pamLog5$crit, type='b')

agg3Log <- aggregate(pcaLog$x[,1:elLog[2]],by=list(lab=pamLog3$pamo$clustering),FUN=mean)

agg3Log <- as.matrix(agg3Log[,-1])
rownames(agg3Log) <- clusterFraction(pamLog3$pamo$clustering)

heatmap.2(as.matrix(agg3Log), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

# NULL

7.3 PAMK=3 on Ranked Data

pamRank3 <- pamk(pcaRank$x[,1:elRank[2]],krange=K,usepam=FALSE,critout=FALSE)
#plot(pamRank5$crit, type='b')

agg3Rank <- aggregate(pcaRank$x[,1:elRank[2]],by=list(lab=pamRank3$pamo$clustering),FUN=mean)

agg3Rank <- as.matrix(agg3Rank[,-1])
rownames(agg3Rank) <- clusterFraction(pamRank3$pamo$clustering)

heatmap.2(as.matrix(agg3Rank), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

# NULL

7.4 ARI Plots for \(K=3\)

avd3Raw <- avd(pcaRaw,elRaw[2],truth)
avd3Log <- avd(pcaLog,elLog[2],truth)
avd3Rank <- avd(pcaRank,elRank[2],truth)

avd3Raw$group <- "Raw"
avd3Log$group <- "Log"
avd3Rank$group <- "Rank"

avd3Dat <- data.frame(rbind(avd3Raw, avd3Log, avd3Rank))

p2 <- ggplot(data=avd3Dat, aes(x=Embedding_Dimension, y=ARI,group=group,color=group)) + 
        geom_line(size=1.5) + geom_point(size=3) +
            theme(axis.text=element_text(size=18)) + ggtitle("ARI vs. DIM")
print(p2)

pca <- pcaRaw$x[,1:3]
rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s', col=exCol, size=1, main="Raw")
rgl::rgl.texts(pca[,1],pca[,2],pca[,3], channel[ford], col=exCol, adj=c(0,1.5))
subid <- currentSubscene3d()
rglwidget(elementId="plot3dRaw")
open3d()
plot3d( cube3d(col = "green") )
M <- par3d("userMatrix")
if (!rgl.useNULL())
  play3d( par3dinterp(time = (0:2)*4, userMatrix = list(M,
                                     rotate3d(M, pi/2, 1, 0, 0),
                                     rotate3d(M, pi/2, 0, 0, 1) ) ), 
        duration = 18 )
 
movie3d( spin3d(), duration = 5)
subid <- currentSubscene3d()
rglwidget(elementId="plot3dRank")

Synapse Clustering: Random Half

JLP

2016-04-21