knitr::opts_chunk$set(cache=TRUE, autodep=TRUE)
dep_auto() # figure out dependencies automatically

## Warning in parse_objects(paths[1L]): file hkmeans_cache/html/__objects not
## found

## Warning in parse_objects(paths[2L]): file hkmeans_cache/html/__globals not
## found

opts_chunk$set(cache=FALSE,echo=TRUE,warning=FALSE,message=FALSE,comment="#",
               fig.path='../Figures/hkmeans_figure/', dpi=100,dev=c('png','pdf'))

opts_knit$set(aliases=c(h='fig.height', w='fig.width', cap='fig.cap', scap='fig.scap'))                                                                               
opts_knit$set(eval.after = c('fig.cap','fig.scap'))                                                                            
knit_hooks$set(document = function(x) {
  gsub('(\\\\end\\{knitrout\\}[\n]+)', '\\1\\\\noindent ', x)                                                   })
#opts_knit$set(animation.fun = hook_scianimator)

knit_hooks$set(plot = function(x, options) {
       paste('<figure><img src="',
             opts_knit$get('base.url'), paste(x, collapse = '.'),
             '"><figcaption>', options$fig.cap, '</figcaption></figure>',
             sep = '')
 })
 
 fn = local({
   i = 0
   function(x) {
     i <<- i + 1
     paste('Figure ', i, ': ', x, sep = '')
   }
 })

 fig <- local({
    i <- 0
    ref <- list()
    list(
        cap=function(refName, text) {
            i <<- i + 1
            ref[[refName]] <<- i
            paste("<b>Figure ", i, ": ", text, "</b><br><br>", sep="")
        },
        ref=function(refName) {
            ref[[refName]]
        })
})

Homepage
The formatted source code for this file is here.
And a raw version here.
Previous work by Youngser Park can be found here.

1 Introduction

On Fri, Dec 11, 2015 at 11:53 AM, joshua vogelstein jovo@jhu.edu wrote:
we will get n=10^6 points, each in d=25 dimensions.
i want to hierarchically cluster them, in a ways:

recursive k-means on the data, maybe 5 levels
compute approximate k-neighbors, svd in d=dimensions, and then #1
maybe some other ways.

2 Data

The corresponds to 24 channels x 6 features per synapse, ordered like c0f0,c0f1,c0f2,c0f3,c0f4,c0f5,c1f0,c1f1… etc

f0 = integrated brightness
f1 = local brightness
f2 = distance to Center of Mass
f3 = moment of inertia around synapsin maxima
f4,f5 are features that I forget what they are.. would need to ask brad.
i would throw them out, I did so in my kohonen code (which you have, its in matlab).

and

On Feb 8, 2016, at 2:00 PM, Kristina Micheva kmicheva@stanford.edu wrote:

Excitatory presynaptic: ‘Synap’, ‘Synap’, ‘VGlut1’, ‘VGlut1’, ‘VGlut2’,
Excitatory postsynaptic: ‘psd’, ‘glur2’, ‘nmdar1’, ‘nr2b’, ‘NOS’, ‘Synapo’ (but further away than PSD, gluR2, nmdar1 and nr2b)
Inhibitory presynaptic: ‘gad’, ‘VGAT’, ‘PV’,
Inhibitory postsynaptic: ‘Gephyr’, ‘GABAR1’, ‘GABABR’, ‘NOS’,
At a very small number of inhibitory: ‘Vglut3’ (presynaptic), ‘CR1’(presynaptic),
Other synapses:‘5HT1A’, ‘TH’, ‘VACht’,
Not at synapses: ‘tubuli’, ‘DAPI’.

and

On March 10, 2016, 00:29:04 (UTC), Kristina Micheva kmicheva@stanford.edu wrote:

There are 2 different Synap channels (2 different antibodies were used), so that part is fine. And 2 different VGluT1 channels (same antibody but done at different times) The NOS channel is the same, so count it as one even though it appears twice. It is listed two times because it can be found at both excitatory and inhibitory synapses. This is where your count of 25 comes, even though there are 24 channels. I would also add the 2 Synap channels to the Inhibitory presynaptic category - there is supposed to be synapsin there, but at lower levels compared to excitatory presynaptic category.

Note: The order of the channels are given by line 227 in the kohenen.m file which can be found in the dropbox folder.
Synap and Synap have been augmented to Synap_1 and Synap_2 for clarity.
VGlut1 and VGlut1 have been augmented to VGlut1_t1 and VGlut1_t2 to distinguish between the different times (which are unknown).

feat <- fread("../Data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE)
dim(feat)

# [1] 1119299     144

channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
              'psd','glur2','nmdar1','nr2b','gad','VGAT',
              'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
              'NOS','TH','VACht','Synapo','tubuli','DAPI')
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
                  'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
                  'in.pre','in.post','in.post','in.post','in.pre.small','other',
                  'ex.post','other','other','ex.post','none','none')
nchannel <- length(channel)
nfeat <- ncol(feat) / nchannel
fchannel <- as.numeric(factor(channel.type,
    levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
    ))
ford <- order(fchannel)
#ccol <- rainbow(max(fchannel))[fchannel]
Syncol <- c("#197300","#5ed155","#660000","#cc0000","#ff9933","mediumblue","gold")
ccol <- Syncol[fchannel]


fname <- as.vector(sapply(channel,function(x) paste0(x,paste0("F",0:5))))
names(feat) <- fname
fcol <- rep(ccol, each=6)
#mycol <- colorpanel(100,"blue","grey","red")
#mycol <- redgreen(100)
#mycol <- colorpanel(100, "purple", "black", "blue")
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)

2.1 Transformations

Each of the channels has an arbitrary independent linear transformation and thus converting to z-scores or quantiles is required.

Selecting only the f0 features, we will use both the raw data and a transformed version, transforming by adding 1 and taking the log base 10 for each column and scaling.

f0 <- seq(1,ncol(feat),by=nfeat)
featF0 <- subset(feat, select=f0)

#featF0s <- scale(featF0, scale=TRUE, center=TRUE)
#flog <- apply(X=(featF0+1),2,log10)
#f01 <- data.table(apply(X=featF0, 2, function(x){((x-min(x))/(max(x)-min(x)))}))

#qs <- apply(f01, 2, quantile, probs=c(.025,.5,.75,.99))

### Tunrcating to the inner 98%
tmpG <- featF0[,lapply(.SD,function(x){x >= quantile(x,prob=.01)})]
tmpL <- featF0[,lapply(.SD,function(x){x <= quantile(x,prob=1 - 0.01)})]
ind <- apply(tmpG,1,all)
ind2 <- apply(tmpL,1,all)
fil98F0 <- featF0[ind & ind2,]

### taking log_10
flog <- log10(fil98F0)
fs <- scale(flog, center=TRUE, scale=TRUE)

### scaling between [0,1]
f01 <- data.table(apply(X=flog, 2, function(x){((x-min(x))/(max(x)-min(x)))}))

The data

2.1.1 Kernel Density Estimates of the marginals

df <- melt(as.matrix(flog))
names(df) <- c("ind","channel","value")
ts <- 22
gg1 <- ggplot(df, aes(x=value)) + 
    scale_color_manual(values=ccol) +
    scale_fill_manual(values=ccol) +
    geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=100) +
    geom_density(aes(group=channel, color=channel),size=1.5) +
    facet_wrap(~ channel, scale='fixed') +
    theme(plot.title=element_text(size=ts),
          axis.title.x=element_text(size=ts),
          axis.title.y=element_text(size=ts),
          legend.title=element_text(size=ts),
          legend.text=element_text(size=ts-2),
          axis.text=element_text(size=ts-2),
          strip.text=element_text(size=ts))+
    ggtitle("Kernel Density Estimates of f01 transformed data ")

print(gg1)

**Figure 1: Kernel density estimates for each channel, on scaled data.**

df <- melt(as.matrix(f01))
names(df) <- c("ind","channel","value")
ts <- 22
gg2 <- ggplot(df, aes(x=value)) + 
    scale_color_manual(values=ccol) +
    scale_fill_manual(values=ccol) +
    geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=21) +
    geom_density(aes(group=channel, color=channel),size=1.5) +
    facet_wrap(~ channel, scale='fixed') +
    theme(plot.title=element_text(size=ts),
          axis.title.x=element_text(size=ts),
          axis.title.y=element_text(size=ts),
          legend.title=element_text(size=ts),
          legend.text=element_text(size=ts-2),
          axis.text=element_text(size=ts-2),
          strip.text=element_text(size=ts))+
    ggtitle("Kernel Density Estimates of f01 transformed data ")

print(gg2)

**Figure 2: Kernel density estimates for each channel, on f01 data.**

df <- melt(as.matrix(flog))
names(df) <- c("ind","channel","value")
gg3 <- ggplot(df, aes(x=value)) + 
    scale_color_manual(values=ccol) +
    geom_density(aes(group=channel, colour=channel))+
    facet_wrap( ~ channel, scale='free')+
    ggtitle("Kernel Density Estimates on log transformed data.")

print(gg3)

df <- melt(as.matrix(fs))
names(df) <- c("ind","channel","value")
ggplot(df, aes(x=value)) + 
    scale_color_manual(values=ccol) +
    geom_density(aes(group=channel, colour=channel))

**Figure 3: Kernel density estimates for each channel, on log transformed scaled data.**

2.2 K-Means Level 1

** Note that a seed is being set for the random initialization of K-means. **

K1 <- 12  ## Set the upperbound for k-means.

## Run kmeans on the untransformed data
kvecF0 <- foreach(i = 1:K1) %dopar% {
    set.seed(2^13 - 1)
    kmeans(featF0,centers=i)
}

## Run kmeans on the 
## log \circ scale transformed data
kvecfs <- foreach(i = 1:K1) %dopar% {
    set.seed(2^13 - 1)
    kmeans(fs,centers=i)
}

Here we calculate the bic from a user defined function and pick the maximum.

bicF0 <- kbic(kvecF0)
bicfs1 <- kbic(kvecfs)
mxF0 <- t(c(which(bicF0 == max(bicF0)), max(bicF0)))
mxfs1 <- t(c(which(bicfs1 == max(bicfs1)), max(bicfs1)))

par(mfrow=c(1,2))
plot(bicF0, type='b', main="BIC on raw data.")
points(mxF0[,1], mxF0[,2], col='red', pch=20)
plot(bicfs1, type='b', main="BIC on scaled data.")
points(mxfs1[,1], mxfs1[,2], col='red', pch=20)

2.2.1 Heat maps: Untransformed Data

For the following we manualy choose 2 clusters.

## Formatting data for heatmap
feat2 <- aggregate(featF0,by=list(lab=kvecF0[[2]]$cluster),FUN=mean)
feat2 <- as.matrix(feat2[,-1])

rownames(feat2) <- clusterFraction(kvecF0[[2]])

ford <- order(fchannel)

heatmap.2(as.matrix(feat2), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

#  [1] "gold"       "#197300"    "#cc0000"    "mediumblue" "#ff9933"   
#  [6] "#660000"    "#cc0000"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#660000"    "#5ed155"    "#cc0000"    "#ff9933"    "mediumblue"
# [16] "#660000"    "mediumblue" "gold"       "#5ed155"    "#5ed155"   
# [21] "#197300"    "#197300"    "#197300"    "#197300"

**Figure 4: Heatmap of the cluster means vs channels. Rows and columns are rearranged according to hclust.**

heatmap.2(as.matrix(feat2[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90)

#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"

**Figure 5: Heatmap of the cluster means vs channels. Rows and columns are rearranged according to synapse type.**

2.2.2 Heat maps: Transformed Data

## Formatting data for heatmap
feat2fs <- aggregate(fs,by=list(lab=kvecfs[[2]]$cluster),FUN=mean)
feat2fs <- as.matrix(feat2fs[,-1])

rownames(feat2fs) <- clusterFraction(kvecfs[[2]])
ford <- order(fchannel)

heatmap.2(as.matrix(feat2fs), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

#  [1] "#197300"    "#197300"    "#197300"    "#5ed155"    "#cc0000"   
#  [6] "#197300"    "#5ed155"    "#5ed155"    "#5ed155"    "mediumblue"
# [11] "mediumblue" "mediumblue" "gold"       "#ff9933"    "#cc0000"   
# [16] "#ff9933"    "#197300"    "#5ed155"    "#5ed155"    "#cc0000"   
# [21] "#660000"    "gold"       "#660000"    "#660000"

**Figure 6: Heatmap of the cluster means vs channels. Rows and columns are rearranged according to hclust.**

heatmap.2(as.matrix(feat2fs[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90)

#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"

**Figure 7: Heatmap of the cluster means vs channels. Rows and columns are rearranged according to synapse type.**

2.3 K-Means Level 2

synOnly<-channel.type!= "none" & channel.type!="other"

fs11 <- fs[kvecfs[[2]]$cluster==1,][,synOnly]
fs12 <- fs[kvecfs[[2]]$cluster==2,][,synOnly]


dim(fs11)

# [1] 423759     19

dim(fs12)

# [1] 379427     19

## Run kmeans on the untransformed data
kvecL21synOnly <- foreach(i = 1:K1) %dopar% {
    set.seed(2^13 - 1)
    kmeans(fs11,centers=i)
}

2.4 PAMK: Level 1

The pamk function performs something similar to K-means.

pamout <- pamk(fs, krange=1:K1,usepam=FALSE,critout=FALSE)
labs <- pamout$pamobject$clustering

plot(pamout$crit,type="b")

**Figure 8: Average silhouette width plot of the level 1 clustering.**

feat5 <- aggregate(fs,by=list(lab=pamout$pamo$clustering),FUN=mean)

feat5 <- as.matrix(feat5[,-1])
rownames(feat5) <- clusterFraction(labs)

### Reordering based on synapse type.
ford <- order(fchannel)

heatmap.2(as.matrix(feat5), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

#  [1] "mediumblue" "mediumblue" "gold"       "#cc0000"    "#ff9933"   
#  [6] "#ff9933"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [11] "#5ed155"    "gold"       "#5ed155"    "#5ed155"    "#5ed155"   
# [16] "#5ed155"    "mediumblue" "#197300"    "#cc0000"    "#5ed155"   
# [21] "#197300"    "#197300"    "#197300"    "#197300"

heatmap.2(as.matrix(feat5[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90)

#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"

2.5 PAMK: Level 2

2.5.1 PAMK: Level 2.1

We begin by splitting the data into the two clusters as calculated above.

fs1 <- fs[pamout$pamobject$clustering==1,]
fs2 <- fs[pamout$pamobject$clustering==2,]

pamout21 <- pamk(fs1, krange=1:K1,usepam=FALSE,critout=FALSE)
labs21 <- pamout21$pamobject$clustering

plot(pamout21$crit,type="b")

**Figure 11: Average silhouette width plot of the level 2 clustering.**

agg21 <- aggregate(fs1,by=list(lab=pamout21$pamo$clustering),FUN=mean)
agg21 <- as.matrix(agg21[,-1])

cs <- paste(paste0("C", 1:pamout21$nc),
   foreach(i =1:pamout21$nc,.combine=c)%do%{(sprintf("%.2g",sum(labs21==i)/length(labs21)))})
    
rownames(agg21) <- cs

heatmap.2(as.matrix(agg21), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

#  [1] "#197300"    "#197300"    "#5ed155"    "#cc0000"    "#197300"   
#  [6] "#197300"    "mediumblue" "mediumblue" "#ff9933"    "#5ed155"   
# [11] "mediumblue" "#197300"    "#5ed155"    "gold"       "#ff9933"   
# [16] "#5ed155"    "#5ed155"    "#660000"    "#660000"    "#660000"   
# [21] "#cc0000"    "#cc0000"    "#5ed155"    "gold"

heatmap.2(as.matrix(agg21[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90)

#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"

2.5.2 PAMK: Level 2.2

pamout22 <- pamk(fs2, krange=1:K1,usepam=FALSE,critout=FALSE)
labs22 <- pamout22$pamobject$clustering

plot(pamout22$crit,type="b")

**Figure 14: Average silhouette width plot of the level 2 clustering.**

agg22 <- aggregate(fs2,by=list(lab=pamout22$pamo$clustering),FUN=mean)
agg22 <- as.matrix(agg22[,-1])

cs <- paste(paste0("C", 1:pamout22$nc),
   foreach(i =1:pamout22$nc,.combine=c)%do%{(sprintf("%.2g",sum(labs22==i)/length(labs22)))})
    
rownames(agg22) <- cs

heatmap.2(as.matrix(agg22), trace="none",col=rev(mycol),colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #

#  [1] "mediumblue" "mediumblue" "#660000"    "#660000"    "#197300"   
#  [6] "#197300"    "#660000"    "#cc0000"    "gold"       "mediumblue"
# [11] "#197300"    "#5ed155"    "#197300"    "#197300"    "#5ed155"   
# [16] "#5ed155"    "#5ed155"    "#5ed155"    "#cc0000"    "#cc0000"   
# [21] "gold"       "#ff9933"    "#5ed155"    "#ff9933"

heatmap.2(as.matrix(agg22[,ford]),dendrogram='row',Colv=NA,trace="none", col=rev(mycol),colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90)

#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"

tmp <- as.numeric(table(fchannel))
cmat <- cor(fs)
corrplot(cmat[ford,ford],method="color",tl.col=ccol[ford])
corrRect(tmp,col=Syncol,lwd=4)

**Figure 17: Correlation on log º scaled data, reordered by synapse type.**

3 Exploring pair-wise relationships

set.seed(2^13)
s1 <- sample(1e6,1e4)
dm <- as.matrix(f01[s1,])

c1 <- combn(24,2)
d2 <- foreach(i = 1:dim(c1)[2],.combine=rbind)%do%{
    colset1 <- as.matrix(dm[,c1[,i]])
    colnames(colset1) <- NULL
    l <- paste(channel[c1[,i]][2],channel[c1[,i][1]],sep='~')
        
    data.frame(colset1,l,check.names=FALSE)
}

colnames(d2) <- c("x", "y", "g")

p1 <- xyplot(y ~ x | g, data=d2, 
       panel=function(x,y,...){
           panel.xyplot(x,y,...)
           fit <- lm(y ~ x)
           panel.lines(x,fitted(fit), col.line='red', lwd=2)
       })

print(p1)
p2 <- xyplot(y ~ x | g, data=d2, type=c('p', 'smooth'), col.line='red',lwd=3,pch='.',scales = list(y = list(relation = "free"),x = list(relation = "free")))

pdf('~/Desktop/tmp1.pdf', height=40,width=60) 
print(p1) 
dev.off()

tmp <- as.matrix(f01[s1,])
plot(Synap_2F0 ~ Synap_1F0, data=f01)
chull(tmp)
polygon(tmp[chull(tmp),])

bagplot(tmp[,c(1,2)])

Synapse Clustering

JLP

2016-04-21