knitr::opts_chunk$set(cache=TRUE, autodep=TRUE)
dep_auto() # figure out dependencies automatically
## Warning in parse_objects(paths[1L]): file hkmeans_cache/html/__objects not
## found
## Warning in parse_objects(paths[2L]): file hkmeans_cache/html/__globals not
## found
opts_chunk$set(cache=FALSE,echo=TRUE,warning=FALSE,message=FALSE,comment="#",
               fig.path='../Figures/hkmeans_figure/', dpi=100,dev=c('png','pdf'))

opts_knit$set(aliases=c(h='fig.height', w='fig.width', cap='fig.cap', scap='fig.scap'))                                                                               
opts_knit$set(eval.after = c('fig.cap','fig.scap'))                                                                            
knit_hooks$set(document = function(x) {
  gsub('(\\\\end\\{knitrout\\}[\n]+)', '\\1\\\\noindent ', x)                                                   })
#opts_knit$set(animation.fun = hook_scianimator)

knit_hooks$set(plot = function(x, options) {
       paste('<figure><img src="',
             opts_knit$get('base.url'), paste(x, collapse = '.'),
             '"><figcaption>', options$fig.cap, '</figcaption></figure>',
             sep = '')
 })
 
 fn = local({
   i = 0
   function(x) {
     i <<- i + 1
     paste('Figure ', i, ': ', x, sep = '')
   }
 })

 fig <- local({
    i <- 0
    ref <- list()
    list(
        cap=function(refName, text) {
            i <<- i + 1
            ref[[refName]] <<- i
            paste("<b>Figure ", i, ": ", text, "</b><br><br>", sep="")
        },
        ref=function(refName) {
            ref[[refName]]
        })
})

Homepage
The formatted source code for this file is here.
And a raw version here.
Previous work by Youngser Park can be found here.

1 Introduction

On Fri, Dec 11, 2015 at 11:53 AM, joshua vogelstein jovo@jhu.edu wrote:
we will get n=10^6 points, each in d=25 dimensions.
i want to hierarchically cluster them, in a ways:

  1. recursive k-means on the data, maybe 5 levels
  2. compute approximate k-neighbors, svd in d=dimensions, and then #1
  3. maybe some other ways.

2 Data

The corresponds to 24 channels x 6 features per synapse, ordered like c0f0,c0f1,c0f2,c0f3,c0f4,c0f5,c1f0,c1f1โ€ฆ etc

f0 = integrated brightness
f1 = local brightness
f2 = distance to Center of Mass
f3 = moment of inertia around synapsin maxima
f4,f5 are features that I forget what they are.. would need to ask brad.
i would throw them out, I did so in my kohonen code (which you have, its in matlab).

and

On Feb 8, 2016, at 2:00 PM, Kristina Micheva kmicheva@stanford.edu wrote:

and

On March 10, 2016, 00:29:04 (UTC), Kristina Micheva kmicheva@stanford.edu wrote:

There are 2 different Synap channels (2 different antibodies were used), so that part is fine. And 2 different VGluT1 channels (same antibody but done at different times) The NOS channel is the same, so count it as one even though it appears twice. It is listed two times because it can be found at both excitatory and inhibitory synapses. This is where your count of 25 comes, even though there are 24 channels. I would also add the 2 Synap channels to the Inhibitory presynaptic category - there is supposed to be synapsin there, but at lower levels compared to excitatory presynaptic category.

feat <- fread("../Data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE)
dim(feat)
# [1] 1119299     144
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
              'psd','glur2','nmdar1','nr2b','gad','VGAT',
              'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
              'NOS','TH','VACht','Synapo','tubuli','DAPI')
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
                  'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
                  'in.pre','in.post','in.post','in.post','in.pre.small','other',
                  'ex.post','other','other','ex.post','none','none')
nchannel <- length(channel)
nfeat <- ncol(feat) / nchannel
fchannel <- as.numeric(factor(channel.type,
    levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
    ))
ford <- order(fchannel)
#ccol <- rainbow(max(fchannel))[fchannel]
Syncol <- c("#197300","#5ed155","#660000","#cc0000","#ff9933","mediumblue","gold")
ccol <- Syncol[fchannel]


fname <- as.vector(sapply(channel,function(x) paste0(x,paste0("F",0:5))))
names(feat) <- fname
fcol <- rep(ccol, each=6)
#mycol <- colorpanel(100,"blue","grey","red")
#mycol <- redgreen(100)
#mycol <- colorpanel(100, "purple", "black", "blue")
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)

2.1 Transformations

Each of the channels has an arbitrary independent linear transformation and thus converting to z-scores or quantiles is required.

Selecting only the f0 features, we will use both the raw data and a transformed version, transforming by adding 1 and taking the log base 10 for each column and scaling.

f0 <- seq(1,ncol(feat),by=nfeat)
featF0 <- subset(feat, select=f0)

#featF0s <- scale(featF0, scale=TRUE, center=TRUE)
#flog <- apply(X=(featF0+1),2,log10)
#f01 <- data.table(apply(X=featF0, 2, function(x){((x-min(x))/(max(x)-min(x)))}))

#qs <- apply(f01, 2, quantile, probs=c(.025,.5,.75,.99))

### Tunrcating to the inner 98%
tmpG <- featF0[,lapply(.SD,function(x){x >= quantile(x,prob=.01)})]
tmpL <- featF0[,lapply(.SD,function(x){x <= quantile(x,prob=1 - 0.01)})]
ind <- apply(tmpG,1,all)
ind2 <- apply(tmpL,1,all)
fil98F0 <- featF0[ind & ind2,]

### taking log_10
flog <- log10(fil98F0)
fs <- scale(flog, center=TRUE, scale=TRUE)

### scaling between [0,1]
f01 <- data.table(apply(X=flog, 2, function(x){((x-min(x))/(max(x)-min(x)))}))

The data

2.1.1 Kernel Density Estimates of the marginals

df <- melt(as.matrix(flog))
names(df) <- c("ind","channel","value")
ts <- 22
gg1 <- ggplot(df, aes(x=value)) + 
    scale_color_manual(values=ccol) +
    scale_fill_manual(values=ccol) +
    geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=100) +
    geom_density(aes(group=channel, color=channel),size=1.5) +
    facet_wrap(~ channel, scale='fixed') +
    theme(plot.title=element_text(size=ts),
          axis.title.x=element_text(size=ts),
          axis.title.y=element_text(size=ts),
          legend.title=element_text(size=ts),
          legend.text=element_text(size=ts-2),
          axis.text=element_text(size=ts-2),
          strip.text=element_text(size=ts))+
    ggtitle("Kernel Density Estimates of f01 transformed data ")

print(gg1)
Figure 1: Kernel density estimates for each channel, on scaled data.

df <- melt(as.matrix(f01))
names(df) <- c("ind","channel","value")
ts <- 22
gg2 <- ggplot(df, aes(x=value)) + 
    scale_color_manual(values=ccol) +
    scale_fill_manual(values=ccol) +
    geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=21) +
    geom_density(aes(group=channel, color=channel),size=1.5) +
    facet_wrap(~ channel, scale='fixed') +
    theme(plot.title=element_text(size=ts),
          axis.title.x=element_text(size=ts),
          axis.title.y=element_text(size=ts),
          legend.title=element_text(size=ts),
          legend.text=element_text(size=ts-2),
          axis.text=element_text(size=ts-2),
          strip.text=element_text(size=ts))+
    ggtitle("Kernel Density Estimates of f01 transformed data ")

print(gg2)
Figure 2: Kernel density estimates for each channel, on f01 data.

df <- melt(as.matrix(flog))
names(df) <- c("ind","channel","value")
gg3 <- ggplot(df, aes(x=value)) + 
    scale_color_manual(values=ccol) +
    geom_density(aes(group=channel, colour=channel))+
    facet_wrap( ~ channel, scale='free')+
    ggtitle("Kernel Density Estimates on log transformed data.")

print(gg3)
Figure 2: Kernel density estimates for each channel, on f01 data.

df <- melt(as.matrix(fs))
names(df) <- c("ind","channel","value")
ggplot(df, aes(x=value)) + 
    scale_color_manual(values=ccol) +
    geom_density(aes(group=channel, colour=channel))
Figure 3: Kernel density estimates for each channel, on log transformed scaled data.

2.2 K-Means Level 1

** Note that a seed is being set for the random initialization of K-means. **

K1 <- 12  ## Set the upperbound for k-means.

## Run kmeans on the untransformed data
kvecF0 <- foreach(i = 1:K1) %dopar% {
    set.seed(2^13 - 1)
    kmeans(featF0,centers=i)
}

## Run kmeans on the 
## log \circ scale transformed data
kvecfs <- foreach(i = 1:K1) %dopar% {
    set.seed(2^13 - 1)
    kmeans(fs,centers=i)
}

Here we calculate the bic from a user defined function and pick the maximum.

bicF0 <- kbic(kvecF0)
bicfs1 <- kbic(kvecfs)
mxF0 <- t(c(which(bicF0 == max(bicF0)), max(bicF0)))
mxfs1 <- t(c(which(bicfs1 == max(bicfs1)), max(bicfs1)))
par(mfrow=c(1,2))
plot(bicF0, type='b', main="BIC on raw data.")
points(mxF0[,1], mxF0[,2], col='red', pch=20)
plot(bicfs1, type='b', main="BIC on scaled data.")
points(mxfs1[,1], mxfs1[,2], col='red', pch=20)

2.2.1 Heat maps: Untransformed Data

For the following we manualy choose 2 clusters.

## Formatting data for heatmap
feat2 <- aggregate(featF0,by=list(lab=kvecF0[[2]]$cluster),FUN=mean)
feat2 <- as.matrix(feat2[,-1])

rownames(feat2) <- clusterFraction(kvecF0[[2]])

ford <- order(fchannel)
heatmap.2(as.matrix(feat2), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # 
#  [1] "gold"       "#197300"    "#cc0000"    "mediumblue" "#ff9933"   
#  [6] "#660000"    "#cc0000"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#660000"    "#5ed155"    "#cc0000"    "#ff9933"    "mediumblue"
# [16] "#660000"    "mediumblue" "gold"       "#5ed155"    "#5ed155"   
# [21] "#197300"    "#197300"    "#197300"    "#197300"
Figure 4: Heatmap of the cluster means vs channels. Rows and columns are rearranged according to hclust.

heatmap.2(as.matrix(feat2[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) 
#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"
Figure 5: Heatmap of the cluster means vs channels. Rows and columns are rearranged according to synapse type.

2.2.2 Heat maps: Transformed Data

## Formatting data for heatmap
feat2fs <- aggregate(fs,by=list(lab=kvecfs[[2]]$cluster),FUN=mean)
feat2fs <- as.matrix(feat2fs[,-1])

rownames(feat2fs) <- clusterFraction(kvecfs[[2]])
ford <- order(fchannel)
heatmap.2(as.matrix(feat2fs), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # 
#  [1] "#197300"    "#197300"    "#197300"    "#5ed155"    "#cc0000"   
#  [6] "#197300"    "#5ed155"    "#5ed155"    "#5ed155"    "mediumblue"
# [11] "mediumblue" "mediumblue" "gold"       "#ff9933"    "#cc0000"   
# [16] "#ff9933"    "#197300"    "#5ed155"    "#5ed155"    "#cc0000"   
# [21] "#660000"    "gold"       "#660000"    "#660000"
Figure 6: Heatmap of the cluster means vs channels. Rows and columns are rearranged according to hclust.

heatmap.2(as.matrix(feat2fs[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) 
#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"
Figure 7: Heatmap of the cluster means vs channels. Rows and columns are rearranged according to synapse type.

2.3 K-Means Level 2

synOnly<-channel.type!= "none" & channel.type!="other"

fs11 <- fs[kvecfs[[2]]$cluster==1,][,synOnly]
fs12 <- fs[kvecfs[[2]]$cluster==2,][,synOnly]


dim(fs11)
# [1] 423759     19
dim(fs12)
# [1] 379427     19
## Run kmeans on the untransformed data
kvecL21synOnly <- foreach(i = 1:K1) %dopar% {
    set.seed(2^13 - 1)
    kmeans(fs11,centers=i)
}

2.4 PAMK: Level 1

The pamk function performs something similar to K-means.

pamout <- pamk(fs, krange=1:K1,usepam=FALSE,critout=FALSE)
labs <- pamout$pamobject$clustering
plot(pamout$crit,type="b")
Figure 8: Average silhouette width plot of the level 1 clustering.

feat5 <- aggregate(fs,by=list(lab=pamout$pamo$clustering),FUN=mean)

feat5 <- as.matrix(feat5[,-1])
rownames(feat5) <- clusterFraction(labs)

### Reordering based on synapse type.
ford <- order(fchannel)
heatmap.2(as.matrix(feat5), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # 
#  [1] "mediumblue" "mediumblue" "gold"       "#cc0000"    "#ff9933"   
#  [6] "#ff9933"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [11] "#5ed155"    "gold"       "#5ed155"    "#5ed155"    "#5ed155"   
# [16] "#5ed155"    "mediumblue" "#197300"    "#cc0000"    "#5ed155"   
# [21] "#197300"    "#197300"    "#197300"    "#197300"
Figure 9: Default Sorting

heatmap.2(as.matrix(feat5[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) 
#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"
Figure 10: Sorted by Synapse type

2.5 PAMK: Level 2

2.5.1 PAMK: Level 2.1

We begin by splitting the data into the two clusters as calculated above.

fs1 <- fs[pamout$pamobject$clustering==1,]
fs2 <- fs[pamout$pamobject$clustering==2,]
pamout21 <- pamk(fs1, krange=1:K1,usepam=FALSE,critout=FALSE)
labs21 <- pamout21$pamobject$clustering
plot(pamout21$crit,type="b")
Figure 11: Average silhouette width plot of the level 2 clustering.

agg21 <- aggregate(fs1,by=list(lab=pamout21$pamo$clustering),FUN=mean)
agg21 <- as.matrix(agg21[,-1])

cs <- paste(paste0("C", 1:pamout21$nc),
   foreach(i =1:pamout21$nc,.combine=c)%do%{(sprintf("%.2g",sum(labs21==i)/length(labs21)))})
    
rownames(agg21) <- cs
heatmap.2(as.matrix(agg21), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # 
#  [1] "#197300"    "#197300"    "#5ed155"    "#cc0000"    "#197300"   
#  [6] "#197300"    "mediumblue" "mediumblue" "#ff9933"    "#5ed155"   
# [11] "mediumblue" "#197300"    "#5ed155"    "gold"       "#ff9933"   
# [16] "#5ed155"    "#5ed155"    "#660000"    "#660000"    "#660000"   
# [21] "#cc0000"    "#cc0000"    "#5ed155"    "gold"
Figure 12: Default Sorting

heatmap.2(as.matrix(agg21[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) 
#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"
Figure 13: Sorted by Synapse type

2.5.2 PAMK: Level 2.2

pamout22 <- pamk(fs2, krange=1:K1,usepam=FALSE,critout=FALSE)
labs22 <- pamout22$pamobject$clustering
plot(pamout22$crit,type="b")
Figure 14: Average silhouette width plot of the level 2 clustering.

agg22 <- aggregate(fs2,by=list(lab=pamout22$pamo$clustering),FUN=mean)
agg22 <- as.matrix(agg22[,-1])

cs <- paste(paste0("C", 1:pamout22$nc),
   foreach(i =1:pamout22$nc,.combine=c)%do%{(sprintf("%.2g",sum(labs22==i)/length(labs22)))})
    
rownames(agg22) <- cs
heatmap.2(as.matrix(agg22), trace="none",col=rev(mycol),colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # 
#  [1] "mediumblue" "mediumblue" "#660000"    "#660000"    "#197300"   
#  [6] "#197300"    "#660000"    "#cc0000"    "gold"       "mediumblue"
# [11] "#197300"    "#5ed155"    "#197300"    "#197300"    "#5ed155"   
# [16] "#5ed155"    "#5ed155"    "#5ed155"    "#cc0000"    "#cc0000"   
# [21] "gold"       "#ff9933"    "#5ed155"    "#ff9933"
Figure 15: Default Sorting

heatmap.2(as.matrix(agg22[,ford]),dendrogram='row',Colv=NA,trace="none", col=rev(mycol),colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) 
#  [1] "#197300"    "#197300"    "#197300"    "#197300"    "#197300"   
#  [6] "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"    "#5ed155"   
# [11] "#5ed155"    "#660000"    "#660000"    "#660000"    "#cc0000"   
# [16] "#cc0000"    "#cc0000"    "#ff9933"    "#ff9933"    "mediumblue"
# [21] "mediumblue" "mediumblue" "gold"       "gold"
Figure 16: Sorted by Synapse type

tmp <- as.numeric(table(fchannel))
cmat <- cor(fs)
corrplot(cmat[ford,ford],method="color",tl.col=ccol[ford])
corrRect(tmp,col=Syncol,lwd=4)
Figure 17: Correlation on log ยบ scaled data, reordered by synapse type.

3 Exploring pair-wise relationships

set.seed(2^13)
s1 <- sample(1e6,1e4)
dm <- as.matrix(f01[s1,])

c1 <- combn(24,2)
d2 <- foreach(i = 1:dim(c1)[2],.combine=rbind)%do%{
    colset1 <- as.matrix(dm[,c1[,i]])
    colnames(colset1) <- NULL
    l <- paste(channel[c1[,i]][2],channel[c1[,i][1]],sep='~')
        
    data.frame(colset1,l,check.names=FALSE)
}

colnames(d2) <- c("x", "y", "g")

p1 <- xyplot(y ~ x | g, data=d2, 
       panel=function(x,y,...){
           panel.xyplot(x,y,...)
           fit <- lm(y ~ x)
           panel.lines(x,fitted(fit), col.line='red', lwd=2)
       })

print(p1)
p2 <- xyplot(y ~ x | g, data=d2, type=c('p', 'smooth'), col.line='red',lwd=3,pch='.',scales = list(y = list(relation = "free"),x = list(relation = "free")))

pdf('~/Desktop/tmp1.pdf', height=40,width=60) 
print(p1) 
dev.off()

tmp <- as.matrix(f01[s1,])
plot(Synap_2F0 ~ Synap_1F0, data=f01)
chull(tmp)
polygon(tmp[chull(tmp),])

bagplot(tmp[,c(1,2)])