knitr::opts_chunk$set(cache=TRUE, autodep=TRUE)
dep_auto() # figure out dependencies automatically
## Warning in parse_objects(paths[1L]): file hkmeans_cache/html/__objects not
## found
## Warning in parse_objects(paths[2L]): file hkmeans_cache/html/__globals not
## found
opts_chunk$set(cache=FALSE,echo=TRUE,warning=FALSE,message=FALSE,comment="#",
fig.path='../Figures/hkmeans_figure/', dpi=100,dev=c('png','pdf'))
opts_knit$set(aliases=c(h='fig.height', w='fig.width', cap='fig.cap', scap='fig.scap'))
opts_knit$set(eval.after = c('fig.cap','fig.scap'))
knit_hooks$set(document = function(x) {
gsub('(\\\\end\\{knitrout\\}[\n]+)', '\\1\\\\noindent ', x) })
#opts_knit$set(animation.fun = hook_scianimator)
knit_hooks$set(plot = function(x, options) {
paste('<figure><img src="',
opts_knit$get('base.url'), paste(x, collapse = '.'),
'"><figcaption>', options$fig.cap, '</figcaption></figure>',
sep = '')
})
fn = local({
i = 0
function(x) {
i <<- i + 1
paste('Figure ', i, ': ', x, sep = '')
}
})
fig <- local({
i <- 0
ref <- list()
list(
cap=function(refName, text) {
i <<- i + 1
ref[[refName]] <<- i
paste("<b>Figure ", i, ": ", text, "</b><br><br>", sep="")
},
ref=function(refName) {
ref[[refName]]
})
})
Homepage
The formatted source code for this file is here.
And a raw version here.
Previous work by Youngser Park can be found here.
On Fri, Dec 11, 2015 at 11:53 AM, joshua vogelstein jovo@jhu.edu wrote:
we will get n=10^6 points, each in d=25 dimensions.
i want to hierarchically cluster them, in a ways:
The corresponds to 24 channels x 6 features per synapse, ordered like c0f0,c0f1,c0f2,c0f3,c0f4,c0f5,c1f0,c1f1โฆ etc
f0 = integrated brightness
f1 = local brightness
f2 = distance to Center of Mass
f3 = moment of inertia around synapsin maxima
f4,f5 are features that I forget what they are.. would need to ask brad.
i would throw them out, I did so in my kohonen code (which you have, its in matlab).
and
On Feb 8, 2016, at 2:00 PM, Kristina Micheva kmicheva@stanford.edu wrote:
and
On March 10, 2016, 00:29:04 (UTC), Kristina Micheva kmicheva@stanford.edu wrote:
There are 2 different Synap channels (2 different antibodies were used), so that part is fine. And 2 different VGluT1 channels (same antibody but done at different times) The NOS channel is the same, so count it as one even though it appears twice. It is listed two times because it can be found at both excitatory and inhibitory synapses. This is where your count of 25 comes, even though there are 24 channels. I would also add the 2 Synap channels to the Inhibitory presynaptic category - there is supposed to be synapsin there, but at lower levels compared to excitatory presynaptic category.
227
in the kohenen.m
file which can be found in the dropbox folder.Synap
and Synap
have been augmented to Synap_1
and Synap_2
for clarity.VGlut1
and VGlut1
have been augmented to VGlut1_t1
and VGlut1_t2
to distinguish between the different times (which are unknown).feat <- fread("../Data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE)
dim(feat)
# [1] 1119299 144
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
'psd','glur2','nmdar1','nr2b','gad','VGAT',
'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
'NOS','TH','VACht','Synapo','tubuli','DAPI')
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
'in.pre','in.post','in.post','in.post','in.pre.small','other',
'ex.post','other','other','ex.post','none','none')
nchannel <- length(channel)
nfeat <- ncol(feat) / nchannel
fchannel <- as.numeric(factor(channel.type,
levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
))
ford <- order(fchannel)
#ccol <- rainbow(max(fchannel))[fchannel]
Syncol <- c("#197300","#5ed155","#660000","#cc0000","#ff9933","mediumblue","gold")
ccol <- Syncol[fchannel]
fname <- as.vector(sapply(channel,function(x) paste0(x,paste0("F",0:5))))
names(feat) <- fname
fcol <- rep(ccol, each=6)
#mycol <- colorpanel(100,"blue","grey","red")
#mycol <- redgreen(100)
#mycol <- colorpanel(100, "purple", "black", "blue")
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)
Each of the channels has an arbitrary independent linear transformation and thus converting to z-scores or quantiles is required.
Selecting only the f0
features, we will use both the raw data and a transformed version, transforming by adding 1 and taking the log base 10 for each column and scaling.
f0 <- seq(1,ncol(feat),by=nfeat)
featF0 <- subset(feat, select=f0)
#featF0s <- scale(featF0, scale=TRUE, center=TRUE)
#flog <- apply(X=(featF0+1),2,log10)
#f01 <- data.table(apply(X=featF0, 2, function(x){((x-min(x))/(max(x)-min(x)))}))
#qs <- apply(f01, 2, quantile, probs=c(.025,.5,.75,.99))
### Tunrcating to the inner 98%
tmpG <- featF0[,lapply(.SD,function(x){x >= quantile(x,prob=.01)})]
tmpL <- featF0[,lapply(.SD,function(x){x <= quantile(x,prob=1 - 0.01)})]
ind <- apply(tmpG,1,all)
ind2 <- apply(tmpL,1,all)
fil98F0 <- featF0[ind & ind2,]
### taking log_10
flog <- log10(fil98F0)
fs <- scale(flog, center=TRUE, scale=TRUE)
### scaling between [0,1]
f01 <- data.table(apply(X=flog, 2, function(x){((x-min(x))/(max(x)-min(x)))}))
The data
df <- melt(as.matrix(flog))
names(df) <- c("ind","channel","value")
ts <- 22
gg1 <- ggplot(df, aes(x=value)) +
scale_color_manual(values=ccol) +
scale_fill_manual(values=ccol) +
geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=100) +
geom_density(aes(group=channel, color=channel),size=1.5) +
facet_wrap(~ channel, scale='fixed') +
theme(plot.title=element_text(size=ts),
axis.title.x=element_text(size=ts),
axis.title.y=element_text(size=ts),
legend.title=element_text(size=ts),
legend.text=element_text(size=ts-2),
axis.text=element_text(size=ts-2),
strip.text=element_text(size=ts))+
ggtitle("Kernel Density Estimates of f01 transformed data ")
print(gg1)
df <- melt(as.matrix(f01))
names(df) <- c("ind","channel","value")
ts <- 22
gg2 <- ggplot(df, aes(x=value)) +
scale_color_manual(values=ccol) +
scale_fill_manual(values=ccol) +
geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=21) +
geom_density(aes(group=channel, color=channel),size=1.5) +
facet_wrap(~ channel, scale='fixed') +
theme(plot.title=element_text(size=ts),
axis.title.x=element_text(size=ts),
axis.title.y=element_text(size=ts),
legend.title=element_text(size=ts),
legend.text=element_text(size=ts-2),
axis.text=element_text(size=ts-2),
strip.text=element_text(size=ts))+
ggtitle("Kernel Density Estimates of f01 transformed data ")
print(gg2)
df <- melt(as.matrix(flog))
names(df) <- c("ind","channel","value")
gg3 <- ggplot(df, aes(x=value)) +
scale_color_manual(values=ccol) +
geom_density(aes(group=channel, colour=channel))+
facet_wrap( ~ channel, scale='free')+
ggtitle("Kernel Density Estimates on log transformed data.")
print(gg3)
df <- melt(as.matrix(fs))
names(df) <- c("ind","channel","value")
ggplot(df, aes(x=value)) +
scale_color_manual(values=ccol) +
geom_density(aes(group=channel, colour=channel))
** Note that a seed is being set for the random initialization of K-means. **
K1 <- 12 ## Set the upperbound for k-means.
## Run kmeans on the untransformed data
kvecF0 <- foreach(i = 1:K1) %dopar% {
set.seed(2^13 - 1)
kmeans(featF0,centers=i)
}
## Run kmeans on the
## log \circ scale transformed data
kvecfs <- foreach(i = 1:K1) %dopar% {
set.seed(2^13 - 1)
kmeans(fs,centers=i)
}
Here we calculate the bic
from a user defined function and pick the maximum.
bicF0 <- kbic(kvecF0)
bicfs1 <- kbic(kvecfs)
mxF0 <- t(c(which(bicF0 == max(bicF0)), max(bicF0)))
mxfs1 <- t(c(which(bicfs1 == max(bicfs1)), max(bicfs1)))
par(mfrow=c(1,2))
plot(bicF0, type='b', main="BIC on raw data.")
points(mxF0[,1], mxF0[,2], col='red', pch=20)
plot(bicfs1, type='b', main="BIC on scaled data.")
points(mxfs1[,1], mxfs1[,2], col='red', pch=20)
For the following we manualy choose 2 clusters.
## Formatting data for heatmap
feat2 <- aggregate(featF0,by=list(lab=kvecF0[[2]]$cluster),FUN=mean)
feat2 <- as.matrix(feat2[,-1])
rownames(feat2) <- clusterFraction(kvecF0[[2]])
ford <- order(fchannel)
heatmap.2(as.matrix(feat2), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #
# [1] "gold" "#197300" "#cc0000" "mediumblue" "#ff9933"
# [6] "#660000" "#cc0000" "#5ed155" "#5ed155" "#5ed155"
# [11] "#660000" "#5ed155" "#cc0000" "#ff9933" "mediumblue"
# [16] "#660000" "mediumblue" "gold" "#5ed155" "#5ed155"
# [21] "#197300" "#197300" "#197300" "#197300"
heatmap.2(as.matrix(feat2[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90)
# [1] "#197300" "#197300" "#197300" "#197300" "#197300"
# [6] "#5ed155" "#5ed155" "#5ed155" "#5ed155" "#5ed155"
# [11] "#5ed155" "#660000" "#660000" "#660000" "#cc0000"
# [16] "#cc0000" "#cc0000" "#ff9933" "#ff9933" "mediumblue"
# [21] "mediumblue" "mediumblue" "gold" "gold"
## Formatting data for heatmap
feat2fs <- aggregate(fs,by=list(lab=kvecfs[[2]]$cluster),FUN=mean)
feat2fs <- as.matrix(feat2fs[,-1])
rownames(feat2fs) <- clusterFraction(kvecfs[[2]])
ford <- order(fchannel)
heatmap.2(as.matrix(feat2fs), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #
# [1] "#197300" "#197300" "#197300" "#5ed155" "#cc0000"
# [6] "#197300" "#5ed155" "#5ed155" "#5ed155" "mediumblue"
# [11] "mediumblue" "mediumblue" "gold" "#ff9933" "#cc0000"
# [16] "#ff9933" "#197300" "#5ed155" "#5ed155" "#cc0000"
# [21] "#660000" "gold" "#660000" "#660000"
heatmap.2(as.matrix(feat2fs[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90)
# [1] "#197300" "#197300" "#197300" "#197300" "#197300"
# [6] "#5ed155" "#5ed155" "#5ed155" "#5ed155" "#5ed155"
# [11] "#5ed155" "#660000" "#660000" "#660000" "#cc0000"
# [16] "#cc0000" "#cc0000" "#ff9933" "#ff9933" "mediumblue"
# [21] "mediumblue" "mediumblue" "gold" "gold"
synOnly<-channel.type!= "none" & channel.type!="other"
fs11 <- fs[kvecfs[[2]]$cluster==1,][,synOnly]
fs12 <- fs[kvecfs[[2]]$cluster==2,][,synOnly]
dim(fs11)
# [1] 423759 19
dim(fs12)
# [1] 379427 19
## Run kmeans on the untransformed data
kvecL21synOnly <- foreach(i = 1:K1) %dopar% {
set.seed(2^13 - 1)
kmeans(fs11,centers=i)
}
The pamk
function performs something similar to K-means.
pamout <- pamk(fs, krange=1:K1,usepam=FALSE,critout=FALSE)
labs <- pamout$pamobject$clustering
plot(pamout$crit,type="b")
feat5 <- aggregate(fs,by=list(lab=pamout$pamo$clustering),FUN=mean)
feat5 <- as.matrix(feat5[,-1])
rownames(feat5) <- clusterFraction(labs)
### Reordering based on synapse type.
ford <- order(fchannel)
heatmap.2(as.matrix(feat5), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #
# [1] "mediumblue" "mediumblue" "gold" "#cc0000" "#ff9933"
# [6] "#ff9933" "#660000" "#660000" "#660000" "#cc0000"
# [11] "#5ed155" "gold" "#5ed155" "#5ed155" "#5ed155"
# [16] "#5ed155" "mediumblue" "#197300" "#cc0000" "#5ed155"
# [21] "#197300" "#197300" "#197300" "#197300"
heatmap.2(as.matrix(feat5[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90)
# [1] "#197300" "#197300" "#197300" "#197300" "#197300"
# [6] "#5ed155" "#5ed155" "#5ed155" "#5ed155" "#5ed155"
# [11] "#5ed155" "#660000" "#660000" "#660000" "#cc0000"
# [16] "#cc0000" "#cc0000" "#ff9933" "#ff9933" "mediumblue"
# [21] "mediumblue" "mediumblue" "gold" "gold"
We begin by splitting the data into the two clusters as calculated above.
fs1 <- fs[pamout$pamobject$clustering==1,]
fs2 <- fs[pamout$pamobject$clustering==2,]
pamout21 <- pamk(fs1, krange=1:K1,usepam=FALSE,critout=FALSE)
labs21 <- pamout21$pamobject$clustering
plot(pamout21$crit,type="b")
agg21 <- aggregate(fs1,by=list(lab=pamout21$pamo$clustering),FUN=mean)
agg21 <- as.matrix(agg21[,-1])
cs <- paste(paste0("C", 1:pamout21$nc),
foreach(i =1:pamout21$nc,.combine=c)%do%{(sprintf("%.2g",sum(labs21==i)/length(labs21)))})
rownames(agg21) <- cs
heatmap.2(as.matrix(agg21), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #
# [1] "#197300" "#197300" "#5ed155" "#cc0000" "#197300"
# [6] "#197300" "mediumblue" "mediumblue" "#ff9933" "#5ed155"
# [11] "mediumblue" "#197300" "#5ed155" "gold" "#ff9933"
# [16] "#5ed155" "#5ed155" "#660000" "#660000" "#660000"
# [21] "#cc0000" "#cc0000" "#5ed155" "gold"
heatmap.2(as.matrix(agg21[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90)
# [1] "#197300" "#197300" "#197300" "#197300" "#197300"
# [6] "#5ed155" "#5ed155" "#5ed155" "#5ed155" "#5ed155"
# [11] "#5ed155" "#660000" "#660000" "#660000" "#cc0000"
# [16] "#cc0000" "#cc0000" "#ff9933" "#ff9933" "mediumblue"
# [21] "mediumblue" "mediumblue" "gold" "gold"
pamout22 <- pamk(fs2, krange=1:K1,usepam=FALSE,critout=FALSE)
labs22 <- pamout22$pamobject$clustering
plot(pamout22$crit,type="b")
agg22 <- aggregate(fs2,by=list(lab=pamout22$pamo$clustering),FUN=mean)
agg22 <- as.matrix(agg22[,-1])
cs <- paste(paste0("C", 1:pamout22$nc),
foreach(i =1:pamout22$nc,.combine=c)%do%{(sprintf("%.2g",sum(labs22==i)/length(labs22)))})
rownames(agg22) <- cs
heatmap.2(as.matrix(agg22), trace="none",col=rev(mycol),colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) #
# [1] "mediumblue" "mediumblue" "#660000" "#660000" "#197300"
# [6] "#197300" "#660000" "#cc0000" "gold" "mediumblue"
# [11] "#197300" "#5ed155" "#197300" "#197300" "#5ed155"
# [16] "#5ed155" "#5ed155" "#5ed155" "#cc0000" "#cc0000"
# [21] "gold" "#ff9933" "#5ed155" "#ff9933"
heatmap.2(as.matrix(agg22[,ford]),dendrogram='row',Colv=NA,trace="none", col=rev(mycol),colCol=ccol[ford],cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90)
# [1] "#197300" "#197300" "#197300" "#197300" "#197300"
# [6] "#5ed155" "#5ed155" "#5ed155" "#5ed155" "#5ed155"
# [11] "#5ed155" "#660000" "#660000" "#660000" "#cc0000"
# [16] "#cc0000" "#cc0000" "#ff9933" "#ff9933" "mediumblue"
# [21] "mediumblue" "mediumblue" "gold" "gold"
tmp <- as.numeric(table(fchannel))
cmat <- cor(fs)
corrplot(cmat[ford,ford],method="color",tl.col=ccol[ford])
corrRect(tmp,col=Syncol,lwd=4)
set.seed(2^13)
s1 <- sample(1e6,1e4)
dm <- as.matrix(f01[s1,])
c1 <- combn(24,2)
d2 <- foreach(i = 1:dim(c1)[2],.combine=rbind)%do%{
colset1 <- as.matrix(dm[,c1[,i]])
colnames(colset1) <- NULL
l <- paste(channel[c1[,i]][2],channel[c1[,i][1]],sep='~')
data.frame(colset1,l,check.names=FALSE)
}
colnames(d2) <- c("x", "y", "g")
p1 <- xyplot(y ~ x | g, data=d2,
panel=function(x,y,...){
panel.xyplot(x,y,...)
fit <- lm(y ~ x)
panel.lines(x,fitted(fit), col.line='red', lwd=2)
})
print(p1)
p2 <- xyplot(y ~ x | g, data=d2, type=c('p', 'smooth'), col.line='red',lwd=3,pch='.',scales = list(y = list(relation = "free"),x = list(relation = "free")))
pdf('~/Desktop/tmp1.pdf', height=40,width=60)
print(p1)
dev.off()
tmp <- as.matrix(f01[s1,])
plot(Synap_2F0 ~ Synap_1F0, data=f01)
chull(tmp)
polygon(tmp[chull(tmp),])
bagplot(tmp[,c(1,2)])