The formatted source code for this file is here, and a raw version is here.
Previous work by Youngser Park can be found here.
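The code below uses functions from several packages whose library() calls are not shown in this excerpt; presumably they are loaded earlier in the original document. Here is a minimal sketch of the setup, inferred from the functions called rather than taken from the source:
## ASSUMPTION: package setup inferred from the code below, not from the original source.
require(data.table)    # fread, data.table
require(gplots)        # colorpanel
require(colorRamps)    # matlab.like
require(corrplot)      # corrplot, corrRect
require(energy)        # dcor.ttest
require(foreach)       # foreach, %:%, %dopar%
require(doParallel); registerDoParallel()   # parallel backend for %dopar%
require(MASS)          # lda
require(ggplot2)       # ggplot
require(deldir)        # deldir
require(rgl)           # plot3d, rgl.texts, rglwidget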
Here we read in the data and select a random half of it for exploration.
featFull <- fread("../data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE)
### Setting a seed and creating an index vector
### to select half of the data
set.seed(2^10)
half1 <- sample(dim(featFull)[1],dim(featFull)[1]/2)
half2 <- setdiff(1:dim(featFull)[1],half1)
feat <- featFull[half1,]
dim(feat)
# [1] 559649 144
## Setting the channel names
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
'psd','glur2','nmdar1','nr2b','gad','VGAT',
'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
'NOS','TH','VACht','Synapo','tubuli','DAPI')
## Setting the channel types
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
'in.pre','in.post','in.post','in.post','in.pre.small','other',
'ex.post','other','other','ex.post','none','none')
channel.type2 <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','other',
'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
'in.pre','in.post','in.post','in.post','other','other',
'ex.post','other','other','ex.post','other','other')
nchannel <- length(channel)
nfeat <- ncol(feat) / nchannel
## Creating factor variables for channel and channel type, sorted properly
ffchannel <- (factor(channel.type,
levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
))
fchannel <- as.numeric(factor(channel.type,
levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
))
ford <- order(fchannel)
## Setting up colors for channel types
Syncol <- c("#197300","#5ed155","#660000","#cc0000","#ff9933","#0000cd","#ffd700")
Syncol3 <- c("#197300","#197300","#cc0000","#cc0000","#0000cd","#0000cd","#0000cd")
ccol <- Syncol[fchannel]
ccol3 <- Syncol3[fchannel]
exType <- factor(c(rep("ex",11),rep("in",6),rep("other",7)),ordered=TRUE)
exCol<-exType;levels(exCol) <- c("#197300","#990000","#0000cd");
exCol <- as.character(exCol)
fname <- as.vector(sapply(channel,function(x) paste0(x,paste0("F",0:5))))
names(feat) <- fname
fcol <- rep(ccol, each=6)
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)
mycol3 <- colorpanel(100, "blue","white","red")
## Split the 144 columns into 6 feature sets, one per feature F0-F5,
## each with one column per channel
f <- lapply(1:6, function(x){ seq(x, ncol(feat), by=nfeat) })
featF <- lapply(f, function(x){ subset(feat, select=x) })
featF0 <- featF[[1]]
## Linearly rescale each column of each feature set to [0, 1000]
f01e3 <- lapply(featF, function(x){
  1e3*data.table(apply(X=x, 2, function(y){ (y-min(y))/(max(y)-min(y)) }))
})
fs <- f01e3[[1]]
### Taking log10 of (the data + 1)
log1f <- log10(featF0 + 1)
slog1f <- data.table(scale(log1f, center=TRUE,scale=TRUE))
We now have the following data sets:

featF0
: The feature vector using only the integrated brightness features (F0).

fs
: The feature vector rescaled to \([0, 1000]\).

log1f
: The feature vector, plus one, with \(\log_{10}\) applied.

slog1f
: The feature vector, plus one, \(\log_{10}\) applied, then scaled by subtracting the mean and dividing by the sample standard deviation.

#corrf <- lapply(lapply(f01e3, cor), function(x) x[ford, ford])
corrf <- lapply(lapply(featF, cor), function(x) x[ford, ford])
titles <- paste('Correlation Matrix of', paste0("F", 0:5))
par(mfrow = c(3,2))
for(i in 1:length(corrf)) {
  corrplot(corrf[[i]], method="color", tl.col=ccol[ford],
           tl.cex=0.8, mar=c(1,0,1,1.5))
  title(titles[i])
}
bford <- order(rep(fchannel,each=6))
nord <- Reduce('c', f)
cr <- rep(ccol, each=6)
corrfB <- cor(feat)[bford,bford]
corrplot(corrfB,method="color",tl.col=cr[bford],tl.cex=0.75)
## Distance-correlation t-tests (energy::dcor.ttest) between all pairs of the
## 24 channels, computed on a random subsample of at most 1000 rows.
computeDcor <- function(x) {
  set.seed(317)
  sam1 <- sample(dim(x)[1], min(1e3, dim(x)[1]))
  tmp <- as.data.frame(x[sam1,])
  combcols <- t(combn(24, 2))          # all unordered channel pairs
  dc <- foreach(i = 1:dim(combcols)[1]) %dopar% {
    set.seed(331*i)
    dcor.ttest(x=tmp[,combcols[i,1]], y=tmp[,combcols[i,2]])
  }
  ## Fill symmetric matrices of test statistics and p-values
  ms <- matrix(0, 24, 24)
  mp <- matrix(0, 24, 24)
  for(i in 1:length(dc)){
    ms[combcols[i,1],combcols[i,2]] <- dc[[i]]$statistic
    ms[combcols[i,2],combcols[i,1]] <- dc[[i]]$statistic
    mp[combcols[i,1],combcols[i,2]] <- dc[[i]]$p.val
    mp[combcols[i,2],combcols[i,1]] <- dc[[i]]$p.val
  }
  rownames(ms) <- colnames(ms) <- colnames(x)
  rownames(mp) <- colnames(mp) <- colnames(x)
  diag(ms) <- 0
  diag(mp) <- 1
  return(list(ms, mp))
}
mdcor <- lapply(featF, computeDcor)
cl5 <- colorRampPalette(c("white", "blue"))
gr5 <- colorRampPalette(c("darkgreen", "white", "white"))
bl5 <- colorRampPalette(c("blue", "red"))
sTitle <- paste("dcor.ttest statistic", paste0("F", 0:5))
pTitle <- paste("dcor.ttest log10(p-value)", paste0("F", 0:5))
par(mfcol=c(6,2), oma=2*c(1,1,1,1))
for(i in 1:length(mdcor)){
  corrplot(mdcor[[i]][[1]][ford,ford], is.corr=FALSE, method="color",
           tl.col=ccol[ford], tl.cex=0.8, mar=3*c(1,0,1,1.5))
  corrRect(as.numeric(table(fchannel)), col=Syncol, lwd=4)
  title(sTitle[i])
}
for(i in 1:length(mdcor)){
  corrplot(log10(mdcor[[i]][[2]][ford,ford] + .Machine$double.eps),
           is.corr=FALSE, method="color", tl.col=ccol[ford],
           tl.cex=0.8, mar=3*c(1,0,1,1.5),
           p.mat=mdcor[[i]][[2]][ford,ford],
           sig.level=0.01, pch='x', pch.cex=0.5)
  corrRect(as.numeric(table(fchannel)), col=Syncol, lwd=4)
  title(pTitle[i])
}
The X's in the right-hand figure above denote a p-value greater than 0.01.
We will run PCA on the untransformed correlation matrix so that the data can be viewed in two dimensions. The colors correspond to synapse type.
pcaL <- lapply(corrf, prcomp, center=TRUE, scale=TRUE)
titlepca <- paste("PCA on ", paste0("cor(F", 0:5, ')'))
for(i in 1:length(pcaL)) {
  pairs(pcaL[[i]]$x[,1:3], col=ccol3[ford], pch=20, cex=2,
        main=titlepca[i])
}
pca <- pcaL[[1]]$x
rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s',col=ccol3[ford], size=1,
xlab = "PC1", ylab = "PC2", zlab = "PC3")
rgl::rgl.texts(pca[,1],pca[,2],pca[,3],abbreviate(rownames(pca)), col=ccol3[ford], adj=c(0,2))
title3d(main = "PCA on cor(F0)")
subid <- currentSubscene3d()
rglwidget(elementId="rgl-pca1",width=720,height=720)
pcaB <- prcomp(corrfB,center=TRUE, scale=TRUE)
pairs(pcaB$x[,1:3], col=cr[bford],pch=20, cex=2)
plot(pcaB$x[,1:3], col=cr[bford],pch=20, cex=2)
text(pcaB$x[,1:2], labels=rownames(pcaB$x), pos=4, col=cr[bford])
pcaB <- pcaB$x
rgl::plot3d(pcaB[,1],pcaB[,2],pcaB[,3],type='s',col=cr[bford], size=1)
rgl::rgl.texts(pcaB[,1],pcaB[,2],pcaB[,3],abbreviate(rownames(pcaB)), col=cr[bford], adj=c(0,2))
subid <- currentSubscene3d()
rglwidget(elementId="rgl-pcaB",width=720,height=720)
First we will take a look at the scree plots for each of the primitives, along with their corresponding elbows as given by Zhu and Ghodsi (2006).
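The elbow list el2 used in the chunks below is not defined in this excerpt; presumably it is computed earlier in the original document. As a placeholder, here is a minimal sketch of a Zhu and Ghodsi (2006) profile-likelihood elbow finder applied to each PCA; the helper name getElbows and the choice of three elbows are assumptions, not taken from the source.
## ASSUMPTION: sketch of a Zhu & Ghodsi profile-likelihood elbow finder,
## standing in for the (unshown) definition of el2.
getElbows <- function(sdev, n = 3) {
  d <- sort(sdev, decreasing = TRUE)
  profileLL <- function(dat) {
    ## profile log-likelihood of splitting the sorted values after position q
    sapply(1:(length(dat) - 1), function(q) {
      mu1 <- mean(dat[1:q]); mu2 <- mean(dat[-(1:q)])
      s2 <- (sum((dat[1:q] - mu1)^2) + sum((dat[-(1:q)] - mu2)^2)) / (length(dat) - 2)
      sum(dnorm(dat[1:q], mu1, sqrt(s2), log = TRUE)) +
        sum(dnorm(dat[-(1:q)], mu2, sqrt(s2), log = TRUE))
    })
  }
  elbows <- integer(0); offset <- 0; dd <- d
  for (k in 1:n) {
    if (length(dd) < 3) break
    q <- which.max(profileLL(dd))
    elbows <- c(elbows, offset + q)
    offset <- offset + q
    dd <- dd[-(1:q)]
  }
  elbows
}
el2 <- lapply(pcaL, function(p) getElbows(p$sdev, n = 3))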
par(mfrow = c(2,3))
for(i in 1:length(pcaL)){
  zs <- rep(1, dim(pcaL[[i]]$x)[2])
  zb <- rep(0, dim(pcaL[[i]]$x)[2])
  zs[el2[[i]]] <- 2
  zb[el2[[i]]] <- 2
  plot(100 * cumsum(pcaL[[i]]$sdev) / sum(pcaL[[i]]$sdev),
       axes = TRUE,
       type = 'b', pch = 21,
       bg = zb, col = zs,
       xlab = 'd', ylab = '% Var',
       main = paste0('% Var Explained for F', i - 1))
  axis(side = 1, at = el2[[i]], padj = 1.25)
}
dat <- lapply(pcaL, function(x) data.frame(x$x))
type <- truth <- factor(exType, ordered = FALSE)
tr <- as.numeric(truth)
lda.fit <- list()
lda.fit[[1]] <- lda(type ~ ., data = dat[[1]][,1:el2[[1]][2]])
lda.fit[[2]] <- lda(type ~ ., data = dat[[2]][,1:el2[[2]][2]])
lda.fit[[3]] <- lda(type ~ ., data = dat[[3]][,1:el2[[3]][2]])
lda.fit[[4]] <- lda(type ~ ., data = dat[[4]][,1:el2[[4]][2]])
lda.fit[[5]] <- lda(type ~ ., data = dat[[5]][,1:el2[[5]][2]])
lda.fit[[6]] <- lda(type ~ ., data = dat[[6]][,1:el2[[6]][1]])
lda.pred <- lapply(lda.fit, predict)
Lm <-
  foreach(i = 1:6, .combine=rbind) %:%
  foreach(j = 1:24, .combine=rbind) %do% {
    out <- NULL
    try({
      set.seed(12)
      ldapred <- as.numeric(predict(lda(type ~ ., dat[[i]][1:j]))$class)
      er <- 1/24 * sum(ldapred != as.numeric(truth))
      out <- data.frame(Lhat = er, d = j, feat = as.factor(i - 1))
    }, silent=TRUE)
    out
  }
rownames(Lm) <- NULL
Lm <- data.table(Lm)
Lcvm <-
  foreach(i = 1:6, .combine=rbind) %:%
  foreach(j = 1:24, .combine=rbind) %do% {
    out <- NULL
    try({
      set.seed(12)
      ldapred <- as.numeric(lda(type ~ ., dat[[i]][1:j], CV = TRUE)$class)
      er <- 1/24 * sum(ldapred != as.numeric(truth))
      out <- data.frame(Lhat = er, d = j, feat = as.factor(i - 1))
    }, silent=TRUE)
    out
  }
rownames(Lcvm) <- NULL
Lcvm <- data.table(Lcvm)
Lm$CV <- "Without CV"
Lcvm$CV <- "With CV"
tmp <- data.table(rbind(Lm, Lcvm))
tmp$CV <- factor(tmp$CV)
tmp$feat <- factor(tmp$feat)
size <- foreach(i = 1:6, .combine=c) %do% {
  size <- rep(1, 23)
  size[el2[[i]]] <- 2
  size
}
tmp$size <- factor(c(size,size))
levels(tmp$size) <- c("non-elbow", "elbow")
xvalVr <-
  ggplot(tmp, aes(x = d, y = Lhat, color = feat)) +
  facet_grid(. ~ CV) +
  geom_line(alpha = 0.5) +
  geom_point(alpha = 0.65, aes(size = size)) +
  geom_hline(yintercept = 0.64236) +
  ggtitle("FLD (LDA): xval vs. re-substitution")
print(xvalVr)
L <-
  foreach(i = 1:6) %:%
  foreach(j = 1:24, .combine = cbind) %do% {
    out <- NULL
    try({
      set.seed(12)
      ldapredcv <- as.numeric(lda(type ~ ., dat[[i]][1:j], CV = TRUE)$class)
      out <- ldapredcv
    }, silent=TRUE)
    out
  }
A <- lapply(L, function(x) { apply(x, 2, function(y) y != tr) })
B <- lapply(A, function(x) { x[,1:10] })
B <- lapply(B, function(x) {
  rownames(x) <- channel[ford]
  colnames(x) <- paste0('d', 1:dim(x)[2])
  return(x)
})
par(mfrow = c(2,3))
corT <- paste0("Misclassifications ", "F", 0:5)
for(i in 1:6) {
  corrplot(B[[i]], method='color', addgrid.col=1,
           tl.col=ccol[ford],
           mar=3*c(1,0,1,1.5))
  title(corT[i])
}
The above plots show where LDA with cross-validation misclassifies each data point. A misclassification is denoted by a filled block, and the columns denote the embedding dimension \(\hat d\).
titlesvor <- paste("LDA decision boundaries for", paste0("F", 0:5))
## Extract the LDA group means (the third component of an lda object)
voronoidf <- lapply(lapply(lda.fit, '[[', 3), data.frame)
#voronoidf <- data.frame(x=lda.fit$means[,1],y=lda.fit$means[,2])
## deldir() below draws the Voronoi tessellation of the class means
par(mfrow = c(3,2))
for(i in 1:length(dat)){
  plot(dat[[i]][,1:2], col=ccol3[ford], pch=20, cex=1.5)
  title(titlesvor[i])
  text(dat[[i]][,1:2], labels=rownames(dat[[i]]),
       pos=ifelse(dat[[i]][,1] < max(dat[[i]][,1]) - 0.5, 4, 2),
       col=ccol3[ford], cex=1.2)
  deldir(x=voronoidf[[i]][,1], y=voronoidf[[i]][,2], rw=c(-15,15,-15,15),
         plotit=TRUE, add=TRUE, wl='te')
  text(voronoidf[[i]], labels=rownames(voronoidf[[i]]), cex=1.5, pos=1)
}
Recall that these are two-dimensional visualizations of a 24-dimensional structure.
pred <- lapply(lapply(lda.fit, predict), '[[', 1)
tabN <- paste0("Table for F", 0:5)
for(i in 1:length(pred)) {
  print(tabN[i])
  print(table(truth, pred = pred[[i]]))
}
# [1] "Table for F0"
# pred
# truth ex in other
# ex 11 0 0
# in 0 6 0
# other 0 0 7
# [1] "Table for F1"
# pred
# truth ex in other
# ex 11 0 0
# in 0 6 0
# other 0 0 7
# [1] "Table for F2"
# pred
# truth ex in other
# ex 11 0 0
# in 0 6 0
# other 0 0 7
# [1] "Table for F3"
# pred
# truth ex in other
# ex 11 0 0
# in 0 6 0
# other 0 0 7
# [1] "Table for F4"
# pred
# truth ex in other
# ex 11 0 0
# in 0 6 0
# other 0 0 7
# [1] "Table for F5"
# pred
# truth ex in other
# ex 11 0 0
# in 0 6 0
# other 0 0 7
lda.err <- list()
for(i in 1:length(pred)) {
  pr <- as.numeric(pred[[i]])
  lda.err[[i]] <- sum(tr != pr) / length(tr)
}
Reduce('c', lda.err)
# [1] 0 0 0 0 0 0
The above gives the LDA re-substitution error rates for each of the feature sets F0 through F5.