We start with observations of 793 synapses with 12 measurments formatted as a 793x12 matrix. The matrix is Z-scored column-wise and plotted in a pairs plot colored according to the labels given by the field expert.
dat <- read.csv("rorb_gaussianAvg_at.csv")
loc <- read.csv("rorb_gaussianAvg_at_orderLocations.csv")
gabaID <- read.csv("rorb_gaba.csv")
shaftID <- read.csv("rorb_shaft.csv")
TdID <- read.csv("rorb_TdTom.csv")
TdTom <- TdID$TdTom
TdTom[TdTom %in% 3] <- 2 ## ASK ABOUT THIS
shaft <- shaftID$shaft
gaba <- truth <- gabaID$gaba
ccol <- c('blue', 'blue', 'blue', 'red', 'red', 'red', 'green', 'black', 'green', 'green', 'black', 'green')
ind <- order(ccol)
ccol <- sort(ccol)
dat <- dat[,ind]
sdat <- as.data.frame(scale(dat, center = TRUE, scale = TRUE))
stackMraw(sdat, as.factor(gaba), ccol = ccol, centered = TRUE, depth = 2) + ggtitle("True Clusters")
pairs(sdat, col = gaba + 1, pch = 20, cex = 0.2,
main = "Pairs plot colored by true class")
repHeat(sdat); title("Representative heatmap")
We run the Z-scored data through PCA and plot the scree-plot with the elbows as given by Zhu and Ghodsi. A pairs plot is shown on the PCA data again colored by class labels.
pc1 <- princomp(sdat)
getElbows(pc1$sdev)
pairs(pc1$scores, col = gaba +1, pch = 20, cex = 0.5)
## [1] 5 7 10
Next, applying LDA and QDA iterating on the number of PCA dimension included.
ldaL <-
lapply(1:12,
function(x){
ldatmp <- lda(gaba ~ pc1$scores[, 1:x], CV = FALSE)
predict(ldatmp)
})
qdaL <-
lapply(1:12,
function(x){
qdatmp <- qda(gaba ~ pc1$scores[, 1:x], CV = FALSE)
predict(qdatmp)
})
rfL <-
lapply(1:12,
function(x){
tmpdat <- as.matrix(pc1$scores[, 1:x])
set.seed(x)
rftmp <- randomForest(x = tmpdat, y = as.factor(gaba))
predict(rftmp)
})
ll <- sapply(lapply(ldaL, '[[', 1), function(x) sum(x != gaba)/length(gaba))
ql <- sapply(lapply(qdaL, '[[', 1), function(x) sum(x != gaba)/length(gaba))
rl <- sapply(rfL, function(x) sum(x != gaba)/length(gaba))
#png(file = "~/Desktop/supRes.png", 480,480)
plot(x = 1:12, y = seq(0,0.16,length = 12), type = 'n', xlab = expression(hat(d)), ylab = "L")
points(ll, type = 'b', col = 'blue')
points(ql, type = 'b', col = 'orange')
points(rl, type = 'b', col = 'darkgreen', lty = 2, pch = 3)
abline(h = sum(gaba == 1)/length(gaba), col = 'magenta', lty=4)
text(10,max(ql), label = "qda", col = 'orange')
text(10, min(ll), label = "lda", col = 'blue', pos = 1)
text(5, min(rl), label = "rf", col = 'darkgreen', pos = 1)
text(5,sum(gaba == 1)/length(gaba), label = "chance", col = 'magenta', pos = 1)
#dev.off()
qmm <- sapply(1:12, function(x) sum((gaba - qdaL[[x]]$post[, 1])^2))
lmm <- sapply(1:12, function(x) sum((gaba - ldaL[[x]]$post[, 1])^2))
ldaSHAFT <-
lapply(1:12,
function(x){
ldatmp <- lda(shaft ~ pc1$scores[, 1:x], CV = FALSE)
predict(ldatmp)
})
qdaSHAFT <-
lapply(1:12,
function(x){
qdatmp <- qda(shaft ~ pc1$scores[, 1:x], CV = FALSE)
predict(qdatmp)
})
rfSHAFT <-
lapply(1:12,
function(x){
tmpdat <- data.frame(shaft = as.factor(shaft), pc1$scores[, 1:x])
rftmp <- randomForest(shaft ~ ., data = tmpdat)
predict(rftmp)
})
llshaft <- sapply(lapply(ldaSHAFT, '[[', 1), function(x) sum(x != shaft)/length(shaft))
qlshaft <- sapply(lapply(qdaSHAFT, '[[', 1), function(x) sum(x != shaft)/length(shaft))
rfshaft <- sapply(rfSHAFT, function(x) sum(x != shaft)/length(shaft))
#png(file = "~/Desktop/supRes.png", 480,480)
plot(x = 1:12, y = seq(0,0.8,length = 12), type = 'n', xlab = expression(hat(d)), ylab = "L")
points(llshaft, type = 'b', col = 'blue')
points(qlshaft, type = 'b', col = 'darkred')
points(rfshaft, type = 'b', col = 'darkgreen', lty = 3, pch = 3)
abline(h = 1-sum(shaft==0)/length(shaft), lty = 2, col = "magenta")
text(10,max(qlshaft), label = "qda", col = 'darkred')
text(10, min(llshaft), label = "lda", col = 'blue', pos = 1)
text(8, min(rfshaft), label = "rf", col = 'darkgreen', pos = 1)
text(4, 0.29, label = "chance", col = 'magenta', pos = 1)
#dev.off()
qmm <- sapply(1:12, function(x) sum((shaft - qdaSHAFT[[x]]$post[, 1])^2))
lmm <- sapply(1:12, function(x) sum((shaft - ldaSHAFT[[x]]$post[, 1])^2))
Data in \(M_{793 \times 12}\) is transformed using PCA with center=TRUE
and scale=TRUE
to \(D_{793 \times 12}\). Class labels, \(Y_{i\in [793]} \in {0,1}\) have been given by the field experts (quite possibly errorful) with proportions 708 and 85 out of 793 of non-gaba and gaba, respectively.
Let \(n_0, n_1\) be the numbers of points in class 0 and 1, respectively.
n0 <- 708
n1 <- 85
params <- list()
for(i in 1:12){
params[[i]] <- list()
for(j in unique(gaba)){
params[[i]][[as.character(j)]] <-
list(
means = colMeans(as.matrix(pc1$scores[gaba == j, 1:i])),
sigma = if(i == 1){
sd(pc1$scores[gaba == j, 1:i])
} else {
cov(pc1$scores[gaba == j, 1:i])
}
)
}}
mont <- 500
truth <- c(rep(0, n0), rep(1,n1))
s <- list()
for(mi in 1:mont){
set.seed(mi)
s[[mi]] <- rbind(
rmvnorm(n0, mean = params[[12]]$'0'$means, sigma = params[[12]]$'0'$sigma),
rmvnorm(n1, mean = params[[12]]$'1'$means, sigma = params[[12]]$'1'$sigma)
)
}
d1 <- foreach(j = 1:12) %:%
foreach(x = 1:mont, .combine = 'rbind') %dopar% {
qdaR <- qda(truth ~ s[[x]][, 1:j], CV = FALSE)
rcl <- as.numeric(predict(qdaR)$class) - 1
qdaD <- qda(truth ~ s[[x]][, 1:j], CV = TRUE)
(Lr <- sum(rcl != truth)/length(truth))
(Ld <- sum((as.numeric(qdaD$class) - 1) != truth)/length(truth))
#assign(paste0("Lr", j), Lr)
#assign(paste0("Ld", j), Ld)
#cbind(
# eval(as.symbol(paste0("Lr", j))),
# eval(as.symbol(paste0("Ld", j)))
# )
cbind(Lr, Ld)
}
d2 <- foreach(j = 1:12) %:%
foreach(x = 1:mont, .combine = 'rbind') %dopar% {
ldaR <- lda(truth ~ s[[x]][, 1:j], CV = FALSE)
rcl <- as.numeric(predict(ldaR)$class) - 1
ldaD <- lda(truth ~ s[[x]][, 1:j], CV = TRUE)
(Lr <- sum(rcl != truth)/length(truth))
(Ld <- sum((as.numeric(ldaD$class) - 1) != truth)/length(truth))
#assign(paste0("Lr", j), Lr)
#assign(paste0("Ld", j), Ld)
#cbind(
# eval(as.symbol(paste0("Lr", j))),
# eval(as.symbol(paste0("Ld", j)))
# )
cbind(Lr, Ld)
}
qtmp <- cbind(Reduce(rbind, d1))
qtmp1 <- as.data.frame(cbind(data.table::melt(qtmp)))
qtmp1$Var1 <- as.factor(rep(1:12, each = mont))
qtmp1$Var2 <- as.factor(qtmp1$Var2)
names(qtmp1) <- c("dim", "L", "value")
pq <- ggplot(data = qtmp1, aes(x = L, y = value, fill = dim)) +
geom_boxplot(notch = TRUE) + facet_grid(. ~ dim) + ggtitle("QDA")
ltmp <- cbind(Reduce(rbind, d2))
ltmp1 <- as.data.frame(cbind(data.table::melt(ltmp)))
ltmp1$Var1 <- as.factor(rep(1:12, each = mont))
ltmp1$Var2 <- as.factor(ltmp1$Var2)
names(ltmp1) <- c("dim", "L", "value")
pl <- ggplot(data = ltmp1, aes(x = L, y = value, fill = dim)) +
geom_boxplot(notch = TRUE) + facet_grid(. ~ dim) + ggtitle("LDA")
coord <- coord_cartesian(ylim = c(0, 0.11))
grid.arrange(pq + coord,pl +coord, nrow = 1)
dLines <-
foreach(mi = 1:mont) %:%
foreach(x = 1:12, .combine = 'rbind') %dopar% {
qdaR <- qda(truth ~ s[[mi]][, 1:x], CV = FALSE)
rcl <- as.numeric(predict(qdaR)$class) - 1
qdaD <- qda(truth ~ as.matrix(s[[x]][, 1:j]), CV = TRUE)
(Lr <- sum(rcl != truth)/length(truth))
(Ld <- sum((as.numeric(qdaD$class) - 1) != truth)/length(truth))
cbind(Lr, Ld, mi, x)
}
lineDat <- Reduce('rbind', dLines)
pLines <- ggplot(as.data.frame(lineDat), aes(x = x, group = mi)) + geom_line(aes(y = Lr, color = mi))
show(pLines)
grid.arrange(p1sig, lp1sig, nrow = 2)
mu1 <- 1/2
ncDim <- 25
montc <- 500
nc <- 100
nc0 <- nc1 <- nc/2
truC <- c(rep(0,nc0), rep(1, nc1))
sc <- list()
for(mi in 1:montc){
tmp1 <- c(rnorm(nc0, mean = mu1, sd = 1),
rnorm(nc1, mean =-mu1, sd = 1))
tmp2 <- sapply(1:(ncDim-1), function(x) rcauchy(nc, location = 0, scale = 1))
sc[[mi]] <- as.matrix(cbind(tmp1, tmp2))
}
qList <-
foreach(i = 1:montc, .combine = 'rbind') %:%
foreach(j = 1:ncDim, .combine = 'rbind') %dopar% {
dat <- sc[[i]][, 1:j]
qdaR <- qda(truC ~ dat)
rcl <- as.numeric(predict(qdaR)$class) - 1
qdaD <- qda(truC ~ dat, CV = TRUE)
(Lr <- sum(rcl != truC)/length(truC))
(Ld <- sum((as.numeric(qdaD$class) - 1) != truC)/length(truC))
cbind(Lr, Ld, id = i, dim = j)
}
m1 <- melt(qList[, 1:2])
bigMelt <- as.data.frame(cbind(data.table::melt(bigD)))
tmp1$Var1 <- as.factor(rep(1:mdim, each = mont))
tmp1$Var2 <- as.factor(tmp1$Var2)
names(tmp1) <- c("dim", "L", "value")
p1sig <- ggplot(data = tmp1, aes(x = L, y = value, fill = dim)) +
geom_boxplot(notch = TRUE) + facet_grid(. ~ dim)
ggplot(m1, aes(x = as.factor(Var2), y = value)) +
geom_boxplot()