1 Synapse data

We start with observations of 793 synapses with 12 measurments formatted as a 793x12 matrix. The matrix is Z-scored column-wise and plotted in a pairs plot colored according to the labels given by the field expert.

dat <- read.csv("rorb_gaussianAvg_at.csv")
loc <- read.csv("rorb_gaussianAvg_at_orderLocations.csv")
gabaID <- read.csv("rorb_gaba.csv")
shaftID <- read.csv("rorb_shaft.csv")
TdID <- read.csv("rorb_TdTom.csv")

TdTom <- TdID$TdTom
TdTom[TdTom %in% 3] <- 2 ## ASK ABOUT THIS
shaft <- shaftID$shaft
gaba <- truth <- gabaID$gaba

ccol <- c('blue', 'blue', 'blue', 'red', 'red', 'red', 'green', 'black', 'green', 'green', 'black', 'green')
ind <- order(ccol)
ccol <- sort(ccol)

dat <- dat[,ind]
sdat <- as.data.frame(scale(dat, center = TRUE, scale = TRUE))
stackMraw(sdat, as.factor(gaba), ccol = ccol, centered = TRUE, depth = 2) + ggtitle("True Clusters")

pairs(sdat, col = gaba + 1, pch = 20, cex = 0.2,
main = "Pairs plot colored by true class")

repHeat(sdat); title("Representative heatmap")

1.1 ZG: Get Elbows

We run the Z-scored data through PCA and plot the scree-plot with the elbows as given by Zhu and Ghodsi. A pairs plot is shown on the PCA data again colored by class labels.

pc1 <- princomp(sdat)
getElbows(pc1$sdev)

pairs(pc1$scores, col = gaba +1, pch = 20, cex = 0.5)

## [1]  5  7 10

1.2 LDA and QDA

Next, applying LDA and QDA iterating on the number of PCA dimension included.

ldaL <- 
  lapply(1:12, 
         function(x){
           ldatmp <- lda(gaba ~ pc1$scores[, 1:x], CV = FALSE)
           predict(ldatmp)
         })

qdaL <- 
  lapply(1:12, 
         function(x){
           qdatmp <- qda(gaba ~ pc1$scores[, 1:x], CV = FALSE)
           predict(qdatmp)
         })

rfL <- 
  lapply(1:12, 
         function(x){
           tmpdat <- as.matrix(pc1$scores[, 1:x])
           set.seed(x)
           rftmp <- randomForest(x = tmpdat, y = as.factor(gaba))
           predict(rftmp)
         })

ll <- sapply(lapply(ldaL, '[[', 1), function(x) sum(x != gaba)/length(gaba))
ql <- sapply(lapply(qdaL, '[[', 1), function(x) sum(x != gaba)/length(gaba))
rl <- sapply(rfL, function(x) sum(x != gaba)/length(gaba))


#png(file = "~/Desktop/supRes.png", 480,480)
plot(x = 1:12, y = seq(0,0.16,length = 12), type = 'n', xlab = expression(hat(d)), ylab = "L")
points(ll, type = 'b', col = 'blue')
points(ql, type = 'b', col = 'orange')
points(rl, type = 'b', col = 'darkgreen', lty = 2, pch = 3)
abline(h = sum(gaba == 1)/length(gaba), col = 'magenta', lty=4)
text(10,max(ql), label = "qda", col = 'orange')
text(10, min(ll), label = "lda", col = 'blue', pos = 1)
text(5, min(rl), label = "rf", col = 'darkgreen', pos = 1)
text(5,sum(gaba == 1)/length(gaba), label = "chance", col = 'magenta', pos = 1)

#dev.off()

qmm <- sapply(1:12, function(x) sum((gaba - qdaL[[x]]$post[, 1])^2))
lmm <- sapply(1:12, function(x) sum((gaba - ldaL[[x]]$post[, 1])^2))
ldaSHAFT <- 
  lapply(1:12, 
         function(x){
           ldatmp <- lda(shaft ~ pc1$scores[, 1:x], CV = FALSE)
           predict(ldatmp)
         })

qdaSHAFT <- 
  lapply(1:12, 
         function(x){
           qdatmp <- qda(shaft ~ pc1$scores[, 1:x], CV = FALSE)
           predict(qdatmp)
         })

rfSHAFT <- 
  lapply(1:12, 
         function(x){
          tmpdat <- data.frame(shaft = as.factor(shaft), pc1$scores[, 1:x])
           rftmp <- randomForest(shaft ~ ., data = tmpdat)
           predict(rftmp)
         })

llshaft <- sapply(lapply(ldaSHAFT, '[[', 1), function(x) sum(x != shaft)/length(shaft))
qlshaft <- sapply(lapply(qdaSHAFT, '[[', 1), function(x) sum(x != shaft)/length(shaft))
rfshaft <- sapply(rfSHAFT, function(x) sum(x != shaft)/length(shaft))

#png(file = "~/Desktop/supRes.png", 480,480)
plot(x = 1:12, y = seq(0,0.8,length = 12), type = 'n', xlab = expression(hat(d)), ylab = "L")
points(llshaft, type = 'b', col = 'blue')
points(qlshaft, type = 'b', col = 'darkred')
points(rfshaft, type = 'b', col = 'darkgreen', lty = 3, pch = 3)
abline(h = 1-sum(shaft==0)/length(shaft), lty = 2, col = "magenta")
text(10,max(qlshaft), label = "qda", col = 'darkred')
text(10, min(llshaft), label = "lda", col = 'blue', pos = 1)
text(8, min(rfshaft), label = "rf", col = 'darkgreen', pos = 1)
text(4, 0.29, label = "chance", col = 'magenta', pos = 1)

#dev.off()

qmm <- sapply(1:12, function(x) sum((shaft - qdaSHAFT[[x]]$post[, 1])^2))
lmm <- sapply(1:12, function(x) sum((shaft - ldaSHAFT[[x]]$post[, 1])^2))

1.3 Office Chat CEP 20180228

Data in \(M_{793 \times 12}\) is transformed using PCA with center=TRUE and scale=TRUE to \(D_{793 \times 12}\). Class labels, \(Y_{i\in [793]} \in {0,1}\) have been given by the field experts (quite possibly errorful) with proportions 708 and 85 out of 793 of non-gaba and gaba, respectively.

Let \(n_0, n_1\) be the numbers of points in class 0 and 1, respectively.

n0 <- 708
n1 <- 85

params <- list()
for(i in 1:12){
  params[[i]] <- list()
  for(j in unique(gaba)){
    params[[i]][[as.character(j)]] <- 
      list( 
           means = colMeans(as.matrix(pc1$scores[gaba == j, 1:i])), 
           sigma   = if(i == 1){
             sd(pc1$scores[gaba == j, 1:i])
           } else {
                  cov(pc1$scores[gaba == j, 1:i])
                  }
           )
  }}

1.4 Sampling

mont <- 500
truth  <- c(rep(0, n0), rep(1,n1))

s <- list()
for(mi in 1:mont){
set.seed(mi)
    s[[mi]] <- rbind( 
               rmvnorm(n0, mean = params[[12]]$'0'$means, sigma = params[[12]]$'0'$sigma),
               rmvnorm(n1, mean = params[[12]]$'1'$means, sigma = params[[12]]$'1'$sigma)
              )
}
        
d1 <- foreach(j = 1:12) %:% 
      foreach(x = 1:mont, .combine = 'rbind') %dopar% {
         
        qdaR <- qda(truth ~ s[[x]][, 1:j], CV = FALSE)
        rcl <- as.numeric(predict(qdaR)$class) - 1
        
        qdaD <- qda(truth ~ s[[x]][, 1:j], CV = TRUE)
        
        (Lr <- sum(rcl != truth)/length(truth))
        (Ld <- sum((as.numeric(qdaD$class) - 1) != truth)/length(truth))
        #assign(paste0("Lr", j), Lr)
        #assign(paste0("Ld", j), Ld)

        #cbind(
        #      eval(as.symbol(paste0("Lr", j))), 
        #      eval(as.symbol(paste0("Ld", j)))
        #      )
        cbind(Lr, Ld)
}

d2 <- foreach(j = 1:12) %:% 
      foreach(x = 1:mont, .combine = 'rbind') %dopar% {
         
        ldaR <- lda(truth ~ s[[x]][, 1:j], CV = FALSE)
        rcl <- as.numeric(predict(ldaR)$class) - 1
        
        ldaD <- lda(truth ~ s[[x]][, 1:j], CV = TRUE)
        
        (Lr <- sum(rcl != truth)/length(truth))
        (Ld <- sum((as.numeric(ldaD$class) - 1) != truth)/length(truth))
        #assign(paste0("Lr", j), Lr)
        #assign(paste0("Ld", j), Ld)

        #cbind(
        #      eval(as.symbol(paste0("Lr", j))), 
        #      eval(as.symbol(paste0("Ld", j)))
        #      )
        cbind(Lr, Ld)
}

qtmp <- cbind(Reduce(rbind, d1))
qtmp1 <- as.data.frame(cbind(data.table::melt(qtmp)))

qtmp1$Var1 <- as.factor(rep(1:12, each = mont))
qtmp1$Var2 <- as.factor(qtmp1$Var2)
names(qtmp1) <- c("dim", "L", "value")

pq <- ggplot(data = qtmp1, aes(x = L, y = value, fill = dim)) + 
     geom_boxplot(notch = TRUE) + facet_grid(. ~ dim) + ggtitle("QDA")

ltmp <- cbind(Reduce(rbind, d2))
ltmp1 <- as.data.frame(cbind(data.table::melt(ltmp)))

ltmp1$Var1 <- as.factor(rep(1:12, each = mont))
ltmp1$Var2 <- as.factor(ltmp1$Var2)
names(ltmp1) <- c("dim", "L", "value")

pl <- ggplot(data = ltmp1, aes(x = L, y = value, fill = dim)) + 
     geom_boxplot(notch = TRUE) + facet_grid(. ~ dim) + ggtitle("LDA")
coord <- coord_cartesian(ylim = c(0, 0.11))
grid.arrange(pq + coord,pl +coord, nrow = 1)
dLines <- 
foreach(mi = 1:mont) %:%
foreach(x = 1:12, .combine = 'rbind') %dopar% {
  qdaR <- qda(truth ~ s[[mi]][, 1:x], CV = FALSE)
  rcl <- as.numeric(predict(qdaR)$class) - 1
        
  qdaD <- qda(truth ~ as.matrix(s[[x]][, 1:j]), CV = TRUE)
        
  (Lr <- sum(rcl != truth)/length(truth))
  (Ld <- sum((as.numeric(qdaD$class) - 1) != truth)/length(truth))
  cbind(Lr, Ld, mi, x)
}

lineDat <- Reduce('rbind', dLines)
pLines <- ggplot(as.data.frame(lineDat), aes(x = x, group = mi)) + geom_line(aes(y = Lr, color = mi)) 
show(pLines)
grid.arrange(p1sig, lp1sig, nrow = 2)

2 Model mis-specification

mu1 <- 1/2
ncDim <- 25
montc <- 500

nc <- 100
nc0 <- nc1 <- nc/2

truC <- c(rep(0,nc0), rep(1, nc1))

sc <- list()

for(mi in 1:montc){
  
  tmp1 <- c(rnorm(nc0, mean = mu1, sd = 1),
  rnorm(nc1, mean =-mu1, sd = 1))
    
  tmp2 <- sapply(1:(ncDim-1), function(x) rcauchy(nc, location = 0, scale = 1))
  
  sc[[mi]] <- as.matrix(cbind(tmp1, tmp2))
}


qList <- 
foreach(i = 1:montc, .combine = 'rbind') %:%
foreach(j = 1:ncDim, .combine = 'rbind') %dopar% {
  dat <- sc[[i]][, 1:j]  

  qdaR <- qda(truC ~ dat)
  rcl <- as.numeric(predict(qdaR)$class) - 1
        
  qdaD <- qda(truC ~ dat, CV = TRUE)
        
  (Lr <- sum(rcl != truC)/length(truC))
  (Ld <- sum((as.numeric(qdaD$class) - 1) != truC)/length(truC))
  cbind(Lr, Ld, id = i, dim = j)
  }

m1 <- melt(qList[, 1:2])

bigMelt <- as.data.frame(cbind(data.table::melt(bigD)))

tmp1$Var1 <- as.factor(rep(1:mdim, each = mont))
tmp1$Var2 <- as.factor(tmp1$Var2)
names(tmp1) <- c("dim", "L", "value")
p1sig <- ggplot(data = tmp1, aes(x = L, y = value, fill = dim)) + 
     geom_boxplot(notch = TRUE) + facet_grid(. ~ dim)



ggplot(m1, aes(x = as.factor(Var2), y = value)) + 
  geom_boxplot()