1 PCA on annuli stats using \(L_1\) Norm

naL1 <- fread("rorb_annuli_stats_GoogleDoc.csv", header = FALSE)
cnames <- colnames(naL1)
meta <- fread("GoogleDocData.csv")
loc <- fread("GoogleDocData.csv")[, 1:3]
ids <- as.data.frame(read.csv("GoogleDocData.csv")[, 4])
colnames(ids) <- 'id'

ccolL1 <- c('blue', 'blue', 'blue', 'red', 'red', 'red', 'black', 'black', 'green', 'green', 'black', 'green')
ccolL1 <-  rep(ccolL1, each = 15)

CHAN_NAMES <- rep(c('DAPI1', 'DAPI2', 'DAPI3', 'GABA', 'GAD2', 'Gephyrin', 'GluN1', 'MBP', 'PSD95', 'synapsin', 'TdTomato', 'VGlut1'), each = 15)

colnames(naL1) <- paste0(CHAN_NAMES,"_", sprintf("%02d",1:15))
snaL1 <- scale(naL1, center = TRUE, scale = TRUE)
pc1 <- princomp(snaL1)
gaba <- as.factor(meta$gaba)

1.1 LDA and QDA

Next, applying LDA and QDA iterating on the number of PCA dimension included.

y <- as.factor(gaba)

ldaL1 <- foreach(di = 1:ncol(snaL1)) %dopar% {
           x <- as.data.frame(pc1$scores[, 1:di])
           ldatmp <- lda(y ~ ., data = x, CV = FALSE)
           ldatmp
}

qdaL1 <- foreach(di = 1:ncol(snaL1)) %dopar% {
           x <- as.data.frame(pc1$scores[, 1:di])
           qdatmp <- 
             tryCatch(qda(y ~ ., data = x, CV = FALSE),
                      error=function(err) NA)
           qdatmp
}

rfL1 <- foreach(di = 1:ncol(snaL1)) %dopar% {
           tmpdat <- data.table(gaba = gaba, pc1$scores[, 1:di])
           rftmp <- randomForest(gaba ~ ., data = tmpdat, prox = TRUE)
           rftmp
}

#saveRDS(list(ldaL1, qdaL1, rfL1), file = "models_lda_qda_rf.rdat")

ll <- sapply(ldaL1, function(x) sum(predict(x)$class != gaba)/length(gaba))
ql <- sapply(qdaL1[!is.na(qdaL1)], function(x) sum(predict(x)$class != gaba)/length(gaba))
rl <- sapply(rfL1,   function(x) sum(predict(x) != gaba)/length(gaba))
plot(x = 1:ncol(pc1$scores), y = seq(0,max(ll,ql,rl),length = ncol(pc1$scores)), type = 'n', xlab = expression(hat(d)), ylab = "L")
points(ll, type = 'b', col = 'blue', pch = 20, cex = 0.5)
points(ql, type = 'b', col = 'orange', pch = 15, cex = 0.5)
points(rl, type = 'b', col = 'darkgreen', lty = 2, pch = 24, cex = 0.2)
abline(h = chance <- sum(gaba == 1)/length(gaba), col = 'magenta', lty=4)
text(which.min(ql),0, label = paste("qda", min(round(ql, 3))), col = 'orange')
text(which.min(ll), 0, label = paste("lda", min(round(ll,3))), col = 'blue')
text(which.min(rl), 0, label = paste("rf", min(round(rl,3))), col = 'darkgreen')
text(ncol(snaL1)/4, chance, label = "chance", col = 'magenta', pos = 3)

1.2 On permutations of the labels

set.seed(1030)
yp <- sample(as.factor(gaba))

ldaL1prem <- foreach(di = 1:ncol(snaL1)) %dopar% {
           x <- as.data.frame(pc1$scores[, 1:di])
           ldatmp <- lda(yp ~ ., data = x, CV = FALSE)
           ldatmp
}

qdaL1prem <- foreach(di = 1:ncol(snaL1)) %dopar% {
           x <- as.data.frame(pc1$scores[, 1:di])
           qdatmp <- 
             tryCatch(qda(yp ~ ., data = x, CV = FALSE),
                      error=function(err) NA)
           qdatmp
}

rfL1prem <- foreach(di = 1:ncol(snaL1)) %dopar% {
           tmpdat <- data.table(yp = yp, pc1$scores[, 1:di])
           rftmp <- randomForest(yp ~ ., data = tmpdat, prox = TRUE)
           rftmp
}

#saveRDS(list(ldaL1prem, qdaL1prem, rfL1prem), file = "models_lda_qda_rf.rdat")

llperm <- sapply(ldaL1prem, function(x) sum(predict(x)$class != yp)/length(yp))
qlperm <- sapply(qdaL1prem[!is.na(qdaL1prem)], function(x) sum(predict(x)$class != yp)/length(yp))
rlperm <- sapply(rfL1prem,   function(x) sum(predict(x) != yp)/length(yp))
plot(x = 1:ncol(pc1$scores), y = seq(0,max(llperm,qlperm,rlperm),length = ncol(pc1$scores)), type = 'n', xlab = expression(hat(d)), ylab = "L")
points(llperm, type = 'b', col = 'blue', pch = 20, cex = 0.5)
points(qlperm, type = 'b', col = 'orange', pch = 15, cex = 0.5)
points(rlperm, type = 'b', col = 'darkgreen', lty = 2, pch = 24, cex = 0.2)
abline(h = chance <- sum(yp == 1)/length(yp), col = 'magenta', lty=4)
text(which.min(qlperm),0, label = paste("qda", min(round(qlperm, 3))), col = 'orange')
text(which.min(llperm), 0, label = paste("lda",min(round(llperm,3))), col = 'blue')
text(which.min(rlperm), 0, label = paste("rf", min(round(rlperm,3))), col = 'darkgreen')
text(ncol(snaL1)/4, chance, label = "chance", col = 'magenta', pos = 3)
title("On permuted labels")

2 PCA on annuli stats using weighted \(L_1\) Norm

naL1W <- fread("rorb_annuli_stats_GoogleDocL1W.csv", header = FALSE)

ccolL1 <- c('blue', 'blue', 'blue', 'red', 'red', 'red', 'black', 'black', 'green', 'green', 'black', 'green')
ccolL1 <-  rep(ccolL1, each = 15)

CHAN_NAMES <- rep(c('DAPI1', 'DAPI2', 'DAPI3', 'GABA', 'GAD2', 'Gephyrin', 'GluN1', 'MBP', 'PSD95', 'synapsin', 'TdTomato', 'VGlut1'), each = 15)

colnames(naL1W) <- paste0(CHAN_NAMES,"_", sprintf("%02d",1:15))
snaL1W <- scale(naL1W, center = TRUE, scale = TRUE)
pc1W <- princomp(snaL1W)

2.1 LDA and QDA on weighted \(L_1\)

ldaL1W <- foreach(di = 1:ncol(snaL1W)) %dopar% {
           x <- as.data.frame(pc1W$scores[, 1:di])
           ldatmp <- lda(y ~ ., data = x, CV = FALSE)
           ldatmp
}

qdaL1W <- foreach(di = 1:ncol(snaL1W)) %dopar% {
           x <- as.data.frame(pc1W$scores[, 1:di])
           qdatmp <- 
             tryCatch(qda(y ~ ., data = x, CV = FALSE),
                      error=function(err) NA)
           qdatmp
}

rfL1W <- foreach(di = 1:ncol(snaL1W)) %dopar% {
           tmpdat <- data.table(gaba = gaba, pc1W$scores[, 1:di])
           rftmp <- randomForest(gaba ~ ., data = tmpdat, prox = TRUE)
           rftmp
}

#saveRDS(list(ldaL1W, qdaL1W, rfL1W), file = "models_lda_qda_rf.rdat")

llW <- sapply(ldaL1W, function(x) sum(predict(x)$class != gaba)/length(gaba))
qlW <- sapply(qdaL1W[!is.na(qdaL1W)], function(x) sum(predict(x)$class != gaba)/length(gaba))
rlW <- sapply(rfL1W,   function(x) sum(predict(x) != gaba)/length(gaba))
plot(x = 1:ncol(pc1W$scores), y = seq(0,max(llW,qlW,rlW),length = ncol(pc1W$scores)), type = 'n', xlab = expression(hat(d)), ylab = "L")
points(llW, type = 'b', col = 'blue', pch = 20, cex = 0.5)
points(qlW, type = 'b', col = 'orange', pch = 15, cex = 0.5)
points(rlW, type = 'b', col = 'darkgreen', lty = 2, pch = 24, cex = 0.2)
abline(h = chance <- sum(gaba == 1)/length(gaba), col = 'magenta', lty=4)
text(which.min(qlW),0, label = paste("qda", min(round(qlW, 3))), col = 'orange')
text(which.min(llW), 0, label = paste("lda",min(round(llW,3))), col = 'blue')
text(which.min(rlW), 0, label = paste("rf", min(round(rlW,3))), col = 'darkgreen')
text(ncol(snaL1W)/4, chance, label = "chance", col = 'magenta', pos = 3)
title("Run on weighted L_1 data")

2.2 PCA on annuli stats using \(L_{\infty}\) Norm

naInf <- fread("rorb_annuli_stats_GoogleDocInf.csv", header = FALSE)
cnames <- colnames(naInf)
meta <- fread("GoogleDocData.csv")
loc <- fread("GoogleDocData.csv")[, 1:3]
ids <- as.data.frame(read.csv("GoogleDocData.csv")[, 4])
colnames(ids) <- 'id'

ccolLinf <- c('blue', 'blue', 'blue', 'red', 'red', 'red', 'black', 'black', 'green', 'green', 'black', 'green')
ccolLinf <-  rep(ccolLinf, each = 5)

CHAN_NAMES <- rep(c('DAPI1', 'DAPI2', 'DAPI3', 'GABA', 'GAD2', 'Gephyrin', 'GluN1',
                'MBP', 'PSD95', 'synapsin', 'TdTomato', 'VGlut1'), each = 5)
colnames(naInf) <- paste0(CHAN_NAMES,"_", sprintf("%d",1:5))

## annotation ids in the BOSS are +1 from the ids in Forrest's gsheet.
#annoSizes <- read.csv('annotation_sizes_pixels.csv')
#annoSizes$id <- annoSizes$id - 1
#
#tmp <- merge(ids, annoSizes, by = "id", all.x = TRUE)
snaLinf <- scale(naInf, center = TRUE, scale = TRUE)
pcInf <- princomp(snaLinf)
gaba <- as.factor(meta$gaba)

2.3 LDA and QDA

Next, applying LDA and QDA iterating on the number of PCA dimension included.

ldaLinf <- foreach(x = 1:ncol(snaLinf)) %dopar% {
           ldatmp <- lda(gaba ~ pcInf$scores[, 1:x], CV = FALSE)
           ldatmp
}

qdaLinf <- foreach(x = 1:ncol(snaLinf)) %dopar% {
           qdatmp <- 
             tryCatch(qda(as.factor(gaba) ~ pcInf$scores[, 1:x], CV = FALSE),
                      error=function(err) NA)
           qdatmp
}

rfLinf <- foreach(x = 1:ncol(snaLinf)) %dopar% {
           tmpdat <- data.table(gaba = gaba, pcInf$scores[, 1:x])
           rftmp <- randomForest(gaba ~ ., data = tmpdat, prox = TRUE)
           rftmp
}

ll <- sapply(ldaLinf, function(x) sum(predict(x)$class != gaba)/length(gaba))
ql <- sapply(qdaLinf, function(x) sum(predict(x)$class != gaba)/length(gaba))
rl <- sapply(rfLinf,  function(x) sum(predict(x) != gaba)/length(gaba))
plot(x = 1:ncol(pcInf$scores), y = seq(0,max(ll,ql,rl),length = ncol(pcInf$scores)), type = 'n', xlab = expression(hat(d)), ylab = "L")
points(ll, type = 'b', col = 'blue', pch = 20, cex = 0.5)
points(ql, type = 'b', col = 'orange', pch = 15, cex = 0.5)
points(rl, type = 'b', col = 'darkgreen', lty = 2, pch = 24, cex = 0.2)
abline(h = chance <- sum(gaba == 1)/length(gaba), col = 'magenta', lty=4)
text(40,min(ql), label = paste("qda", min(round(ql, 3))), col = 'orange', pos = 4)
text(ncol(snaLinf)/2, min(ll), label = paste("lda", min(round(ll,3))), col = 'blue', pos = 1)
text(ncol(snaLinf)/3, min(rl), label = paste("rf", min(round(rl,3))), col = 'darkgreen')
text(ncol(snaLinf)/4, chance, label = "chance", col = 'magenta', pos = 3)

3 Results for \(L_1\)

3.1 1-d Heatmap

3.2 Location meda_plots

3.3 Outliers as given by randomForest

3.4 Correlation Matrix

3.5 Cumulative Variance with Elbows

3.6 Paired Hex-binned plot

3.7 Hierarchical GMM Classifications

3.8 Hierarchical GMM Dendrogram

3.9 Stacked Means

3.10 Cluster Means

3.11 MEDA on Annuli stats using \(L_{\infty}\) norm.

4 Results for \(L_{\infty}\)

4.1 1-d Heatmap

4.2 Location meda_plots

4.3 Outliers as given by randomForest

4.4 Correlation Matrix

4.5 Cumulative Variance with Elbows

4.6 Paired Hex-binned plot

4.7 Hierarchical GMM Classifications

4.8 Hierarchical GMM Dendrogram

4.9 Stacked Means

4.10 Cluster Means

5 Subset only the gaba dimensions

namesSub <- grep("GABA", names(naL1))
subGABA <- data.table(as.matrix(snaL1)[, namesSub])
pc1sub <- princomp(subGABA)

5.1 LDA and QDA on gaba dimensions

y <- as.factor(gaba)

ldaL1subGABA <- foreach(di = 1:ncol(subGABA)) %dopar% {
           x <- as.data.frame(pc1sub$scores[, 1:di])
           ldatmp <- lda(y ~ ., data = x, CV = FALSE)
           ldatmp
}

qdaL1subGABA <- foreach(di = 1:ncol(subGABA)) %dopar% {
           x <- as.data.frame(pc1sub$scores[, 1:di])
           qdatmp <- 
             tryCatch(qda(y ~ ., data = x, CV = FALSE),
                      error=function(err) NA)
           qdatmp
}

rfL1subGABA <- foreach(di = 1:ncol(subGABA)) %dopar% {
           tmpdat <- data.table(gaba = gaba, pc1sub$scores[, 1:di])
           rftmp <- randomForest(gaba ~ ., data = tmpdat, prox = TRUE)
           rftmp
}

#saveRDS(list(ldaL1subGABA, qdaL1subGABA, rfL1subGABA), file = "models_lda_qda_rf.rdat")

llsub <- sapply(ldaL1subGABA, function(x) sum(predict(x)$class != gaba)/length(gaba))
qlsub <- sapply(qdaL1subGABA[!is.na(qdaL1subGABA)], function(x) sum(predict(x)$class != gaba)/length(gaba))
rlsub <- sapply(rfL1subGABA,   function(x) sum(predict(x) != gaba)/length(gaba))
plot(x = 1:ncol(pc1sub$scores), y = seq(0,max(ll,ql,rl),length = ncol(pc1sub$scores)), type = 'n', xlab = expression(hat(d)), ylab = "L")
points(llsub, type = 'b', col = 'blue', pch = 20, cex = 1)
points(qlsub, type = 'b', col = 'orange', pch = 15, cex = 1)
points(rlsub, type = 'b', col = 'darkgreen', lty = 2, pch = 24, cex = 1)
abline(h = chance <- sum(gaba == 1)/length(gaba), col = 'magenta', lty=4)
text(which.min(qlsub),min(qlsub), label = paste("qda", min(round(qlsub, 3))), col = 'orange', pos = 4)
text(which.min(llsub), min(llsub), label = paste("lda", min(round(llsub,3))), col = 'blue', pos = 1)
text(which.min(rlsub), min(rlsub), label = paste("rf", min(round(rlsub,3))), col = 'darkgreen', pos = 3)
text(ncol(subGABA)/4, chance, label = "chance", col = 'magenta', pos = 3)
title("resubstitution error on GABA only dimensions.")