1 Unsupervised RandomForest

1.1 U-Rerf

set.seed(10301)
simMat <- createSimilarityMatrix(as.matrix(sdat), 500, floor(nrow(sdat)*0.23))
## ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
image(simMat[po,po], col = viridis(255))
title('Similarity Matrix')

disMat <- createDistanceMatrix(as.matrix(sdat), 500, floor(nrow(sdat)*0.23))
## ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
image(disMat[po,po], col = viridis(255))
title('Distance Matrix')

eigM <- eigen(simMat)
eio <- order(eigM$values)
pcU <- prcomp(simMat)
el1 <- getElbows(pcU$sdev, plot= FALSE)[1]
el2 <- getElbows(pcU$sdev, plot= FALSE)[2]

urerfDat <- pcU$x[, 1:el1]

1.2 U-Rerf Pairs

pairs(urerfDat[,1:7], col = gaba + 1, pch = 20, cex = 0.2)

2 U-Rerf Results

2.1 1-d Heatmap

2.2 Location meda_plots

2.3 Outliers as given by randomForest

2.4 Correlation Matrix

2.5 Cumulative Variance with Elbows

2.6 Paired Hex-binned plot

2.7 Hierarchical GMM Classifications

2.8 Hierarchical GMM Dendrogram

2.9 Stacked Means

2.10 Cluster Means

2.11 Similarity Matrix ordered by hGMM

utmp <- order(Lu[[7]]$dat$labels$col)
image(simMat[utmp,utmp], col = viridis(255))
title("Similarity Matrix ordered by hGMM to level 4")

3 U-Rerf: Restricting hGMM to \(K = 2\)

Here we are restricting hierarchical GMM to only go through on level. We are comparing the cluster results to the gaba labels.

set.seed(3144)
uh2 <- hmc(urerfDat, maxDepth = 2, ccol = ccol)
uh3 <- hmc(urerfDat, maxDepth = 3, ccol = ccol)
uh2lab <- viridis(max(uh2$dat$labels$col))
uh2col <- uh2$dat$labels$col
poh <- order(uh2col)

3.1 Similarity Matrix ordered by hGMM

image(simMat[poh,poh], col = viridis(255))
title("Similarity Matrix ordered by hGMM level 2")

3.2 K = 2 stacked means plot

p1 <- stackM(uh3, ccol = "black", centered = TRUE, depth = 3)
p2 <- stackMraw(sdat, uh3$dat$labels$L1, centered = TRUE, depth = 1, ccol = ccol)
grid.arrange(p1,p2, nrow=1)

3.3 Pairs plot colored by true gaba classification

cols <- c("black", "magenta")[gabaID$gaba+1]
acols <- alpha(cols, 0.35)
#pairs(h2$dat$data, pch = 19, cex = 0.7, col = acols)
plot(uh2$dat$data, col = acols, pch = c(19,3)[gaba+1], cex = c(0.5,1)[gaba+1])

pairs(sdat, col = acols, pch = c(19,3)[gaba+1], cex = c(0.5,1)[gaba+1])

3.4 Pairs plot colored by hGMM cluster classification

acols2 <- alpha(uh2lab[uh2$dat$labels$col], 0.45)
par(bg = "gray45")
plot(uh2$dat$data, pch = c(3,20)[gaba + 1], cex = 1, col = acols2)

pairs(sdat, pch = 19, cex = 0.7, col = acols2)

dev.off()
## null device 
##           1

3.5 Permutation test for ARI

p0 <- mclust::adjustedRandIndex(upred, gaba)
uperms <- foreach(i = 1:1.5e4, .combine = c) %dopar% {
  set.seed(i*2)
  mclust::adjustedRandIndex(sample(upred), gaba)
}
pv0 <- sum(c(uperms,p0) >= p0)/length(uperms)
hist(uperms, xlim = c(min(p0, range(uperms)[1]), max(p0, range(uperms)[2])),
     main = "permutation test of ARI values", probability = TRUE)
abline(v = p0, col = 'red')

t1
##        truth
## pred    FALSE TRUE
##   FALSE   443   70
##   TRUE    265   15

3.6 Summary Table

measurment value
Misclassification Rate 0.4224464
Accuracy 0.5775536
Sensitivity 0.1764706
Specificity 0.6257062
Precision 0.0535714
Recall 0.1764706
ARI -0.0313515
\(p\)-value for ARI 1.000067
F1-score 0.0821918
TP 15
FP 265
TN 443
FN 70

4 Random Forest

rf1 <- randomForest(sdat, proximity = TRUE)
kern <- rf1$prox

pc1 <- prcomp(kern)
el1 <- getElbows(pc1$sdev, plot = TRUE)[1]

el2 <- getElbows(pc1$sdev, plot = FALSE)[2]

#edat <- scale(pc1$x[,1:el1], center = TRUE, scale = TRUE)
edat <- pc1$x[, 1:el1]
pairs(edat, col = gaba + 1, pch = 20, cex = 0.5)

4.1 Kernel/Proximity Matrix in heatmap

image(kern[po,po], col = viridis(255))

5 Results

5.1 1-d Heatmap

5.2 Location meda_plots

5.3 Outliers as given by randomForest

5.4 Correlation Matrix

5.5 Cumulative Variance with Elbows

5.6 Paired Hex-binned plot

5.7 Hierarchical GMM Classifications

5.8 Hierarchical GMM Dendrogram

5.9 Stacked Means

5.10 Cluster Means

5.11 Similarity Matrix ordered by hGMM

tmp <- order(Lt[[7]]$dat$labels$col)
image(kern[tmp,tmp], col = viridis(255))
title("Proximity Matrix ordered by hGMM to level 4")

6 Restricting hGMM to \(K = 2\)

Here we are restricting hierarchical GMM to only go through on level. We are comparing the cluster results to the gaba labels.

set.seed(3144)
h2 <- hmc(edat, maxDepth = 3, ccol = ccol)
h2lab <- viridis(max(h2$dat$labels$col))
h2col <- h2$dat$labels$col
poh1 <- order(h2col)

6.1 Similarity Matrix ordered by hGMM

image(kern[poh,poh], col = viridis(255))
title("Proximity Matrix ordered by hGMM clusters")

6.2 K = 2 stacked means plot

p1 <- stackM(h2, ccol = "black", centered = TRUE, depth = 3)
p2 <- stackMraw(sdat, h2$dat$labels$L1, centered = TRUE, depth = 1, ccol = ccol)
grid.arrange(p1,p2, nrow=1)

6.3 Pairs plot colored by true gaba classification

cols <- c("black", "magenta")[gabaID$gaba+1]
acols <- alpha(cols, 0.35)
#pairs(h2$dat$data, pch = 19, cex = 0.7, col = acols)
plot(h2$dat$data, col = acols, pch = c(19,3)[gaba+1], cex = c(0.5,1)[gaba+1])

pairs(sdat, col = acols, pch = c(19,3)[gaba+1], cex = c(0.5,1)[gaba+1])

6.4 Pairs plot colored by hGMM cluster classification

acols2 <- alpha(h2lab[h2$dat$labels$col], 0.45)
par(bg = "gray45")
plot(h2$dat$data, pch = c(3,20)[gaba + 1], cex = 1, col = acols2)

pairs(sdat, pch = 19, cex = 0.7, col = acols2)

dev.off()
## null device 
##           1

7 Permutation test for ARI

p0 <- mclust::adjustedRandIndex(pred, gaba)
perms <- foreach(i = 1:1.5e4, .combine = c) %dopar% {
  set.seed(i*2)
  mclust::adjustedRandIndex(sample(pred), gaba)
}
pv0 <- sum(c(perms,p0) >= p0)/length(perms)
hist(perms, xlim = c(min(p0, range(perms)[1]), max(p0, range(perms)[2])),
     main = "permutation test of ARI values", probability = TRUE)
#hist(perms, probability = TRUE)
abline(v = p0, col = 'red')

t1
##        truth
## pred    FALSE TRUE
##   FALSE   311   20
##   TRUE    397   65

8 Summary Table

measurment value
Misclassification Rate 0.5258512
Accuracy 0.4741488
Sensitivity 0.7647059
Specificity 0.4392655
Precision 0.1406926
Recall 0.7647059
ARI 0.1198756
\(p\)-value for ARI 0.000067
F1-score 0.23766
TP 65
FP 397
TN 311
FN 20