Following from previous pages, this page will focus on filtering the data before clustering to explore if filtering improves the outcome of clustering.
The features defined below were found in BBSS13 (F0-F3) and via correspondence with the authors (F4-F5):
Let \(V\) be an 11x11x11 volume. For every \(i \in V\), let \(b_i\) denote the brightness of pixel \(i\) and \(d_i\) the pixelwise distance from \(i\) to the synaptic locus (which seems to be the center pixel).
F0: Integrated Brightness \(:= \sum_{i\in V} b_i =: B\)
F1: Local Brightness \(:= \sum_{i\in V} b_i/d_{i}^{2}\)
F2: Center of Mass \(:= \sum_{i\in V} b_i d_i/B\)
F3: Moment of Inertia \(:= \sum_{i\in V} b_i d_{i}^{2}/B\)
The local maximum \(m\) in \(V\) for each channel is noted and a smaller 5x5x5 volume \(V'\) is created about \(m\).
F4: Restricted Integrated Brightness \(:= \sum_{i\in V'} b_i\)
F5: Distance between centers of \(V'\) and \(V\).
Here we read in the data and select a random half of it for exploration.
## Read the full feature table and keep a random half of its rows in `feat`.
## NOTE(review): the fread() call below is cut off in this copy -- its
## remaining arguments and closing parenthesis are not shown. Restore them
## before running.
featFull <- fread("../data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",
### Setting a seed and creating an index vector
### to select half of the data
## NOTE(review): no set.seed() call is visible here, so as shown the random
## split is not reproducible -- confirm the seed line was not lost.
## half1: row indices sampled without replacement, half the row count.
half1 <- sample(dim(featFull)[1],dim(featFull)[1]/2)
## half2: all remaining row indices (not used in the visible code).
half2 <- setdiff(1:dim(featFull)[1],half1)
## feat: the sampled half; the code below works on this table.
feat <- featFull[half1,]
## Recorded console output -- presumably dim(feat); TODO confirm.
# [1] 559649 144
## Channel names, a type label per channel, and the derived counts.
## NOTE(review): both c(...) literals below look truncated in this copy.
## `channel` has no closing parenthesis, and `channel.type` shows only 13
## labels while exType further down is built for 11 + 6 + 7 = 24 entries.
## Restore the missing lines before running.
## Setting the channel names
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
## Setting the channel types
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
'other', 'ex.post','other','other','ex.post','none','none')
## Number of channels.
nchannel <- length(channel)
## Columns per channel: total column count divided by the channel count.
## Presumably 6 (F0..F5), given the names built from paste0("F", 0:5) later.
nfeat <- ncol(feat) / nchannel
## Channel-type factor with its levels in a fixed display order
ffchannel <- factor(
  channel.type,
  levels = c("ex.pre", "ex.post", "in.pre", "in.post",
             "in.pre.small", "other", "none")
)
## Numeric level code of each channel's type, and the permutation that
## sorts the channels by that code
fchannel <- as.numeric(ffchannel)
ford <- order(fchannel)
## One colour per channel type (in level order), then one per channel
Syncol <- c("#197300", "#5ed155", "#660000", "#cc0000", "#ff9933",
            "mediumblue", "gold")
ccol <- Syncol[fchannel]
## Coarse ex / in / other grouping and the matching colour strings
exType <- factor(rep(c("ex", "in", "other"), times = c(11, 6, 7)),
                 ordered = TRUE)
exCol <- c("#197300", "#990000", "mediumblue")[as.integer(exType)]
## Column names: every channel crossed with the tags F0..F5, channel-major
## (all six tags of the first channel, then the next channel, and so on)
fname <- paste0(rep(channel, each = 6), paste0("F", 0:5))
names(feat) <- fname
## Channel colour repeated once for each of a channel's six feature columns
fcol <- rep(ccol, each = 6)
## Colour palettes
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)
## Column positions per feature: element k holds k, k + nfeat, k + 2 * nfeat, ...
f <- lapply(1:6, function(k) seq(from = k, to = ncol(feat), by = nfeat))
## One table per feature, each holding that feature for every channel
featF <- lapply(f, function(cols) subset(feat, select = cols))
## First feature (F0) across all channels
featF0 <- featF[[1]]
## Derived versions of the F0 table (featF0) plus a rank transform of `feat`.
## NOTE(review): the apply() call on the next line is cut off in this copy.
## The function applied to each column and the closing parentheses are not
## shown; only the 1e3 multiplier is visible.
f01e3 <- 1e3*data.table(apply(X=featF0, 2,
## fs is a second name for the same scaled table.
fs <- f01e3
### Taking log_10 on data with 0's removed
## ans: per-row flag, TRUE when any F0 value in that row equals 0.
ans <- apply(featF0, 1, function(row){ any(row == 0)})
## log10 of the rows without zeros, rounded to 2 decimal places.
logF0 <- round(log10(featF0[!ans,]), 2)
## Each column of logF0 centred and scaled with scale().
slogF0 <- logF0[,lapply(.SD,scale, center=TRUE,scale=TRUE)]
## Column-wise ranks of every column in `feat` (ties receive the average rank).
rFeat <- feat[,lapply(.SD, rank, ties.method='average')]
We now have the following data sets:
featF0: The feature vector looking only at the integrated brightness features.
fs: The feature vector scaled between \([0,1000]\).
logF0: The feature vector, with 0’s removed, then \(\log_{10}\) is applied.
slogF0: The feature vector, with 0’s removed, then \(\log_{10}\), then scaled by subtracting the mean and dividing by the sample standard deviation.
synF <- feat[, grep("Synap_", names(feat)),with=FALSE]
## Build a scaled-log copy of the Synapsin columns, then blank out extreme
## values in both the linear and the log-scale tables.
## lsynF: log10(x + 1) of each column of synF, then centred and scaled.
lsynF <- synF[,lapply(.SD,function(x){scale(log10(x+1),center=TRUE,scale=TRUE)})]
## NOTE(review): the two lapply() calls below are cut off in this copy. The
## `function(x) {` header, the returned value and the closing brackets are
## not shown. The visible body lines set values below the 1% quantile or
## above the 99% quantile of a column to NA.
synF <- synF[, lapply(.SD,
qs <- quantile(x, probs=c(0.01,0.99))
x[x < qs[1]] <- NA
x[x > qs[2]] <- NA
## Same trimming for the log-scale table; here quantile() is called with
## na.rm=TRUE, unlike the linear-scale pass.
lsynF <- lsynF[, lapply(.SD,
qs <- quantile(x, probs=c(0.01,0.99), na.rm=TRUE)
x[x < qs[1]] <- NA
x[x > qs[2]] <- NA
## Suffix the column names so the two tables stay distinguishable when they
## are combined with cbind() for plotting.
names(synF) <- paste0(names(synF), "_linear")
names(lsynF) <- paste0(names(lsynF), "_logscale")
## Same preparation for the VGlut1 channels: select the columns, build a
## scaled-log copy, then blank out extreme values in both tables.
## vglutF: all columns of `feat` whose name matches "VGlut1_t".
vglutF <- feat[,grep("VGlut1_t",names(feat)),with=FALSE]
## lvglutF: log10(x + 1) of each column, then centred and scaled.
lvglutF <- vglutF[,lapply(.SD,function(x){scale(log10(x+1),center=TRUE,scale=TRUE)})]
## NOTE(review): the two lapply() calls below are cut off in this copy. The
## `function(x) {` header, the returned value and the closing brackets are
## not shown. The visible body lines set values below the 1% quantile or
## above the 99% quantile of a column to NA.
vglutF <- vglutF[, lapply(.SD,
qs <- quantile(x, probs=c(0.01,0.99))
x[x < qs[1]] <- NA
x[x > qs[2]] <- NA
## Same trimming for the log-scale table; here quantile() is called with
## na.rm=TRUE, unlike the linear-scale pass.
lvglutF <- lvglutF[, lapply(.SD,
qs <- quantile(x, probs=c(0.01,0.99), na.rm=TRUE)
x[x < qs[1]] <- NA
x[x > qs[2]] <- NA
## Suffix the column names so the two tables stay distinguishable when they
## are combined with cbind() for plotting.
names(vglutF) <- paste0(names(vglutF), "_linear")
names(lvglutF) <- paste0(names(lvglutF),"_logscale")
## Density curves for every Synapsin column (linear and log-scale versions),
## one free-scaled panel per column, six panels per row
df1 <- melt(as.matrix(cbind(synF, lsynF)))
ggplot(df1,
       aes(value, ..density.., colour = Var2, group = as.factor(Var2))) +
  facet_wrap(~ Var2, ncol = 6, scales = "free") +
  geom_density(size = 1.5) +
  guides(col = guide_legend(ncol = 1))

## The same display for the VGlut1 columns
df2 <- melt(as.matrix(cbind(vglutF, lvglutF)))
ggplot(df2,
       aes(value, ..density.., colour = Var2, group = as.factor(Var2))) +
  facet_wrap(~ Var2, ncol = 6, scales = "free") +
  geom_density(size = 1.5) +
  guides(col = guide_legend(ncol = 1))
## Rebuild the Synapsin tables from `feat`, replacing the trimmed and renamed
## versions created earlier.
## synF: all columns of `feat` whose name matches "Synap_".
synF <- feat[, grep("Synap_", names(feat)),with=FALSE]
## ans1: per-row flag, TRUE when any Synapsin value in that row equals 0.
ans1 <- apply(synF, 1, function(row){ any(row == 0)})
## NOTE(review): the lapply() call below is cut off in this copy, so the
## transform applied to each column is not shown. Only the row filter
## (rows without zeros) is visible. Because of that filter, lsynF has fewer
## rows than synF and rsynF.
lsynF <- synF[!ans1, lapply(.SD,
## rsynF: column-wise ranks of synF (ties receive the average rank); computed
## on all rows, including those with zeros.
rsynF <- synF[,lapply(.SD, rank, ties.method='average')]
## Reports the number of ROWS that contain at least one zero.
print(paste("removed", sum(ans1), "zero entries"))
# [1] "removed 222681 zero entries"
The following block needs to be re-written.
## For each feature F0..F5, draw three hexbin scatter plots of one Synapsin
## column against another (untransformed, log-transformed, rank-transformed),
## then arrange the plots in a three-column grid.
## NOTE(review): this block is cut off in several places in this copy. The
## first and third scale_fill_gradientn() calls have no `colours` argument or
## closing parenthesis, and the closing brace of the for loop is not shown.
gg1 <- list()
## ind: 6 x 2 matrix filled column-wise, so row i is c(i, i + 6).
## Presumably this pairs Synap_1F<k> with Synap_2F<k>, given the column
## naming set up earlier -- confirm against names(synF).
ind <- matrix(c(1:12), ncol=2)
rownames(ind) <- paste0("F", 0:5)
## cols: 10 colours interpolated from darkgreen to chartreuse.
cols <- colorRampPalette(c("darkgreen", "chartreuse"))(10)
## cols.pal: palette function; not used in the visible lines.
cols.pal <- colorRampPalette(c("white", "darkgreen", "chartreuse"))
for ( i in c(1:6)) {
## The column pair for feature i from each of the three tables.
tmp1 <- synF[,ind[i,], with=FALSE]
tmp2 <- lsynF[,ind[i,], with=FALSE]
tmp3 <- rsynF[,ind[i,], with=FALSE]
gg1[[i]] <- list()
## Plot 1: untransformed values; hexagon fill is log10 of the bin value, with
## a red linear-model fit overlaid.
gg1[[i]][[1]] <- ggplot(data=tmp1,aes_string(x=names(tmp1)[1], y=names(tmp1)[2])) +
geom_hex(bins=200,aes(fill=log10(..value..))) +
geom_smooth(method='lm',colour='red', alpha=0.5)+
ggtitle(paste0("Untransformed Feature:", rownames(ind)[i])) +
scale_fill_gradientn(guide=guide_colorbar("Count on \nlog_10 scale"),
## Plot 2: same layout on the log-transformed table (rows with zeros removed).
gg1[[i]][[2]] <-
ggplot(data=tmp2,aes_string(x=names(tmp2)[1], y=names(tmp2)[2])) +
geom_hex(bins=200,aes(fill=log10(..value..))) +
geom_smooth(method='lm',colour='red', alpha=0.5)+
ggtitle(paste0("Scale log Transformed Feature:", rownames(ind)[i])) +
scale_fill_gradientn(guide=guide_colorbar("Count on \nlog_10 scale"),
colours= cols )
## Plot 3: same layout on the rank-transformed table.
gg1[[i]][[3]] <-
ggplot(data=tmp3,aes_string(x=names(tmp3)[1], y=names(tmp3)[2])) +
geom_hex(bins=200,aes(fill=log10(..value..))) +
#geom_point(alpha=0.2) +
geom_smooth(method='lm',colour='red', alpha=0.5)+
ggtitle(paste0("Rank Transformed Feature:", rownames(ind)[i])) +
scale_fill_gradientn(guide=guide_colorbar("Count on \nlog_10 scale"),
## Flatten the per-feature lists into one list of plots. With three plots for
## each of six features that is 18 plots.
ggS <- Reduce("c", gg1)
## Only the first 15 plots (features F0..F4) are drawn, three per row.
## NOTE(review): the F5 plots are left out -- confirm that is intended.
do.call("grid.arrange",args=c(ggS[1:15], ncol=3))