boxplot for only the outliers - r

Greeting
I would only like to plot the outliers for boxplot
this is my solution but it does not seem to be very efficient or elegant.
Any packages or better code for doing that.
As you can see I am calling boxplot twice to do this
So if my dataset is very big than it will be bad
Thanks
set.seed(1501)
y <- c(4, 0, 7, -5, rnorm(16))
x1 <- c("a", "a", "b", "b", sample(letters[1:5], 16, T))
lab_y <- sample(letters, 20)
datxx <- as.matrix(cbind(y,x1,lab_y))
boxplot_outlier<- function(dat){
bx <- boxplot(as.numeric(dat[,"y"]) ~ dat[,"x1"])
out_label <- c()
for ( i in seq(bx$out)){
out_label[i] <- dat[which(dat[,"y"]==bx$out[i]),"lab_y"]
}
out_label
out_g <- c()
for ( i in seq(bx$out)){
out_g[i] <- dat[which(dat[,"y"]==bx$out[i]),"x1"]
}
out_g
out_y <- c()
for ( i in seq(bx$out)){
out_y[i] <- dat[which(dat[,"y"]==bx$out[i]),"y"]
}
out_y
out_all<-cbind(out_y,out_g,out_label)
out_all <- as.matrix(out_all)
out_g <- as.matrix(out_g)
colnames(out_g)[1]<-"x1"
out_g_x <- out_g[which(!duplicated(out_g[,"x1"]))]
out_g_x <- as.matrix(out_g_x)
colnames(out_g_x)[1]<-"x1"
datsub <- merge(dat,out_g_x,by=c("x1"))
datsub <- as.matrix(datsub)
bx2 <- boxplot(as.numeric(datsub[,"y"]) ~ datsub[,"x1"],data=datsub)
mynum <- cbind(as.numeric(c(1:nrow(out_g_x))),out_g_x)
mynumxx <- merge(x=out_g,y=mynum,by=c("x1"))
colnames(mynumxx)[2]<-"v1"
text(as.numeric(mynumxx[,"v1"])+0.2,as.numeric(out_all[,"out_y"]),out_all[,"out_label"])
}
boxplot_outlier(datxx)

You could use ggplot2 to plot and set the box and lines to a fully transparent colour. Note that you have to put your data into a data.frame for this, which is better anyway, since y is converted to character in a matrix with the other variables.
dat <- data.frame(y,x1,lab_y)
ggplot(as.data.frame(dat), aes(x=x1,y=y)) + geom_boxplot(fill="#00000000",colour="#00000000")

Related

Loop a function in r to create a new table

I have a dataframe in r and want to perform the levene's/ variance test on multiple variables with two groups and save all results in a table. I have tried to do this using a for() loop and sapply() but I get neither working:
df <- data.frame(
x = rnorm(100, 0, 1),
y = rnorm(100, 50, 1),
z = rnorm(100, 70, 2),
group = rep(c(0,1), each = 50)
)
varlist <- c("x","y","z")
res.var <- character(length(varlist))
res.f <- numeric(length(varlist))
res.p <- numeric(length(varlist))
Option 1)
for(i in seq_along(varlist)) {
form <- substitute(i ~ group, list(i = as.name(varlist)))
result <- var.test(
formula = form,
data = df)
res.var[i] <- varlist[i]
res.f[i] <- result$estimate
res.p[i] <- result$p.value
}
Option 2:
sapply(varlist, function(x) {
form <- substitute(i ~ group, list(i = as.name(varlist)))
result <- var.test(
formula = form,
data = df)
res.var[i] <- varlist[i]
res.f[i] <- result$estimate
res.p[i] <- result$p.value
})
Maybe there's an easier way to that this. I'd be glad for any help ;o) Thank you in advance.

Faceting a plotly heatmap

I'd like to be able to facet an R plotly heatmap.
Here's what I mean:
I have a hierarchically-clustered gene expression dataset:
require(permute)
set.seed(1)
mat <- rbind(cbind(matrix(rnorm(2500,2,1),nrow=25,ncol=500),matrix(rnorm(2500,-2,1),nrow=25,ncol=500)),
cbind(matrix(rnorm(2500,-2,1),nrow=25,ncol=500),matrix(rnorm(2500,2,1),nrow=25,ncol=500)))
rownames(mat) <- paste("g",1:50,sep=".")
colnames(mat) <- paste("s",1:1000,sep=".")
hc.col <- hclust(dist(t(mat)))
dd.col <- as.dendrogram(hc.col)
col.order <- order.dendrogram(dd.col)
hc.row <- hclust(dist(mat))
dd.row <- as.dendrogram(hc.row)
row.order <- order.dendrogram(dd.row)
mat <- mat[row.order,col.order]
I then discretize it to specific expression ranges because that happens to help the resolution of colors for my case. I'm also creating other structures to help me plot the colorbar the way I want it to:
require(RColorBrewer)
mat.intervals <- cut(mat,breaks=6)
interval.mat <- matrix(mat.intervals,nrow=50,ncol=1000,dimnames=list(rownames(mat),colnames(mat)))
interval.cols <- brewer.pal(6,"Set2")
names(interval.cols) <- levels(mat.intervals)
require(reshape2)
interval.df <- reshape2::melt(interval.mat,varnames=c("gene","sample"),value.name="expr")
interval.cols2 <- rep(interval.cols, each=ncol(mat))
color.df <- data.frame(range=c(0:(2*length(interval.cols)-1)),colors=c(0:(2*length(interval.cols)-1)))
color.df <- setNames(data.frame(color.df$range,color.df$colors),NULL)
for (i in 1:(2*length(interval.cols))) {
color.df[[2]][[i]] <- interval.cols[[(i + 1) / 2]]
color.df[[1]][[i]] <- i/(2*length(interval.cols))-(i %% 2)/(2*length(interval.cols))
}
They way I generated the data I know that samples 1-500 are one cluster and samples 501:1000 are the other, so I label them:
interval.df$cluster <- NA
interval.df$cluster[which(interval.df$sample %in% paste("s",1:500,sep="."))] <- "A"
interval.df$cluster[which(interval.df$sample %in% paste("s",501:1000,sep="."))] <- "B"
I thought that adding a sample with not color and interval will create a white column in the heatmap plot that will look like a facet border:
divider.df <- data.frame(gene=unique(interval.df$gene),sample=NA,expr=NA,cluster=NA)
interval.df <- rbind(dplyr::filter(interval.df,cluster == "A"),divider.df,dplyr::filter(interval.df,cluster == "B"))
And now I try plotting:
#add ticks for each cluster
tick.vals <- c("s.158","s.617")
tick.text <- c("A","B")
require(plotly)
plot_ly(z=c(interval.df$expr),x=interval.df$sample,y=interval.df$gene,colors=interval.cols2,type="heatmap",colorscale=color.df,
colorbar=list(title="score",tickmode="array",tickvals=c(1:6),ticktext=names(interval.cols),len=0.2,outlinecolor="white",bordercolor="white",borderwidth=5,bgcolor="white")) %>%
layout(xaxis = list(title = 'Cluster',tickmode = 'array',tickvals = tick.vals,ticktext = tick.text))
But I don't see any separation between the clusters:
Any idea how to achieve such a facet border between the two clusters?
Your example is quite involved so I have reduced it down to a minimal example to focus on the gap you are looking for in the quadrants of your heatmap.
Modified from the examples on the plotly site, here.
library(plotly)
m <- matrix(rnorm(9), nrow = 3, ncol = 3)
p <- plot_ly(
x = c("a", "b", "c"), y = c("d", "e", "f"),
z = m, type = "heatmap"
)
subplot(p, p, p, p, shareX = TRUE, shareY = TRUE, nrows = 2)
If you create a plotly object for each of the quadrants and then use subplot, you will get a result looking similar to this:
N.B. I have cropped out the legend because it was duplicated for the facets, you could merge these into one.

R plotly heatmap tick labels being cutoff

Trying to produce a gene-expression heatmap using R's plotly. The gene names are quite long and the dimensions are quite big:
require(permute)
require(plotly)
set.seed(1)
mat <- matrix(shuffle(c(rnorm(5000,2,1),rnorm(5000,-2,1))),nrow=2500,ncol=4)
rownames(mat) <- paste("very_long_gene_name",1:2500,sep=".")
colnames(mat) <- paste("s",1:4,sep=".")
Clustering:
hc.col <- hclust(dist(t(mat)))
dd.col <- as.dendrogram(hc.col)
col.order <- order.dendrogram(dd.col)
hc.row <- hclust(dist(mat))
dd.row <- as.dendrogram(hc.row)
row.order <- order.dendrogram(dd.row)
mat <- mat[row.order,col.order]
Producing the plot and saving the html file:
heatmap.plotly <- plot_ly(x=colnames(mat),y=rownames(mat),z=mat,type="heatmap",colors=colorRamp(c("darkblue","white","darkred")))
htmlwidgets::saveWidget(heatmap.plotly,"heatmap.plotly.html")
The figure I get has the gene names cut off, and I'm not sure it's presenting all the data:
Any idea how to fix these two issues?
Increase your margins
m <- list(
l = 200,
r = 10,
b = 50,
t = 10,
pad = 2
)
heatmap.plotly <- plot_ly(x=colnames(mat),y=rownames(mat),z=mat,type="heatmap",
colors=colorRamp(c("darkblue","white","darkred"))) %>%
layout(margin = m)
heatmap.plotly

How to do calculations on elements from a sublist in R

my code is as follows:
x <- data.frame(matrix(rnorm(20), nrow=10))
colnames(x) <- c("z", "m")
n_boot<-4
bs <- list()
for (i in 1:n_boot) {
bs[[i]] <- x[sample(nrow(x), 10, replace = TRUE), ]
}
bt<-matrix(unlist(bs), ncol = 2*n_boot, byrow = FALSE)
colnames(bt) <- rep(c("z","m"),times=n_boot)
M_to_boot <- bt[,seq(2,8,by=2)]
funct<-function(M_boot_max) {
od<-(1/((10*((10^((16-M_boot_max-25)/5))^3)/3)*((max(M_boot_max)-min(M_boot_max))/50)))
}
V_boot<-apply(M_to_boot,2,funct)
rows.combined <- nrow(M_to_boot)
cols.combined <- ncol(M_to_boot) + ncol(V_boot)
matrix.combined <- matrix(NA, nrow=rows.combined, ncol=cols.combined)
matrix.combined[, seq(1, cols.combined, 2)] <- M_to_boot
matrix.combined[, seq(2, cols.combined, 2)] <- V_boot
colnames(matrix.combined) <- rep(c("M_boot","V_boot"),times=n_boot)
df<-as.data.frame(matrix.combined)
start0 <- seq(1, by = 2, length = ncol(df) / 2)
start <- lapply(start0, function(i, df) df[i:(i+1)], df = df)
tests<-lapply(start, function(xy) split(xy, cut(xy$M_boot,breaks=5)))
Now I want to prepare some calculations on values V_boot from a sublists. To be specific I want to for each subsample calculate the sum of V_boot. So, for example I want for a bin M_boot "[[4]]$(0.811,1.25]" to have a value of sum(V_boot) for that bin. But I cannot figure out how to get to that each V_boot values.
Please help me.

R: gWidgets: gText: add a search/find function

I'd like to ask if there is a way of adding a search/find function in gtext gwidget.
x <- c(1, 2)
y <- c(3, 4)
z <- c(5, 6)
df <- data.frame(x, y, z)
df.co <- capture.output(df) # get df as text
str.split <- strsplit(df.co, "\\s+") # split every line in its components
w1 <- gwindow()
gt1 <- gtext(container=w1)
insert(gt1, df.co)

Resources