Benchmark for several sample sizes - r

I know how to compare several solutions in terms of performance with mircrobenchmark. But quite often I need to do it over several sample sizes, e.g. 10^4, 10^5, 10^6, 10^7 and box plot results with sample size on the x axis.
I have got this code that does the lines for mean:
set.seed(1001)
N <- c(3, 4, 5, 6 ,7)
n <- 10^N
res <- lapply(n, function(x) {
d <- sample(1:x/10, 5 * x, replace=T)
d <- c(d, sample(d, x, replace=T)) # ensure many duplicates
dt <- data.table(d)
mb <- microbenchmark::microbenchmark(
"duplicated(original)" = d[!(duplicated(d) | duplicated(d, fromLast=TRUE))],
"tabulate" = { ud = unique(d); ud[tabulate(match(d, ud)) == 1L] },
"data.table" = dt[, count:= .N, by = d][count == 1]$d,
times = 1,unit = "ms")
sm <- summary(mb)[, c(1, 4, 8)]
sm$size = x
return(sm)
})
res <- do.call("rbind", res)
require(ggplot2)
##The values Year, Value, School_ID are
##inherited by the geoms
ggplot(res, aes(x = res$size, y = res$mean, colour=res$exp)) +
geom_line() + scale_x_log10() + scale_y_log10() +
geom_point()
By it would be much nicer to have box plot if times parameters of microbenchmark is not one. Is there an elegant way to achieve this?

Related

R: ggplot2 and loop with data.table

I have to make 4 plots which differ only for y and ylab.
I start from a data.table dt which is
set.seed(123)
dt <- data.table(a = rnorm(20),
b = rnorm(20),
c = rnorm(20),
d = rnorm(20),
e = rnorm(20))
Every single plot should be a scatterplot with row numbers as x vs y values. Additionally, I want to plot some hline at median(y) + h*mad(y) where h = c(0, -2, 2, -3, 3)
This plot should be repeated for columns a, c, d and e of dt.
I came up with this bit of code
# Defining y labels #
ylabels <- c(bquote(phantom(.)^100*A~"/"*phantom(.)^200*A),
bquote(phantom(.)^101*C~"/"*phantom(.)^201*B),
bquote(phantom(.)^102*D~"/"*phantom(.)^202*D),
bquote(phantom(.)^103*E~"/"*phantom(.)^202*E))
# Selecting columns of dt
ydata <- names(dt)[c(1, 3, 4, 5)]
h <- c(0, -2, 2, -3, 3)
hcol <- c("#009E73", "#E69F00", "#E69F00", "red", "red")
# The for cycle should create the 4 plots and assign them to a list
plots <- list()
for (i in seq_along(ydata)) {
p1 <- ggplot(dt, aes_string(x = seq(1, dt[, .N]), y = ydata[i])) +
geom_point() +
geom_hline(aes_string(yintercept = median(ydata[i]) +
h * mad(ydata[i])), color = hcol) +
xlab("Replicate") +
ylab(ylabels[i]) +
scale_x_continuous(breaks = seq(1, dt[,.N])))
plots[[i]] <- p1 # add each plot into plot list
}
Then plots will be fed to the multiplot function from Cookbook for R.
However my loop doesn't work properly because it fails to calculate the median and mad values.
Do you have any suggestions to make the code work?
# data.table with the median +- h* mad values
hline.values <- dt[, lapply(.SD, function(x) median(x) + h * mad(x)),
.SDcols = ydata]
# new empty list
plots <- list()
for (i in seq_along(ydata)) {
p1 <- ggplot(dt, aes_string(x = seq(1, dt[, .N]), y = ydata[i])) +
geom_point() +
geom_hline(data = hline.values,
aes_string(yintercept = ydata[i])) +
# Axis labels and theme
xlab("Replicate") +
ylab(ylabels[[i]]) +
scale_x_continuous(breaks = seq(1, dt[, .N]))
plots[[i]] <- p1
}

ggplot, plotting in multiple pages with different rows

I wanted to make a graph using facet_wrap and plot it in different pages in a pdf file. I've read son many options, and this works:
R + ggplot: plotting over multiple pages
but only when you have the same rows in each page.
I have this demo data to try explain my case:
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
So, I have ID with different number of observations. I tried to make a vector with the number of observation in each ID (I want an ID per page), but it doesn't work.
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
slice2 = slice[!(slice > nrow(A2))]
A3 = A2[slice2,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()
Could you help me?
Thanks in advances,
Nacho
I think you were overthinking trying to calculate your "slices". Maybe you want this?
Not entirely sure. If you only want one ID per page you don't need facet_wrap, and you will probably need to set the scale explicitly to keep it the same from page to page.
library(plyr)
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
# slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
# slice2 = slice[!(slice > nrow(A2))]
# A3 = A2[slice2,]
A3 = A2[A2$ID==i,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()

Order heatmap rows in ggplot2 facet plot

I'm having a problem with faceted heatmap rendering in ggplot2. The idea is that I have several elements (these are genes in the real life) and several experiments (F1 and F2 in the example below). Using the F1 experiment, I'm able to create class of elements/genes based on their mean expression (high, ..., moderate, ..., low). In the heatmap produced through the example below, I would like to order each elements in each class (01, 02, 03, 04) based on its mean expression value in F1. Unfortunately, the elements appear in alphabetic order. I would be very happy to get some hints...
Best
library(ggplot2)
library(reshape2)
set.seed(123)
# let's create a fake dataset
nb.experiment <- 4
n.row <- 200
n.col <- 5
d <- matrix(round(runif(n.row * n.col),2), nc=n.col)
colnames(d) <- sprintf("%02d", 1:5)
# These strings will be the row names of each heatmap
# in the subsequent facet plot
elements <- sample(replicate(n.row/2, rawToChar(as.raw(sample(65:90, 6, replace=T)))))
# let's create a data.frame d
d <- data.frame(d,
experiment = sort(rep(c("F1","F2"), n.row/2)),
elements= elements)
# For elements related to experiment F1
# we artificially produce a gradient of values that will
# create elements with increasing row means
d[d$experiment =="F1",1:5] <- round(sweep(d[d$experiment =="F1",1:5],
1,
seq(from=1, 10, length.out = 100),
"+"), 2)
# For elements related to experiment F2
# we artificially produce a gradient of values that will
# create elements with decreasing row means
d[d$experiment =="F2",1:5] <- round(sweep(d[d$experiment =="F2",1:5],
1,
seq(from=10, 1, length.out = 100),
"+"), 2)
#print(d[d$experiment =="F1",1:5])
# Now we split the dataset by experiments
d.split <- split(d, d$experiment)
# For all experiments, we order elements based on the mean expression signal in
# F1.
row.means.F1 <- rowMeans(d.split$F1[,1:5])
pos <- order(row.means.F1)
for(s in names(d.split)){
d.split[[s]] <- d.split[[s]][pos,]
}
# We create several classes of elements based on their
# mean expression signal in F1.
cuts <- cut(1:nrow(d.split$F1), nb.experiment)
levels(cuts) <- sprintf("%02d", 1:nb.experiment)
for(s in names(d.split)){
d.split[[s]] <- split(d.split[[s]], cuts)
}
# Data are melt (their is perhaps a better solution...)
# in order to use the ggplot function
dm <- melt(do.call('rbind',lapply(d.split, melt)), id.var=c( "experiment", "elements", "variable", "L1"))
dm <- dm[, -5]
colnames(dm) <- c("experiment","elements", "pos", "rowMeanClass", "exprs")
# Now we plot the data
p <- ggplot(dm, aes(x = pos, y = elements, fill = exprs))
p <- p + geom_raster()
p <- p + facet_wrap(~rowMeanClass +experiment , scales = "free", ncol = 2)
p <- p + theme_bw()
p <- p + theme(text = element_text(size=4))
p <- p + theme(text = element_text(family = "mono", face = "bold"))
ggsave("RPlot_test.jpeg", p)
Using your advises I was able to find a solution (which implies to clearly specify the order of levels for the 'elements' factor). Thank you #hrbrmstr (and all others).
NB: I only added few lines compare to the original code that are denoted below with 'Added: begin' and 'Added: end' flags.
library(ggplot2)
library(reshape2)
set.seed(123)
# let's create a fake dataset
nb.experiment <- 4
n.row <- 200
n.col <- 5
d <- matrix(round(runif(n.row * n.col),2), nc=n.col)
colnames(d) <- sprintf("%02d", 1:5)
# These strings will be the row names of each heatmap
# in the subsequent facet plot
elements <- sample(replicate(n.row/2, rawToChar(as.raw(sample(65:90, 6, replace=T)))))
# let's create a data.frame d
d <- data.frame(d,
experiment = sort(rep(c("F1","F2"), n.row/2)),
elements= elements)
# For elements related to experiment F1
# we artificially produce a gradient of values that will
# create elements with increasing row means
d[d$experiment =="F1",1:5] <- round(sweep(d[d$experiment =="F1",1:5],
1,
seq(from=1, 10, length.out = 100),
"+"), 2)
# For elements related to experiment F2
# we artificially produce a gradient of values that will
# create elements with decreasing row means
d[d$experiment =="F2",1:5] <- round(sweep(d[d$experiment =="F2",1:5],
1,
seq(from=10, 1, length.out = 100),
"+"), 2)
#print(d[d$experiment =="F1",1:5])
# Now we split the dataset by experiments
d.split <- split(d, d$experiment)
# For all experiments, we order elements based on the mean expression signal in
# F1.
row.means.F1 <- rowMeans(d.split$F1[,1:5])
pos <- order(row.means.F1)
for(s in names(d.split)){
d.split[[s]] <- d.split[[s]][pos,]
}
## Added: begin ###
#Get the list of elements in proper order (based on row mean)
mean.order <- as.character(d.split$F1$elements)
## Added: end###
# We create several classes of elements based on their
# mean expression signal in F1.
cuts <- cut(1:nrow(d.split$F1), nb.experiment)
levels(cuts) <- sprintf("%02d", 1:nb.experiment)
for(s in names(d.split)){
d.split[[s]] <- split(d.split[[s]], cuts)
}
# Data are melt (their is perhaps a better solution...)
# in order to use the ggplot function
dm <- melt(do.call('rbind',lapply(d.split, melt)), id.var=c( "experiment", "elements", "variable", "L1"))
dm <- dm[, -5]
colnames(dm) <- c("experiment","elements", "pos", "rowMeanClass", "exprs")
## Added: begin###
#Ensure that dm$elements is an ordered factor with levels
# ordered as expected
dm$elements <- factor(dm$elements, levels = mean.order, ordered = TRUE)
## Added: end###
# Now we plot the data
p <- ggplot(dm, aes(x = pos, y = elements, fill = exprs))
p <- p + geom_raster()
p <- p + facet_wrap(~rowMeanClass +experiment , scales = "free", ncol = 2)
p <- p + theme_bw()
p <- p + theme(text = element_text(size=4))
p <- p + theme(text = element_text(family = "mono", face = "bold"))
ggsave("RPlot_test.jpeg", p)

how to identify where of a column is more rich in one values?

If i generate randomly a binary data frame like below
Mat <- matrix(sample(0:1, 200*50, replace = TRUE),200,50)
If I have 200 rows for each column and I set a threshold like 50 up and 30 down.
how can I check whether the 50 rows in top of each column contains more 1 values or the 30 rows down of each column or the middle ?
how can I then plot something to show graphically the results ?
By doing
f <- function(x, u = 200, d = 200){
res <- list(NA)
for(i in 1:ncol(x)){
res[[i]] <- c(sum(x[1:u,i] == 1), sum(x[(u+1):(nrow(x)- d),i] == 1), sum(x[(nrow(x)-d+1):nrow(x),i] == 1))
}
res <- do.call(rbind, res)
res
}
then calculate
res_value <- f(output)
the res_values can be found here
https://gist.github.com/anonymous/a1f68b9798affe630e65
df <- data.frame(cbind(c(t(res_value)), rep(1:50, each = 3)), X3 = rep(1:3))
ggplot(df, aes(x = factor(X2), y = X1, fill = as.factor(X3))) + geom_bar(position="fill", stat = "identity")
I got a warning like below
Warning message:
In cbind(c(t(res_value)), rep(1:50, each = 3)) :
number of rows of result is not a multiple of vector length (arg 2)
and of course the plot is like below which is not good at all
What about this? First write a function to calculate the number of ones in each of the three groups using the thresholds (u and d) and then plot the result as filled barplot:
f <- function(x, u = 50, d = 30){
res <- list(NA)
for(i in 1:ncol(x)){
res[[i]] <- c(sum(x[1:u,i] == 1), sum(x[(u+1):(nrow(x)- d),i] == 1), sum(x[(nrow(x)-d+1):nrow(x),i] == 1))
}
res <- do.call(rbind, res)
res
}
res <- f(Mat)
df <- data.frame(cbind(c(t(res)), rep(1:50, each = 3)), X3 = rep(1:3))
ggplot(df, aes(x = factor(X2), y = X1, fill = as.factor(X3))) + geom_bar(position="fill", stat = "identity")
Group 1 is the upper group, 2 the middle and 3 the bottom group. If you want the exact numers to be plotted instead of normalized values you can set position = stack
This is how to view the matrix...
image(Mat)
You can try something like this:
Mat <- matrix(sample(0:1, 200*50, replace = TRUE),200,50)
high_t<-70
bottom_t<-70
sums <- rbind(colSums(Mat[1:high_t,]),colSums(Mat[(high_t+1):(nrow(Mat)-bottom_t),]),colSums(Mat[(nrow(Mat)-bottom_t+1):nrow(Mat),]))
res <- apply(sums,2,which.max)
For each interval, use colSums to sum the columns, then rbind the results and use which.max to find which interval has the most 1s, 1 for top, 2 for middle and 3 for bottom.
I changed your thresholds because the middle always wins if you choose 50 and 30 (the middle then has 120 rows)
library(reshape2)
library(gplots)
library(ggplot2)
Mat <- matrix(sample(0:1, 200*50, replace = TRUE), 200, 50)
low_cut <- 50
high_cut <- 30
lows <- apply(Mat, 2, function(x) sum(x[1:low_cut]))
highs <- apply(Mat, 2, function(x) sum(x[(length(x)-high_cut):length(x)]))
totals <- colSums(Mat)
mids <- totals - lows - highs
results <- data.frame(id = 1:NCOL(Mat),
lows = lows,
mids = mids,
highs = highs)
excludeVars <- names(results) %in% c('id')
image(as.matrix(results[!excludeVars]))
heatmap.2(as.matrix(results[!excludeVars]),
trace = "none")
melted_results <- melt(results, id.vars = "id")
ggplot(melted_results, aes(x=variable, y=id)) +
geom_tile(aes(fill=value))

Faceted qqplots with ggplot2

Say I have the following data:
datapoints1 = data.frame(categ=c(rep(1, n), rep(2, n)), vals1=c(rt(n, 1, 2), rnorm(n, 3, 4)))
datapoints2 = data.frame(categ=c(rep(1, n), rep(2, n)), vals2=c(rt(n, 5, 6), rnorm(n, 7, 8)))
Using ggplot2, how can I use the facet functionality to create in a single command two QQplots, i.e. one with the two t samples, the other with the two Gaussian samples?
First, combine both data frames:
dat <- cbind(datapoints1, vals2 = datapoints2[ , 2])
Then, sort the data:
dat_sort <- do.call("rbind", lapply(unique(dat$categ), FUN = function(x) {data.frame(categ = x, vals1 = sort(dat$vals1[dat$categ == x]), vals2 = sort(dat$vals2[dat$categ == x]))}))
It is simple if both sample vectors are of the same length:
ggplot() +
geom_point(data = dat_sort, aes(x = vals1, y = vals2)) +
facet_wrap( ~ categ, scales = "free")
An example with n = 1000:

Resources