ggplot, plotting in multiple pages with different rows

ggplot, plotting in multiple pages with different rows - r

I wanted to make a graph using facet_wrap and plot it in different pages in a pdf file. I've read son many options, and this works:
R + ggplot: plotting over multiple pages
but only when you have the same rows in each page.
I have this demo data to try explain my case:
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
So, I have ID with different number of observations. I tried to make a vector with the number of observation in each ID (I want an ID per page), but it doesn't work.
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
slice2 = slice[!(slice > nrow(A2))]
A3 = A2[slice2,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()
Could you help me?
Thanks in advances,
Nacho

I think you were overthinking trying to calculate your "slices". Maybe you want this?
Not entirely sure. If you only want one ID per page you don't need facet_wrap, and you will probably need to set the scale explicitly to keep it the same from page to page.
library(plyr)
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
# slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
# slice2 = slice[!(slice > nrow(A2))]
# A3 = A2[slice2,]
A3 = A2[A2$ID==i,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()

Related

Creating a plot from random sample with mean and standard deviation

Repeat the following 10 times and calculate the mean each time: sample
30 observations from a normally-distributed population having mean 0
and standard deviation 2. Create a data.frame containing the output
from the 10 simulations and generate a single plot demonstrating the
mean and st.dev of each of 10 samples.
I am a complete beginner and don't know where to go from here:
tensample <- replicate(10, rnorm(30, mean = 0, sd = 2))
tensampleDF <- data.frame(tensample)
I know I can find the mean and sd of each of the samples like so:
means <- colMeans(tensampleDF)
sd <- apply(tensampleDF, 2, sd)
But how to plot them together?

This will off course depend on which graphics system ist meant to be used. This is a way for base graphics:
tensample <- replicate(10, rnorm(30, mean = 0, sd = 2))
tensampleDF <- data.frame(tensample)
m <- colMeans(tensampleDF)
upper <- m + apply(tensampleDF, 2, sd)
lower <- m - apply(tensampleDF, 2, sd)
plot(1:10, colMeans(tensampleDF), pch = 15, ylim = c(-5, 5),
xlab = "x", ylab = "y")
arrows(x0 =1:10, x1 = 1:10, y0 = lower, y1 = upper, length = 0)
It will produce something like

This is a ggplot2 answer
tensample <- replicate(10, rnorm(30, mean = 0, sd = 2))
tensampleDF <- data.frame(tensample)
m = colMeans(tensampleDF)
d <- data.frame(id = 1:10,
m = m,
upper = m + apply(tensampleDF, 2, sd),
lower = m - apply(tensampleDF, 2, sd))
library(ggplot2)
ggplot(d) +
geom_pointrange(aes(x=id, y=m, ymin=lower, ymax = upper))
You should correct the x-axis stops etc but now your are free to choose the graphics system.
Edit:
In order to achieve acceptable axes maybe do something more along the lines of
ggplot(d) +
geom_pointrange(aes(x=id, y=m, ymin=lower, ymax = upper)) +
scale_x_continuous(breaks = 1:10, minor_breaks = NULL) +
xlab("x") +
ylab("y") +
theme_bw()

R: ggplot2 and loop with data.table

I have to make 4 plots which differ only for y and ylab.
I start from a data.table dt which is
set.seed(123)
dt <- data.table(a = rnorm(20),
b = rnorm(20),
c = rnorm(20),
d = rnorm(20),
e = rnorm(20))
Every single plot should be a scatterplot with row numbers as x vs y values. Additionally, I want to plot some hline at median(y) + h*mad(y) where h = c(0, -2, 2, -3, 3)
This plot should be repeated for columns a, c, d and e of dt.
I came up with this bit of code
# Defining y labels #
ylabels <- c(bquote(phantom(.)^100*A~"/"*phantom(.)^200*A),
bquote(phantom(.)^101*C~"/"*phantom(.)^201*B),
bquote(phantom(.)^102*D~"/"*phantom(.)^202*D),
bquote(phantom(.)^103*E~"/"*phantom(.)^202*E))
# Selecting columns of dt
ydata <- names(dt)[c(1, 3, 4, 5)]
h <- c(0, -2, 2, -3, 3)
hcol <- c("#009E73", "#E69F00", "#E69F00", "red", "red")
# The for cycle should create the 4 plots and assign them to a list
plots <- list()
for (i in seq_along(ydata)) {
p1 <- ggplot(dt, aes_string(x = seq(1, dt[, .N]), y = ydata[i])) +
geom_point() +
geom_hline(aes_string(yintercept = median(ydata[i]) +
h * mad(ydata[i])), color = hcol) +
xlab("Replicate") +
ylab(ylabels[i]) +
scale_x_continuous(breaks = seq(1, dt[,.N])))
plots[[i]] <- p1 # add each plot into plot list
}
Then plots will be fed to the multiplot function from Cookbook for R.
However my loop doesn't work properly because it fails to calculate the median and mad values.
Do you have any suggestions to make the code work?

# data.table with the median +- h* mad values
hline.values <- dt[, lapply(.SD, function(x) median(x) + h * mad(x)),
.SDcols = ydata]
# new empty list
plots <- list()
for (i in seq_along(ydata)) {
p1 <- ggplot(dt, aes_string(x = seq(1, dt[, .N]), y = ydata[i])) +
geom_point() +
geom_hline(data = hline.values,
aes_string(yintercept = ydata[i])) +
# Axis labels and theme
xlab("Replicate") +
ylab(ylabels[[i]]) +
scale_x_continuous(breaks = seq(1, dt[, .N]))
plots[[i]] <- p1
}

Benchmark for several sample sizes

I know how to compare several solutions in terms of performance with mircrobenchmark. But quite often I need to do it over several sample sizes, e.g. 10^4, 10^5, 10^6, 10^7 and box plot results with sample size on the x axis.
I have got this code that does the lines for mean:
set.seed(1001)
N <- c(3, 4, 5, 6 ,7)
n <- 10^N
res <- lapply(n, function(x) {
d <- sample(1:x/10, 5 * x, replace=T)
d <- c(d, sample(d, x, replace=T)) # ensure many duplicates
dt <- data.table(d)
mb <- microbenchmark::microbenchmark(
"duplicated(original)" = d[!(duplicated(d) | duplicated(d, fromLast=TRUE))],
"tabulate" = { ud = unique(d); ud[tabulate(match(d, ud)) == 1L] },
"data.table" = dt[, count:= .N, by = d][count == 1]$d,
times = 1,unit = "ms")
sm <- summary(mb)[, c(1, 4, 8)]
sm$size = x
return(sm)
})
res <- do.call("rbind", res)
require(ggplot2)
##The values Year, Value, School_ID are
##inherited by the geoms
ggplot(res, aes(x = res$size, y = res$mean, colour=res$exp)) +
geom_line() + scale_x_log10() + scale_y_log10() +
geom_point()
By it would be much nicer to have box plot if times parameters of microbenchmark is not one. Is there an elegant way to achieve this?

Faceted qqplots with ggplot2

Say I have the following data:
datapoints1 = data.frame(categ=c(rep(1, n), rep(2, n)), vals1=c(rt(n, 1, 2), rnorm(n, 3, 4)))
datapoints2 = data.frame(categ=c(rep(1, n), rep(2, n)), vals2=c(rt(n, 5, 6), rnorm(n, 7, 8)))
Using ggplot2, how can I use the facet functionality to create in a single command two QQplots, i.e. one with the two t samples, the other with the two Gaussian samples?

First, combine both data frames:
dat <- cbind(datapoints1, vals2 = datapoints2[ , 2])
Then, sort the data:
dat_sort <- do.call("rbind", lapply(unique(dat$categ), FUN = function(x) {data.frame(categ = x, vals1 = sort(dat$vals1[dat$categ == x]), vals2 = sort(dat$vals2[dat$categ == x]))}))
It is simple if both sample vectors are of the same length:
ggplot() +
geom_point(data = dat_sort, aes(x = vals1, y = vals2)) +
facet_wrap( ~ categ, scales = "free")
An example with n = 1000:

Multiple density graphs different groups (based on factor level) using plyr

I am trying to output multiple density plot from a function, by dividing the dataframe into pieces such that separate density for each level of a factor for corresponding yvar.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(gen, yvar)
minyvar <- min(yvar)
maxyvar <- max(yvar)
par(mfrow = c(length(levels(mydf$gen)),1))
plotdensity <- function (xf, minyvar, maxyvar){
plot(density(xf), xlim=c(minyvar, maxyvar), main = paste (names(xf),
"distribution", sep = ""))
dens <- density(xf)
x1 <- min(which(dens$x >= quantile(xf, .80)))
x2 <- max(which(dens$x < max(dens$x)))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="blu4"))
abline(v= mean(xf), col = "black", lty = 1, lwd =2)
}
require(plyr)
ddply(mydf, .(mydf$gen), plotdensity, yvar, minyvar, maxyvar)
Error in .fun(piece, ...) : unused argument(s) (111.544494112914)
My specific expectation are each plot is named by name of level for example Aa, Bb, Cc, Dd
Arrangement of the graphs see the parameter set, so that we compare density changes and means. compact - Low space between the graphs.
Help appreciated.
Edits:
The following graphs are individually produced, although I want to develop a function that can be applicable to x level for a factor.

I see that #Andrie just beat me to most of this. I'm still going to post my answer, since filling only certain quantiles of the distribution requires a slightly different approach.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
ggplot(mydf,aes(x = x)) +
facet_wrap(~grp) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0, fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),colour = "black")

Here is a way of doing it in ggplot:
set.seed(1234)
mydf <- rbind(
data.frame(gen="Aa", yvar= rnorm(40000, 50, 10)),
data.frame(gen="Bb", yvar=rnorm(4000, 70, 10)),
data.frame(gen="Cc", yvar=rnorm(400, 75, 10)),
data.frame(gen="Dd", yvar=rnorm(40, 80, 10))
)
labels <- ddply(mydf, .(gen), nrow)
means <- ddply(mydf, .(gen), summarize, mean=mean(yvar))
ggplot(mydf, aes(x=yvar)) +
stat_density(fill="blue") +
facet_grid(gen~.) +
theme_bw() +
geom_vline(data=means, aes(xintercept=mean), colour="red") +
geom_text(data=labels, aes(label=paste("n =", V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution")

With sincere thanks to joran and Andrie, the following is just compilation of my favorite from above two posts, just some of readers might want to see.
require(ggplot2)
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
mydf1 <- mydf
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
labels <- ddply(mydf1, .(grp), nrow)
ggplot(mydf,aes(x = x)) +
facet_grid(grp~.) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0,
fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),
colour = "black") + geom_text(data=labels,
aes(label=paste("n =", labels$V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution") + theme_bw()

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

ggplot, plotting in multiple pages with different rows - r

Related

Creating a plot from random sample with mean and standard deviation

R: ggplot2 and loop with data.table

Benchmark for several sample sizes

Faceted qqplots with ggplot2

Multiple density graphs different groups (based on factor level) using plyr

Categories

Resources