Faceted qqplots with ggplot2

Faceted qqplots with ggplot2 - r

Say I have the following data:
datapoints1 = data.frame(categ=c(rep(1, n), rep(2, n)), vals1=c(rt(n, 1, 2), rnorm(n, 3, 4)))
datapoints2 = data.frame(categ=c(rep(1, n), rep(2, n)), vals2=c(rt(n, 5, 6), rnorm(n, 7, 8)))
Using ggplot2, how can I use the facet functionality to create in a single command two QQplots, i.e. one with the two t samples, the other with the two Gaussian samples?

First, combine both data frames:
dat <- cbind(datapoints1, vals2 = datapoints2[ , 2])
Then, sort the data:
dat_sort <- do.call("rbind", lapply(unique(dat$categ), FUN = function(x) {data.frame(categ = x, vals1 = sort(dat$vals1[dat$categ == x]), vals2 = sort(dat$vals2[dat$categ == x]))}))
It is simple if both sample vectors are of the same length:
ggplot() +
geom_point(data = dat_sort, aes(x = vals1, y = vals2)) +
facet_wrap( ~ categ, scales = "free")
An example with n = 1000:

Related

ggforce geom_mark_ellipse spanning categorial / factor values

I'm trying to use geom_mark_ellipse from ggforce package for circling a specific subset of my data. While one dimension of my data is numeric, the other is categorial and when trying to add a single ellipse geom_mark_ellipse draws two ellipses for each applicable value of the categorial dimension. This probably sounds more complicated than is, so here's a simple example:
library(tidyverse)
library(ggforce)
set.seed(20)
my_data <- tibble(x = rnorm(20, 7, 5),
y = factor(rep(c("a", "b", "c", "d"), 5), ordered = TRUE),
z = rnorm(20, 10, 2))
ggplot(my_data, aes(x, y, size = z)) +
geom_point() +
geom_mark_ellipse(size = 1, aes(filter = ((y %in% c("c", "d")) & (x > 6) ))) +
ggtitle("geom_ellipse with factor on y-axis")
This yields following chart:
While I'd like to get a chart like this:
set.seed(20)
my_data2 <- tibble(x = rnorm(20, 7, 5),
y = rep(c(1,2,3,4), 5),
z = rnorm(20, 10, 2))
ggplot(my_data2, aes(x, y, size = z)) +
geom_point() +
geom_mark_ellipse(size = 1, aes(filter = ((y %in% c(3, 4)) & (x > 6) ))) +
ggtitle("geom_ellipse with numerical on y-axis")
Certainly, a workaround is to coerce the factor into numeric - still: is there a way to "tell" geom_mark_ellipse to span factor levels (or other type of categorial variable values) or is this behaviour an intended feature?

Labelling R2 and p value in ggplot?

I am trying to add lm model coefs of two parallel modelling results onto the same ggplot plot. Here is my working example:
library(ggplot2)
set.seed(100)
dat <- data.frame(
x <- rnorm(100, 1),
y <- rnorm(100, 10),
lev <- gl(n = 2, k = 50, labels = letters[1:2])
)
mod1 <- lm(y~x, dat = dat[lev %in% "a", ])
r1 <- paste("R^2==", round(summary(mod1)[[9]], 3))
p1<- paste("p==", round(summary(mod1)[[4]][2, 4], 3), sep= "")
lab1 <- paste(r1, p1, sep =",")
mod2 <- lm(y~x, dat = dat[lev %in% "b", ])
r2 <- paste("R^2==", round(summary(mod2)[[9]], 3))
p2 <- paste("p==", round(summary(mod2)[[4]][2, 4], 3), sep= "")
lab2 <- paste(r2, p2, sep =",")
ggplot(dat, aes(x = x, y = y, col = lev)) + geom_jitter() + geom_smooth(method = "lm") + annotate("text", x = 2, y = 12, label = lab1, parse = T) + annotate("text", x = 10, y = 8, label = lab2, parse = T)
Here is the promot shows:
Error in parse(text = text[[i]]) : <text>:1:12: unexpected ','
1: R^2== 0.008,
Now the problem is that I could label either R2 or p value seperately, but not both of them together. How could I do to put the two results into one single line on the figure?
BTW, any other efficienty way of doing the same thing as my code? I have nine subplots that I want to put into one full plot, and I don't want to add them one by one.
++++++++++++++++++++++++++ Some update ++++++++++++++++++++++++++++++++++
Following #G. Grothendieck 's kind suggestion and idea, I tried to wrap the most repeatative part of the codes into a function, so I could finish all the plot with a few lines. Now the problem is that, whatever I changed the input variables, the output plot are basically the same, except the axis labels. Can anyone explain why? The following is the working code I used:
library(ggplot2)
library(ggpubr)
set.seed(100)
dat <- data.frame(
x = rnorm(100, 1),
y = rnorm(100, 10),
z = rnorm(100, 25),
lev = gl(n = 2, k = 50, labels = letters[1:2])
)
test <- function(dat, x, y){
fmt <- "%s: Adj ~ R^2 == %.3f * ',' ~ {p == %.3f}"
mod1 <- lm(y ~ x, dat, subset = lev == "a")
sum1 <- summary(mod1)
lab1 <- sprintf(fmt, "a", sum1$adj.r.squared, coef(sum1)[2, 4])
mod2 <- lm(y ~ x, dat, subset = lev == "b")
sum2 <- summary(mod2)
lab2 <- sprintf(fmt, "b", sum2$adj.r.squared, coef(sum2)[2, 4])
colors <- 1:2
p <- ggplot(dat, aes(x = x, y = y, col = lev)) +
geom_jitter() +
geom_smooth(method = "lm") +
annotate("text", x = 2, y = c(12, 8), label = c(lab1, lab2),
parse = TRUE, hjust = 0, color = colors) +
scale_color_manual(values = colors)
return(p)
}
ggarrange(test(dat, x, z), test(dat, y, z))

There are several problems here:
x, y and lev are arguments to data.frame so they must be specified using = rather than <-
make use of the subset= argument in lm
use sprintf instead of paste to simplify the specification of labels
label the text strings a and b and make them the same color as the corresponding lines to identify which is which
the formula syntax needs to be corrected. See fmt below.
it would be clearer to use component names and accessor functions of the summary objects where available
use TRUE rather than T because the latter can be overridden if there is a variable called T but TRUE can never be overridden.
use hjust=0 and adjust the x= and y= in annotate to align the two text strings
combine the annotate statements
place the individual terms of the ggplot statement on separate lines for improved readability
This gives:
library(ggplot2)
set.seed(100)
dat <- data.frame(
x = rnorm(100, 1),
y = rnorm(100, 10),
lev = gl(n = 2, k = 50, labels = letters[1:2])
)
fmt <- "%s: Adj ~ R^2 == %.3f * ',' ~ {p == %.3f}"
mod1 <- lm(y ~ x, dat, subset = lev == "a")
sum1 <- summary(mod1)
lab1 <- sprintf(fmt, "a", sum1$adj.r.squared, coef(sum1)[2, 4])
mod2 <- lm(y ~ x, dat, subset = lev == "b")
sum2 <- summary(mod2)
lab2 <- sprintf(fmt, "b", sum2$adj.r.squared, coef(sum2)[2, 4])
colors <- 1:2
ggplot(dat, aes(x = x, y = y, col = lev)) +
geom_jitter() +
geom_smooth(method = "lm") +
annotate("text", x = 2, y = c(12, 8), label = c(lab1, lab2),
parse = TRUE, hjust = 0, color = colors) +
scale_color_manual(values = colors)

Unless I'm misunderstanding your question, the problem's with the parse = T arguments to your annotate calls. I don't think your strings need to be parsed. Try parse = F instead, or just drop the parameter, as the default value seems to be FALSE anyway

Interpolate curved line betweenstart and end points for ggplot2

I'd like to create a sankey-like plot that I can create in ggplot2 where there are curved lines between my start and end locations. Currently, I have data that looks like this:
df <- data.frame(Line = rep(letters[1:4], 2),
Location = rep(c("Start", "End"), each=4),
X = rep(c(1, 10), each = 4),
Y = c(c(1,3, 5, 15), c(9,12, 14, 6)),
stringsAsFactors = F)
ex:
Line Location X Y
1 a Start 1 1
2 a End 10 9
and creates a plot that looks something like this:
library(ggplot2)
ggplot(df) +
geom_path(aes(x= X, y= Y, group = Line))
I would like to see the data come out like this:
This is another option for setting up the data:
df2 <- data.frame(Line = letters[1:4],
Start.X= rep(1, 4),
Start.Y = c(1,3,5,15),
End.X = rep(10, 4),
End.Y = c(9,12,14,6))
ex:
Line Start.X Start.Y End.X End.Y
1 a 1 1 10 9
I can find examples of how to add a curve to the graphics of base R but these examples don't demonstrate how to get a data frame of the points in between in order to draw that curve. I would prefer to use dplyr for data manipulation. I imagine this will require a for-loop to build a table of the interpolated points.
These examples are similar but do not produce an s-shaped curve:
Plotting lines on map - gcIntermediate
http://flowingdata.com/2011/05/11/how-to-map-connections-with-great-circles/
Thank you in advance!

The code below creates curved lines via a logistic function. You could use whatever function you like instead, but this is the main idea. I should note that for other than graphical purposes, creating a curved line out of 2 points is a bad idea. It implies that the data show a certain type of relation while it actually doesn't imply that relation.
df <- data.frame(Line = rep(letters[1:4], 2),
Location = rep(c("Start", "End"), each=4),
X = rep(c(1, 10), each = 4),
Y = c(c(1,3, 5, 15), c(9,12, 14, 6)),
stringsAsFactors = F)
# logistic function for curved lines
logistic = function(x, y, midpoint = mean(x)) {
ry = range(y)
if (y[1] < y[2]) {
sign = 2
} else {
sign = -2
}
steepness = sign*diff(range(x)) / diff(ry)
out = (ry[2] - ry[1]) / (1 + exp(-steepness * (x - midpoint))) + ry[1]
return(out)
}
# an example
x = c(1, 10)
y = c(1, 9)
xnew = seq(1, 10, .5)
ynew = logistic(xnew, y)
plot(x, y, type = 'b', bty = 'n', las = 1)
lines(xnew, ynew, col = 2, type = 'b')
# applying the function to your example
xnew = seq(min(df$X), max(df$X), .1) # new x grid
m = matrix(NA, length(xnew), 4) # matrix to store results
uniq = unique(df$Line) # loop over all unique values in df$Line
for (i in seq_along(uniq)) {
m[, i] = logistic(xnew, df$Y[df$Line == uniq[i]])
}
# base R plot
matplot(xnew, m, type = 'b', las = 1, bty = 'n', pch = 1)
# put stuff in a dataframe for ggplot
df2 = data.frame(x = rep(xnew, ncol(m)),
y = c(m),
group = factor(rep(1:ncol(m), each = nrow(m))))
library(ggplot2)
ggplot(df) +
geom_path(aes(x= X, y= Y, group = Line, color = Line)) +
geom_line(data = df2, aes(x = x, y = y, group = group, color = group))

ggplot, plotting in multiple pages with different rows

I wanted to make a graph using facet_wrap and plot it in different pages in a pdf file. I've read son many options, and this works:
R + ggplot: plotting over multiple pages
but only when you have the same rows in each page.
I have this demo data to try explain my case:
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
So, I have ID with different number of observations. I tried to make a vector with the number of observation in each ID (I want an ID per page), but it doesn't work.
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
slice2 = slice[!(slice > nrow(A2))]
A3 = A2[slice2,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()
Could you help me?
Thanks in advances,
Nacho

I think you were overthinking trying to calculate your "slices". Maybe you want this?
Not entirely sure. If you only want one ID per page you don't need facet_wrap, and you will probably need to set the scale explicitly to keep it the same from page to page.
library(plyr)
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
# slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
# slice2 = slice[!(slice > nrow(A2))]
# A3 = A2[slice2,]
A3 = A2[A2$ID==i,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()

Multiple density graphs different groups (based on factor level) using plyr

I am trying to output multiple density plot from a function, by dividing the dataframe into pieces such that separate density for each level of a factor for corresponding yvar.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(gen, yvar)
minyvar <- min(yvar)
maxyvar <- max(yvar)
par(mfrow = c(length(levels(mydf$gen)),1))
plotdensity <- function (xf, minyvar, maxyvar){
plot(density(xf), xlim=c(minyvar, maxyvar), main = paste (names(xf),
"distribution", sep = ""))
dens <- density(xf)
x1 <- min(which(dens$x >= quantile(xf, .80)))
x2 <- max(which(dens$x < max(dens$x)))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="blu4"))
abline(v= mean(xf), col = "black", lty = 1, lwd =2)
}
require(plyr)
ddply(mydf, .(mydf$gen), plotdensity, yvar, minyvar, maxyvar)
Error in .fun(piece, ...) : unused argument(s) (111.544494112914)
My specific expectation are each plot is named by name of level for example Aa, Bb, Cc, Dd
Arrangement of the graphs see the parameter set, so that we compare density changes and means. compact - Low space between the graphs.
Help appreciated.
Edits:
The following graphs are individually produced, although I want to develop a function that can be applicable to x level for a factor.

I see that #Andrie just beat me to most of this. I'm still going to post my answer, since filling only certain quantiles of the distribution requires a slightly different approach.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
ggplot(mydf,aes(x = x)) +
facet_wrap(~grp) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0, fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),colour = "black")

Here is a way of doing it in ggplot:
set.seed(1234)
mydf <- rbind(
data.frame(gen="Aa", yvar= rnorm(40000, 50, 10)),
data.frame(gen="Bb", yvar=rnorm(4000, 70, 10)),
data.frame(gen="Cc", yvar=rnorm(400, 75, 10)),
data.frame(gen="Dd", yvar=rnorm(40, 80, 10))
)
labels <- ddply(mydf, .(gen), nrow)
means <- ddply(mydf, .(gen), summarize, mean=mean(yvar))
ggplot(mydf, aes(x=yvar)) +
stat_density(fill="blue") +
facet_grid(gen~.) +
theme_bw() +
geom_vline(data=means, aes(xintercept=mean), colour="red") +
geom_text(data=labels, aes(label=paste("n =", V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution")

With sincere thanks to joran and Andrie, the following is just compilation of my favorite from above two posts, just some of readers might want to see.
require(ggplot2)
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
mydf1 <- mydf
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
labels <- ddply(mydf1, .(grp), nrow)
ggplot(mydf,aes(x = x)) +
facet_grid(grp~.) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0,
fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),
colour = "black") + geom_text(data=labels,
aes(label=paste("n =", labels$V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution") + theme_bw()