Related
I am trying to add lm model coefs of two parallel modelling results onto the same ggplot plot. Here is my working example:
library(ggplot2)
set.seed(100)
dat <- data.frame(
x <- rnorm(100, 1),
y <- rnorm(100, 10),
lev <- gl(n = 2, k = 50, labels = letters[1:2])
)
mod1 <- lm(y~x, dat = dat[lev %in% "a", ])
r1 <- paste("R^2==", round(summary(mod1)[[9]], 3))
p1<- paste("p==", round(summary(mod1)[[4]][2, 4], 3), sep= "")
lab1 <- paste(r1, p1, sep =",")
mod2 <- lm(y~x, dat = dat[lev %in% "b", ])
r2 <- paste("R^2==", round(summary(mod2)[[9]], 3))
p2 <- paste("p==", round(summary(mod2)[[4]][2, 4], 3), sep= "")
lab2 <- paste(r2, p2, sep =",")
ggplot(dat, aes(x = x, y = y, col = lev)) + geom_jitter() + geom_smooth(method = "lm") + annotate("text", x = 2, y = 12, label = lab1, parse = T) + annotate("text", x = 10, y = 8, label = lab2, parse = T)
Here is the promot shows:
Error in parse(text = text[[i]]) : <text>:1:12: unexpected ','
1: R^2== 0.008,
Now the problem is that I could label either R2 or p value seperately, but not both of them together. How could I do to put the two results into one single line on the figure?
BTW, any other efficienty way of doing the same thing as my code? I have nine subplots that I want to put into one full plot, and I don't want to add them one by one.
++++++++++++++++++++++++++ Some update ++++++++++++++++++++++++++++++++++
Following #G. Grothendieck 's kind suggestion and idea, I tried to wrap the most repeatative part of the codes into a function, so I could finish all the plot with a few lines. Now the problem is that, whatever I changed the input variables, the output plot are basically the same, except the axis labels. Can anyone explain why? The following is the working code I used:
library(ggplot2)
library(ggpubr)
set.seed(100)
dat <- data.frame(
x = rnorm(100, 1),
y = rnorm(100, 10),
z = rnorm(100, 25),
lev = gl(n = 2, k = 50, labels = letters[1:2])
)
test <- function(dat, x, y){
fmt <- "%s: Adj ~ R^2 == %.3f * ',' ~ {p == %.3f}"
mod1 <- lm(y ~ x, dat, subset = lev == "a")
sum1 <- summary(mod1)
lab1 <- sprintf(fmt, "a", sum1$adj.r.squared, coef(sum1)[2, 4])
mod2 <- lm(y ~ x, dat, subset = lev == "b")
sum2 <- summary(mod2)
lab2 <- sprintf(fmt, "b", sum2$adj.r.squared, coef(sum2)[2, 4])
colors <- 1:2
p <- ggplot(dat, aes(x = x, y = y, col = lev)) +
geom_jitter() +
geom_smooth(method = "lm") +
annotate("text", x = 2, y = c(12, 8), label = c(lab1, lab2),
parse = TRUE, hjust = 0, color = colors) +
scale_color_manual(values = colors)
return(p)
}
ggarrange(test(dat, x, z), test(dat, y, z))
There are several problems here:
x, y and lev are arguments to data.frame so they must be specified using = rather than <-
make use of the subset= argument in lm
use sprintf instead of paste to simplify the specification of labels
label the text strings a and b and make them the same color as the corresponding lines to identify which is which
the formula syntax needs to be corrected. See fmt below.
it would be clearer to use component names and accessor functions of the summary objects where available
use TRUE rather than T because the latter can be overridden if there is a variable called T but TRUE can never be overridden.
use hjust=0 and adjust the x= and y= in annotate to align the two text strings
combine the annotate statements
place the individual terms of the ggplot statement on separate lines for improved readability
This gives:
library(ggplot2)
set.seed(100)
dat <- data.frame(
x = rnorm(100, 1),
y = rnorm(100, 10),
lev = gl(n = 2, k = 50, labels = letters[1:2])
)
fmt <- "%s: Adj ~ R^2 == %.3f * ',' ~ {p == %.3f}"
mod1 <- lm(y ~ x, dat, subset = lev == "a")
sum1 <- summary(mod1)
lab1 <- sprintf(fmt, "a", sum1$adj.r.squared, coef(sum1)[2, 4])
mod2 <- lm(y ~ x, dat, subset = lev == "b")
sum2 <- summary(mod2)
lab2 <- sprintf(fmt, "b", sum2$adj.r.squared, coef(sum2)[2, 4])
colors <- 1:2
ggplot(dat, aes(x = x, y = y, col = lev)) +
geom_jitter() +
geom_smooth(method = "lm") +
annotate("text", x = 2, y = c(12, 8), label = c(lab1, lab2),
parse = TRUE, hjust = 0, color = colors) +
scale_color_manual(values = colors)
Unless I'm misunderstanding your question, the problem's with the parse = T arguments to your annotate calls. I don't think your strings need to be parsed. Try parse = F instead, or just drop the parameter, as the default value seems to be FALSE anyway
I wanted to make a graph using facet_wrap and plot it in different pages in a pdf file. I've read son many options, and this works:
R + ggplot: plotting over multiple pages
but only when you have the same rows in each page.
I have this demo data to try explain my case:
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
So, I have ID with different number of observations. I tried to make a vector with the number of observation in each ID (I want an ID per page), but it doesn't work.
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
slice2 = slice[!(slice > nrow(A2))]
A3 = A2[slice2,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()
Could you help me?
Thanks in advances,
Nacho
I think you were overthinking trying to calculate your "slices". Maybe you want this?
Not entirely sure. If you only want one ID per page you don't need facet_wrap, and you will probably need to set the scale explicitly to keep it the same from page to page.
library(plyr)
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
# slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
# slice2 = slice[!(slice > nrow(A2))]
# A3 = A2[slice2,]
A3 = A2[A2$ID==i,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()
I'm producing a plot like this:
library(ggplot2)
data.dist = matrix(
c(10, -10, 10, -10, 10, -10, 10, -10, 10),
nrow=3,
ncol=3,
byrow = TRUE)
hc <- agnes(dist(data.dist), method = "ward", diss = TRUE)
cluster <- cutree(hc, k=2)
xy <- data.frame(cmdscale(dist(data.dist)), factor(cluster))
names(xy) <- c("x", "y", "cluster")
xy$model <- rownames(xy)
ggplot(xy, aes(x, y)) + geom_point(aes(colour=cluster), size=3)
Which gives me:
However, let's say I want to attach another covariate, say a binary variable c(1, 0, 1) to the data and display all 1 using one symbol (say an X) and all 0 using another symbol (say a dot). How can I accomplish this?
xy<-data.frame(x=rnorm(3),y=rnorm(3),cluster=as.factor(c(1,0,1)),another=as.factor(c(1,1,0)) )
ggplot(xy, aes(x, y,shape=another)) + geom_point(aes(colour=cluster), size=3)
I'm trying to modify the facet labels in the plot created in this post.
To repeat the code:
library(ggplot2)
mymatrix1 <- matrix(data = 0, nrow = 105, ncol =2)
mymatrix2 <- matrix(data = 0, nrow = 108, ncol =2)
mymatrix3 <- matrix(data = 0, nrow = 112, ncol =2)
mymatrix1[,1] <- sample(0:1, 105, replace= TRUE)
mymatrix1[,2] <- rnorm(105, 52, 2)
mymatrix2[,1] <- sample(0:1, 108, replace= TRUE)
mymatrix2[,2] <- rnorm(108, 60, 2)
mymatrix3[,1] <- sample(0:1, 112, replace= TRUE)
mymatrix3[,2] <- rnorm(112, 70, 2)
mydata <- list(mymatrix1, mymatrix2,mymatrix3)
for(i in 1:3){
mydata[[i]] <- cbind(mydata[[i]], i)
colnames(mydata[[i]]) <- c("class", "readcount", "group")
}
mydata <- as.data.frame(do.call(rbind, mydata))
## fixed scales
p <- qplot(class, readcount, data = mydata, geom="boxplot", fill = factor(class)) +
geom_jitter(position=position_jitter(w=0.1,h=0.1)) +
scale_x_continuous(breaks=c(0,1), labels=c("0", "1")) +
facet_wrap(~ group)
## free scales
p + facet_wrap(~ group, scales = "free")
My goal, in this plot, is to change the facet labels 1, 2 and 3 to something like "a" "b" or "c."
I thought that simply changing the data frame column "group" to letters instead of numbers in the for loop:
for(i in 1:3){
mydata[[i]] <- cbind(mydata[[i]], letters[i])
colnames(mydata[[i]]) <- c("class", "readcount", "group")
}
would work, but I get this error:
Error: Discrete value supplied to continuous scale.
Any suggestions? I looked at the other posts for changing facet labels, but they didn't seem to work for this type of plot (at least for me).
Thanks so much.
Are you looking for something like this?
mydata$group <- as.factor(mydata$group)
levels(mydata$group) <- c("A","B","C")
I am trying to output multiple density plot from a function, by dividing the dataframe into pieces such that separate density for each level of a factor for corresponding yvar.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(gen, yvar)
minyvar <- min(yvar)
maxyvar <- max(yvar)
par(mfrow = c(length(levels(mydf$gen)),1))
plotdensity <- function (xf, minyvar, maxyvar){
plot(density(xf), xlim=c(minyvar, maxyvar), main = paste (names(xf),
"distribution", sep = ""))
dens <- density(xf)
x1 <- min(which(dens$x >= quantile(xf, .80)))
x2 <- max(which(dens$x < max(dens$x)))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="blu4"))
abline(v= mean(xf), col = "black", lty = 1, lwd =2)
}
require(plyr)
ddply(mydf, .(mydf$gen), plotdensity, yvar, minyvar, maxyvar)
Error in .fun(piece, ...) : unused argument(s) (111.544494112914)
My specific expectation are each plot is named by name of level for example Aa, Bb, Cc, Dd
Arrangement of the graphs see the parameter set, so that we compare density changes and means. compact - Low space between the graphs.
Help appreciated.
Edits:
The following graphs are individually produced, although I want to develop a function that can be applicable to x level for a factor.
I see that #Andrie just beat me to most of this. I'm still going to post my answer, since filling only certain quantiles of the distribution requires a slightly different approach.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
ggplot(mydf,aes(x = x)) +
facet_wrap(~grp) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0, fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),colour = "black")
Here is a way of doing it in ggplot:
set.seed(1234)
mydf <- rbind(
data.frame(gen="Aa", yvar= rnorm(40000, 50, 10)),
data.frame(gen="Bb", yvar=rnorm(4000, 70, 10)),
data.frame(gen="Cc", yvar=rnorm(400, 75, 10)),
data.frame(gen="Dd", yvar=rnorm(40, 80, 10))
)
labels <- ddply(mydf, .(gen), nrow)
means <- ddply(mydf, .(gen), summarize, mean=mean(yvar))
ggplot(mydf, aes(x=yvar)) +
stat_density(fill="blue") +
facet_grid(gen~.) +
theme_bw() +
geom_vline(data=means, aes(xintercept=mean), colour="red") +
geom_text(data=labels, aes(label=paste("n =", V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution")
With sincere thanks to joran and Andrie, the following is just compilation of my favorite from above two posts, just some of readers might want to see.
require(ggplot2)
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
mydf1 <- mydf
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
labels <- ddply(mydf1, .(grp), nrow)
ggplot(mydf,aes(x = x)) +
facet_grid(grp~.) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0,
fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),
colour = "black") + geom_text(data=labels,
aes(label=paste("n =", labels$V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution") + theme_bw()