How to create faceted linear regression plot using GGPLOT - r

I have a data frame created the following way.
library(ggplot2)
x <- data.frame(letters[1:10],abs(rnorm(10)),abs(rnorm(10)),type="x")
y <- data.frame(letters[1:10],abs(rnorm(10)),abs(rnorm(10)),type="y")
# in reality the number of row could be larger than 10 for each x and y
all <- rbind(x,y)
colnames(all) <- c("name","val1","val2","type")
What I want to do is to create a faceted ggplot that looks roughly like this:
Hence each facet above is the correlation plot of the following:
# Top left facet
subset(all,type=="x")$val1
subset(all,type=="y")$val1
# Top right facet
subset(all,type=="x")$val1
subset(all,type=="y")$val2
# ...etc..
But I'm stuck with the following code:
p <- ggplot(all, aes(val1, val2))+ geom_smooth(method = "lm") + geom_point() +
facet_grid(type ~ )
# Calculate correlation for each group
cors <- ddply(all, c(type ~ ), summarise, cor = round(cor(val1, val2), 2))
p + geom_text(data=cors, aes(label=paste("r=", cor, sep="")), x=0.5, y=0.5)
What's the right way to do it?

Some of your code was incorrect. This works for me:
p <- ggplot(all, aes(val1, val2))+ geom_smooth(method = "lm") + geom_point() +
facet_grid(~type)
# Calculate correlation for each group
cors <- ddply(all, .(type), summarise, cor = round(cor(val1, val2), 2))
p + geom_text(data=cors, aes(label=paste("r=", cor, sep="")), x=1, y=-0.25)
Edit: Following OP's comment and edit. The idea is to re-create the data with all four combinations and then facet.
# I consider the type in your previous data to be xx and yy
dat <- data.frame(val1 = c(rep(all$val1[all$type == "x"], 2),
rep(all$val1[all$type == "y"], 2)),
val2 = rep(all$val2, 2),
grp1 = rep(c("x", "x", "y", "y"), each=10),
grp2 = rep(c("x", "y", "x", "y"), each=10))
p <- ggplot(dat, aes(val1, val2)) + geom_point() + geom_smooth(method = "lm") +
facet_grid(grp1 ~ grp2)
cors <- ddply(dat, .(grp1, grp2), summarise, cor = round(cor(val1, val2), 2))
p + geom_text(data=cors, aes(label=paste("r=", cor, sep="")), x=1, y=-0.25)

Since your data is not in the appropriate format, some reshaping is necessary before it can be plotted.
Firstly, reshape the data to the long format:
library(reshape2)
allM <- melt(all[-1], id.vars = "type")
Split the values along type and val1 vs. val2:
allList <- split(allM$value, interaction(allM$type, allM$variable))
Create a list of all combinations:
allComb <- unlist(lapply(c(1, 3),
function(x)
lapply(c(2 ,4),
function(y)
do.call(cbind, allList[c(x, y)]))),
recursive = FALSE)
Create a new dataset:
allNew <- do.call(rbind,
lapply(allComb, function(x) {
tmp <- as.data.frame(x)
tmp <- (within(tmp, {xval <- names(tmp)[1];
yval <- names(tmp)[2]}))
names(tmp)[1:2] <- c("x", "y")
tmp}))
Plot:
library(ggplot2)
p <- ggplot(allNew, aes(x = x, y = y)) +
geom_smooth(method = "lm") +
geom_point() +
facet_grid(yval ~ xval)
# Calculate correlation for each group
library(plyr)
cors <- ddply(allNew, .(yval, xval), summarise, cor = round(cor(x, y), 2))
p + geom_text(data=cors, aes(label=paste("r=", cor, sep="")), x=0.5, y=0.5)

There is an additional package ggpubr available now addressing exactly this issue with the stat_cor() function.
library(tidyverse)
library(ggpubr)
ggplot(all, aes(val1, val2))+
geom_smooth(method = "lm") +
geom_point() +
facet_grid(~type) +
stat_cor()

Related

How to show integers when using ggplot2::geom_smooth()

In the example below, how can I round the x label to even numbers? I cant convert them as factors first, because then geom_smooth does not work
library(ggplot2)
set.seed(32)
df <- data.frame(a = as.integer(rnorm(250, 2, 0.1)))
df$b <- df$a + rnorm(250)
df$id = 1
df_2 <- df
df_2$id <- 2
df_tot <- rbind(df, df_2)
ggplot(df_tot, aes(x = a, y = b)) +
geom_smooth() +
facet_wrap(~id)
If we want even numbers, an option is to add labels as a function in scale_x_continuous
library(ggplot2)
ggplot(df_tot, aes(x = a, y = b)) +
geom_smooth() +
facet_wrap(~id) +
scale_x_continuous(labels = function(x) seq(2, length.out = length(x)))

ggplot of lm() with equation [duplicate]

I have read many postings on this topic using expression(), paste(), and bquote(), or some combination. I think I am close to solving my problem, but I just can't get there. The following script generates a plot labelled with "y = 1 + 2(x); r^2= 0.9". How can I italicize "y" and "x", and italicize the "r" and superscript the 2 of "r^2"? If I have overlooked a relevant earlier post, sorry, but please direct me to it.
df <- data.frame(x=c(1:5), y=c(1:5))
a <- 1
b <- 2
r2 <- 0.9
eq <- paste("y = ", a, " + ", b, "(x); r^2=", r2)
ggplot(data=df, aes(x=x, y=y))+
geom_point(color="black")+
geom_text(x=2, y=4,label=eq, parse=FALSE)
You could use annotate() which allows you to paste directly into the plot.
library(ggplot2)
ggplot(data=df, aes(x=x, y=y)) +
geom_point(color="black") +
annotate('text', 2.5, 4,
label=paste("italic(y)==", a, "+", b,
"~italic(x)~';'~italic(r)^2==", r2),
parse=TRUE,
hjust=1, size=5)
Yields:
Data:
df <- data.frame(x=c(1:5), y=c(1:5))
a <- 1
b <- 2
r2 <- 0.9
You can use a combination of substitute and plotmath (https://www.rdocumentation.org/packages/grDevices/versions/3.5.1/topics/plotmath) to italicize the text-
# setup
set.seed(123)
library(ggplot2)
# dataframe
df <- data.frame(x = c(1:5), y = c(1:5))
# label
eq <- substitute(
expr =
paste(
italic("y"),
" = ",
a,
" + ",
b,
"(",
italic("x"),
"); ",
italic("r") ^ 2,
" = ",
r2
),
env = base::list(a = 1,
b = 2,
r2 = 0.9)
)
# plot
ggplot(data = df, aes(x = x, y = y)) +
geom_point(color = "black") +
labs(subtitle = eq)
Created on 2018-12-04 by the reprex package (v0.2.1)
In addition to the answer by Indrajit Patil & jay-sf, I would like to add that there is an automated way to fit regression lines (I believe there are many), using a package called ggpmisc. The letters that you want in italic, are already formatted in such a way. The code that needs to be used is:
> install.packages('ggpmisc'); library(ggpmisc); formula <- y ~ x
> df <- data.frame(x=c(1:5), y=c(1:5))
> ggplot(data = df, aes(x, y)) + geom_point(color="black") +
geom_smooth(method = "lm", formula = formula) +
stat_poly_eq(aes(label = paste(..eq.label.., ..adj.rr.label.., sep = "~~~~")),
formula = formula, parse = TRUE)
It shows the fitted lines also, which I hope is not an impediment to the main goal.
EDIT: The line can be removed using linetype = 0, compatible with
most of the aesthetics in ggplot2.
... + geom_smooth(method = "lm", formula = formula, linetype = 0) + ...

How to format ggplot `geom_text` with formula, getting unwanted "c(...)"

In my ggplot2 code below, I want to show the formula for a linear-regression fit on my plot with geom_text, but I get unwanted c before the values of a and b, how do I prevent this?
p <- ggplot(data=Algae, aes(x=a254, y=DOC))+
geom_point(color="blue",stat="identity") +
geom_smooth(method="lm",se=FALSE,color="red",formula=y~x)
model.lm <- lm(DOC~a254, data=Algae)
l <- list(a=format(coef(model.lm)[1], digits=4),
b=format(coef(model.lm)[2], digits=4),
r2=format(summary(model.lm)$r.squared, digits=4),
p=format(summary(model.lm)$coefficients[2,4], digits=4))
eq <- substitute(italic(DOC) == a - b %*% italic(a254)~","~italic(R)^2~"="~r2~",
"~italic(P)~"="~p, l)
p1 <- p + geom_text(aes(x =6, y = 0, label = as.character(as.expression(eq))), parse = TRUE)
p1
The reason for this is that you first format() your data into character format and then try to calculate with strings. You could solve the problem this way:
First, it is more convenient to transform your list into a data.frame, using:
d <- as.data.frame(l)
The values should be converted back to numeric, since you yet want to do arithmetics inside the formula:
d[] <- lapply(d, function(x) as.numeric(as.character(x)))
Then it should work fine:
eq <- substitute(italic(Sepal.Length) == a - b %*% italic(Petal.Length)~","~italic(R)^2~"="~r2~",
"~italic(P)~"="~p, d)
p + geom_text(aes(x =5, y = 0, label = as.character(as.expression(eq))), parse = TRUE)
You could also use annotate() to add the formula to the plot, which might look a little nicer:
p + annotate('text', 7, 4,
label=paste("italic(Sepal.Length)==", d$a, "~-~", d$b, "~x~",
"~italic(Petal.Length)~';'~italic(r)^2==", d$r2,
"~italic(P)==", d$p),
parse=TRUE,
hjust=1, size=3.5)
Yielding:
Data:
library(ggplot2)
p <- ggplot(data=iris, aes(x=Petal.Length, y=Sepal.Length)) +
geom_point(color="blue", stat="identity") +
geom_smooth(method="lm", se=FALSE, color="red", formula=y~x)
model.lm <- lm(Sepal.Length ~ Petal.Length, data=iris)
l <- list(a=format(coef(model.lm)[1], digits=4),
b=format(coef(model.lm)[2], digits=4),
r2=format(summary(model.lm)$r.squared, digits=4),
p=format(summary(model.lm)$coefficients[2, 4], digits=4))

Getting a variable to pass into function in R (ggplot2)

I'm trying to plot a graph between two columns of data from the data frame called "final". I want the p value and r^2 value to show up on the graph.
I'm using this function and code, but it gives me the error "cannot find y value"
library(ggplot2)
lm_eqn <- function(final, x, y){
m <- lm(final[,y] ~ final[,x])
output <- paste("r.squared = ", round(summary(m)$adj.r.squared, digits = 4), " | p.value = ", formatC(summary(m)$coefficients[8], format = "e", digits = 4))
return(output)
}
output_plot <- lm_eqn(final, x, y)
p1 <- ggplot(final, aes(x=ENSG00000153563, y= ENSG00000163599)) + geom_point() + geom_smooth(method=lm, se=FALSE) + labs(x = "CD8A", y = "CTLA-4") + ggtitle("CD8 v/s CTLA-4", subtitle = paste("Linear Regression of Expression |", output_plot))
How do I get both columns of data x and y to flow through the function and for the graph to plot with the p value and residual value printed on the graph?
Thanks in advance.
When you call function for output_plot generation you have to use the same ENS... variables as in your plot. After simplifying slightly function, should work now
library(stats)
library(ggplot2)
lm_eqn <- function(x, y){
m <- lm(y ~ x)
output <- paste("r.squared = ", round(summary(m)$adj.r.squared, digits = 4), " | p.value = ", formatC(summary(m)$coefficients[8], format = "e", digits = 4))
return(output)
}
x <-c(1,2,5,2,3,6,7,0)
y <-c(2,3,5,9,8,3,3,1)
final <- data_frame(x,y)
output_plot <- lm_eqn(x, y)
p1 <- ggplot(final, aes(x=x, y= y)) + geom_point() + geom_smooth(method=lm, se=FALSE) + labs(x = "x", y = "y") + ggtitle("CD8 v/s CTLA-4", subtitle = paste("Linear Regression of Expression |", output_plot))

ggplot2 boxplot: horizontal bar at median?

I would like to make a ggplot2 boxplot more meaningful by adding a thick bar at the median (so that if the median is equal to either of the lower or upper quartiles, it can be detected to which it is equal). I came across a recent post of Kohske:
Can I get boxplot notches in ggplot2?
but I didn't know how to give the "crossbar" a "height". Then I tried
to use a rectangle but it didn't work either. Here is a minimal example:
require(ggplot2)
require(reshape2)
require(plyr)
set.seed(1)
## parameters
p1 <- c(5, 20, 100)
p2 <- c("f1", "f2", "f3", "f4", "f5")
p3 <- c("g1","g2","g3","g4","g5")
N <- 1000
## lengths
l1 <- length(p1)
l2 <- length(p2)
l3 <- length(p3)
## build result array containing the measurements
arr <- array(rep(NA, l1*l2*l3*N), dim=c(l1, l2, l3, N),
dimnames=list(
p1=p1,
p2=p2,
p3=p3,
N=1:N))
for(i in 1:l1){
for(j in 1:l2){
for(k in 1:l3){
arr[i,j,k,] <- i+j+k+runif(N, min=-4, max=4)
}
}
}
arr <- arr + rexp(3*5*5*N)
## create molten data
mdf <- melt(arr, formula = . ~ p1 + p2 + p3 + N) # create molten data frame
## confidence interval calculated by `boxplot.stats`
f <- function(x){
ans <- boxplot.stats(x)
data.frame(x=x, y=ans$stats[3], ymin=ans$conf[1], ymax=ans$conf[2])
}
## (my poor) trial
ggplot(mdf, aes(x=p3, y=value)) + geom_boxplot(outlier.shape=1) +
stat_summary(fun.data=f, geom="rectangle", colour=NA, fill="black",
xmin=x-0.36, xmax=x+0.36, ymin=max(y-0.2, ymin), ymax=min(y+0.2,
ymax)) + facet_grid(p2 ~ p1, scales = "free_y")
**SOLUTION** (after the discussion with Kohske below):
f <- function(x, height){
ans <- median(x)
data.frame(y=ans, ymin=ans-height/2, ymax=ans+height/2)
}
p <- ggplot(mdf, aes(x=p3, y=value)) + geom_boxplot(outlier.shape=1) +
stat_summary(fun.data=f, geom="crossbar", height=0.5, colour=NA,
fill="black", width=0.78) +
facet_grid(p2 ~ p1, scales = "free_y")
pdf()
print(p)
dev.off()
**UPDATE** Hmmm... it's not that trivial. The following example shows that the "height" of the crossbar should be adapted to the y-axis scale, otherwise it might be overseen.
require(ggplot2)
require(reshape2)
require(plyr)
set.seed(1)
## parameters
p1 <- c(5, 20, 100)
p2 <- c("f1", "f2", "f3", "f4", "f5")
p3 <- c("g1","g2","g3","g4","g5")
N <- 1000
## lengths
l1 <- length(p1)
l2 <- length(p2)
l3 <- length(p3)
## build result array containing the measurements
arr <- array(rep(NA, l1*l2*l3*N), dim=c(l1, l2, l3, N),
dimnames=list(
p1=p1,
p2=p2,
p3=p3,
N=1:N))
for(i in 1:l1){
for(j in 1:l2){
for(k in 1:l3){
arr[i,j,k,] <- i+j^4+k+runif(N, min=-4, max=4)
}
}
}
arr <- arr + rexp(3*5*5*N)
arr[1,2,5,] <- arr[1,2,5,]+30
arr[1,5,3,] <- arr[1,5,3,]+100
## create molten data
mdf <- melt(arr, formula = . ~ p1 + p2 + p3 + N) # create molten data frame
f <- function(x, height){
ans <- median(x)
data.frame(y=ans, ymin=ans-height/2, ymax=ans+height/2)
}
## plot
p <- ggplot(mdf, aes(x=p3, y=value)) + geom_boxplot(outlier.shape=1) +
stat_summary(fun.data=f, geom="crossbar", height=0.7, colour=NA,
fill="black", width=0.78) +
facet_grid(p2 ~ p1, scales = "free_y")
pdf()
print(p)
dev.off()
here is an example:
f <- function(x, height) {
ans <- median(x)
data.frame(ymin = ans-height/2, ymax = ans+height/2, y = ans)
}
df <- data.frame(x=gl(2,6), y=c(1,1,1,1,3,3, 1,1,3,3,3,3))
ggplot(df, aes(x, y)) + geom_boxplot() +
stat_summary(fun.data = f, geom = "crossbar", height = 0.1,
colour = NA, fill = "skyblue", width = 0.8, alpha = 0.5)
if you just want to change the apparence, then here is a quick hack, I don't recommend though,
df <- data.frame(x=gl(2,6), y=c(c(1,1,1,1,3,3), c(1,1,3,3,3,3)*10))
ggplot(df, aes(x, y)) + geom_boxplot() + facet_grid(x~.)
gs <- grid.gget("geom_boxplot", grep = T)
if (inherits(gs, "grob")) gs <- list(gs)
gss <- llply(gs, function(g) g$children[[length(g$children)]])
l_ply(gss, function(g) grid.edit(g$name, grep=T, just = c("left", "center"), height = unit(0.05, "native"), gp = gpar(fill = "skyblue", alpha = 0.5, col = NA)))

Resources