I have more legend items due to a time course but I only want to show two labels : Healthy and Patient.
## Construct the data
healthy <- rnorm(100,mean=0,sd=1)
patientTimeA <- rnorm(100,mean=1,sd=1)
patientTimeB <- rnorm(100,mean=3,sd=1)
patientTimeC <- rnorm(100,mean=4,sd=1)
groupArray <- c(rep("H",100),rep("P",300))
timeArray <- c(rep(0,100),rep(10,100),rep(20,100),rep(30,100))
dataTab <- data.frame(group=groupArray,time=timeArray,value=c(healthy,patientTimeA,patientTimeB,patientTimeC))
dataTab$group <- paste(dataTab$group,dataTab$time,sep="_")
colVector <- c("red","blue","blue","blue")
p <- ggplot(dataTab,aes(x=group,y=value,fill=group)) +
geom_boxplot(alpha=0.7)+
scale_fill_manual(values=colVector)
print(p)
You need to create two different variables in your dataTab for the X and the fill aesthetics:
healthy <- rnorm(100, mean = 0, sd = 1)
patientTimeA <- rnorm(100, mean = 1, sd = 1)
patientTimeB <- rnorm(100, mean = 3, sd = 1)
patientTimeC <- rnorm(100, mean = 4, sd = 1)
groupArray <- c(rep("H", 100),rep("P", 300))
timeArray <- c(rep(0, 100), rep(10, 100), rep(20, 100), rep(30, 100))
dataTab <- data.frame(group = groupArray, time = timeArray,
value = c(healthy, patientTimeA, patientTimeB, patientTimeC))
dataTab$group1 <- paste(dataTab$group, dataTab$time, sep = "_")
colVector <- c("red", "blue", "blue", "blue")
p <- ggplot(dataTab, aes(x = group1, y = value, fill = group)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = colVector)
print(p)
Related
I am trying to get two contours in the same plot using ggplot2 in R.
Here is a reproducible example:
library(MASS)
library(ggplot2)
# first contour
m <- c(.0, -.0)
sigma <- matrix(c(1,.5,.5,1), nrow=2)
data.grid <- expand.grid(s.1 = seq(-3, 3, length.out=200), s.2 = seq(-3, 3, length.out=200))
q.samp <- cbind(data.grid, prob = mvtnorm::dmvnorm(data.grid, mean = m, sigma = sigma))
plot1 <- ggplot(q.samp, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'green')
# second contour
m1 <- c(1, 1)
sigma1 <- matrix(c(1,-.5,-.5,1), nrow=2)
set.seed(10)
data.grid1 <- expand.grid(s.1 = seq(-3, 3, length.out=200), s.2 = seq(-3, 3, length.out=200))
q.samp1 <- cbind(data.grid1, prob = mvtnorm::dmvnorm(data.grid1, mean = m1, sigma = sigma1))
plot2 <- ggplot(q.samp1, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'red')
However, trying plot1 + plot2 also does not work. Is there a way to get the two contours on the same plot.
What about including another stat_contour with different data?
ggplot(q.samp1, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'red') +
stat_contour(data = q.samp, aes(x = s.1, y = s.2, z = prob), color = 'green')
I am trying to make a composite plot in R using the packages ggplot2 and ggpubr.
I have no problem in making the composite plots except each plot has a normal distribution curve specific to that dataset. When I generate the composite plot, both plots have the same curve, that of the last dataset.
How can I generate the composite plot with each plot having its own specific normal distribution curve?
CODE AND OUTPUT PLOTS
## PLOT 1 ##
results_matrix_C <- data.frame(matrix(rnorm(20), nrow=20))
colnames(results_matrix_C) <- c("X")
m <- mean(results_matrix_C$X)
sd <- sd(results_matrix_C$X)
dnorm_C <- function(x){
norm_C <- dnorm(x, m, sd)
return(norm_C)
}
e = 1
dnorm_one_sd_C <- function(x){
norm_one_sd_C <- dnorm(x, m, sd)
# Have NA values outside interval x in [e]:
norm_one_sd_C[x <= e] <- NA
return(norm_one_sd_C)
}
C <- ggplot(results_matrix_C, aes(x = results_matrix_C$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(fun = dnorm_one_sd_C, geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(fun = dnorm_C, colour = "#CE0539", size = 1) +
theme_classic()
## PLOT 2 ##
results_matrix_U <- data.frame(matrix(rnorm(20)+1, nrow=20))
colnames(results_matrix_U) <- c("X")
m <- mean(results_matrix_U$X)
sd <- sd(results_matrix_U$X)
dnorm_U <- function(x){
norm_U <- dnorm(x, m, sd)
return(norm_U)
}
e = 2
dnorm_one_sd_U <- function(x){
norm_one_sd_U <- dnorm(x, m, sd)
# Have NA values outside interval x in [e]:
norm_one_sd_U[x <= e] <- NA
return(norm_one_sd_U)
}
U <- ggplot(results_matrix_U, aes(x = results_matrix_U$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(fun = dnorm_one_sd_U, geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(fun = dnorm_U, colour = "#CE0539", size = 1) +
theme_classic()
library(ggpubr)
ggarrange(C, U,
nrow = 1, ncol = 2)
As you can see in the composite plot, the first one has taken the normal distribution curve of the second plot rather than its own one from my initial plot (Plot 1).
UPDATE
Variable "e" refers to the shaded area which is related to the distribution curve.
m = mean of the dataset
sd = standard deviation of the dataset
m and sd are used to generate the normal distribution curves
SOLVED
By inserting the function in full into the stat_function section of the ggplot2 code, this has worked
i.e:
## PLOT 1 ##
results_matrix_C <- data.frame(matrix(rnorm(20), nrow=20))
colnames(results_matrix_C) <- c("X")
mean <- mean(results_matrix_C$X)
sd <- sd(results_matrix_C$X)
e = 1
C <- ggplot(results_matrix_C, aes(x = results_matrix_C$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(
fun = function(x, mean, sd, e){
norm_one_sd_C <- dnorm(x, mean, sd)
norm_one_sd_C[x <= e] <- NA
return(norm_one_sd_C)},
args = c(mean = mean, sd = sd, e = e), geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(
fun = function(x, mean, sd){
dnorm(x = x, mean = mean, sd = sd)},
args = c(mean = mean, sd = sd), colour = "#CE0539", size = 1) +
theme_classic()
## PLOT 2 ##
results_matrix_U <- data.frame(matrix(rnorm(20)+1, nrow=20))
colnames(results_matrix_U) <- c("X")
mean <- mean(results_matrix_U$X)
sd <- sd(results_matrix_U$X)
e = 2
U <- ggplot(results_matrix_U, aes(x = results_matrix_U$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(
fun = function(x, mean, sd, e){
norm_one_sd_U <- dnorm(x, mean, sd)
norm_one_sd_U[x <= e] <- NA
return(norm_one_sd_U)},
args = c(mean = mean, sd = sd, e = e), geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(
fun = function(x, mean, sd){
dnorm(x = x, mean = mean, sd = sd)},
args = c(mean = mean, sd = sd), colour = "#CE0539", size = 1) +
theme_classic()
library(ggpubr)
ggarrange(C, U,
nrow = 1, ncol = 2)
Sample data
set.seed(123)
par(mfrow = c(1,2))
dat <- data.frame(years = rep(1980:2014, each = 8), x = sample(1000:2000, 35*8 ,replace = T))
boxplot(dat$x ~ dat$year, ylim = c(500, 4000))
I have another dataset that has a single value for some selected years
ref.dat <- data.frame(years = c(1991:1995, 2001:2008), x = sample(1000:2000, 13, replace = T))
plot(ref.dat$years, ref.dat$x, type = "b")
How can I add the line plot on top of the boxplot
With ggplot2 you could do this:
ggplot(dat, aes(x = years, y = x)) +
geom_boxplot(data = dat, aes(group = years)) +
geom_line(data = ref.dat, colour = "red") +
geom_point(data = ref.dat, colour = "red", shape = 1) +
coord_cartesian(ylim = c(500, 4000)) +
theme_bw()
The trick here is to figure out the x-axis on the boxplot. You have 35 boxes and they are plotted at the x-coordinates 1, 2, 3, ..., 35 - i.e. year - 1979. With that, you can add the line with lines as usual.
set.seed(123)
dat <- data.frame(years = rep(1980:2014, each = 8),
x = sample(1000:2000, 35*8 ,replace = T))
boxplot(dat$x ~ dat$year, ylim = c(500, 2500))
ref.dat <- data.frame(years = c(1991:1995, 2001:2008),
x = sample(1000:2000, 13, replace = T))
lines(ref.dat$years-1979, ref.dat$x, type = "b", pch=20)
The points were a bit hard to see, so I changed the point style 20. Also, I used a smaller range on the y-axis to leave less blank space.
Not sure if title appropriately explains my problem: Using facet wraps, I have one plot which is a summary of the other plots, this results in that plot having much higher values on the variable which I plot colors by. All other plots thus only show (mostly) one color: Example graph.
I wish the colorization to rather depend on the non-summary graphs, whereas I care less if the summary graph ends up only displaying mostly one color because of consisting of high values (lower values should still show appropriate colors though). Thankful for any help.
Example code:
library(gdata)
library(viridis)
library(ggplot2)
summary <- data.frame(x = 1:15, y = runif(15), z = runif(15, min = 15, max = 200))
dat2 <- data.frame(x = 1:15, y = runif(15), z = runif(15, min = 0, max = 30))
dat3 <- data.frame(x = 1:15, y = runif(15), z = runif(15, min = 0, max = 30))
dat4 <- data.frame(x = 1:15, y = runif(15), z = runif(15, min = 0, max = 30))
dat <- combine(summary, dat2, dat3, dat4)
p <- ggplot(dat, aes(x = x, y = y)) +
geom_point(aes(color = z)) +
scale_color_viridis(option = "magma") +
theme(panel.background = element_rect(fill = "grey93"),
strip.background = element_blank(),
panel.grid = element_blank(),
panel.border = element_rect(fill = NA, colour = "black", size = 0.5, linetype = "solid"))`
p1 <- p + facet_wrap(~source)
summary <- data.frame(x = 1:15, y = runif(15), z = runif(15, min = 15, max = 200))
dat2 <- data.frame(x = 1:15, y = runif(15), z = runif(15, min = 0, max = 30))
dat3 <- data.frame(x = 1:15, y = runif(15), z = runif(15, min = 0, max = 30))
dat4 <- data.frame(x = 1:15, y = runif(15), z = runif(15, min = 0, max = 30))
dat <- combine(summary, dat2, dat3, dat4)
dat_ <- subset(dat, source != "summary")
cut_off <- 70 # adjust here where you want the cut-off of z values to be.
summary2 <- subset(dat, source == "summary" & z < cut_off)
p <- ggplot(dat_, aes(x = x, y = y)) +
geom_point(aes(color = z)) +
geom_point(data = summary2, aes(col = z)) +
scale_color_viridis(option = "magma") +
theme(panel.background = element_rect(fill = "grey93"),
strip.background = element_blank(),
panel.grid = element_blank(),
panel.border = element_rect(fill = NA, colour = "black", size = 0.5, linetype = "solid"))
p1 <- p + facet_wrap(~source)
summary3 <- subset(dat, source == "summary" & z >= cut_off)
p1 + geom_point(data = summary3, col = "red") # adjust the color aesthetic of the highest summary graph points here.
So what you'll see is, summary data with a z value greater than or equal to 70 colored "red" in the summary graph, so they contrast with the viridis magma scale. (Experiment with "white", it contrasts nicely, too). All summary data observations with a z values less than 70 will follow the same color scale as the other three graphs, in the summary graph. Lower the cut_off variable (that is currently set to 70) to get more color in the dat2, dat3, and dat4 graphs.
This is the exact answer you've been looking for.
My question is similar to this but the answers there will not work for me. Basically, I'm trying to produce a regression discontinuity plot with a "fuzzy" design that uses all the data for the treatment and control groups, but only plots the regression line within the "range" of the treatment and control groups.
Below, I've simulated some data and produced the fuzzy RD plot with base graphics. I'm hoping to replicate this plot with ggplot2. Note that the most important part of this is that the light blue regression line is fit using all the blue points, while the peach colored regression line is fit using all the red points, despite only being plotted over the ranges in which individuals were intended to receive treatment. That's the part I'm having a hard time replicating in ggplot.
I'd like to move to ggplot because I'd like to use faceting to produce this same plot across various units in which participants were nested. In the code below, I show a non-example using geom_smooth. When there's no fuzziness within a group, it works fine, but otherwise it fails. If I could get geom_smooth to be limited to only specific ranges, I think I'd be set. Any and all help is appreciated.
Simulate data
library(MASS)
mu <- c(0, 0)
sigma <- matrix(c(1, 0.7, 0.7, 1), ncol = 2)
set.seed(100)
d <- as.data.frame(mvrnorm(1e3, mu, sigma))
# Create treatment variable
d$treat <- ifelse(d$V1 <= 0, 1, 0)
# Introduce fuzziness
d$treat[d$treat == 1][sample(100)] <- 0
d$treat[d$treat == 0][sample(100)] <- 1
# Treatment effect
d$V2[d$treat == 1] <- d$V2[d$treat == 1] + 0.5
# Add grouping factor
d$group <- gl(9, 1e3/9)
Produce regression discontinuity plot with base
library(RColorBrewer)
pal <- brewer.pal(5, "RdBu")
color <- d$treat
color[color == 0] <- pal[1]
color[color == 1] <- pal[5]
plot(V2 ~ V1,
data = d,
col = color,
bty = "n")
abline(v = 0, col = "gray", lwd = 3, lty = 2)
# Fit model
m <- lm(V2 ~ V1 + treat, data = d)
# predicted achievement for treatment group
pred_treat <- predict(m,
newdata = data.frame(V1 = seq(-3, 0, 0.1),
treat = 1))
# predicted achievement for control group
pred_no_treat <- predict(m,
newdata = data.frame(V1 = seq(0, 4, 0.1),
treat = 0))
# Add predicted achievement lines
lines(seq(-3, 0, 0.1), pred_treat, col = pal[4], lwd = 3)
lines(seq(0, 4, 0.1), pred_no_treat, col = pal[2], lwd = 3)
# Add legend
legend("bottomright",
legend = c("Treatment", "Control"),
lty = 1,
lwd = 2,
col = c(pal[4], pal[2]),
box.lwd = 0)
non-example with ggplot
d$treat <- factor(d$treat, labels = c("Control", "Treatment"))
library(ggplot2)
ggplot(d, aes(V1, V2, group = treat)) +
geom_point(aes(color = treat)) +
geom_smooth(method = "lm", aes(color = treat)) +
facet_wrap(~group)
Notice the regression lines extending past the treatment range for groups 1 and 2.
There's probably a more graceful way to make the lines with geom_smooth, but it can be hacked together with geom_segment. Munge the data.frames outside of the plotting call if you like.
ggplot(d, aes(x = V1, y = V2, color = factor(treat, labels = c('Control', 'Treatment')))) +
geom_point(shape = 21) +
scale_color_brewer(NULL, type = 'qual', palette = 6) +
geom_vline(aes(xintercept = 0), color = 'grey', size = 1, linetype = 'dashed') +
geom_segment(data = data.frame(t(predict(m, data.frame(V1 = c(-3, 0), treat = 1)))),
aes(x = -3, xend = 0, y = X1, yend = X2), color = pal[4], size = 1) +
geom_segment(data = data.frame(t(predict(m, data.frame(V1 = c(0, 4), treat = 0)))),
aes(x = 0, xend = 4, y = X1, yend = X2), color = pal[2], size = 1)
Another option is geom_path:
df <- data.frame(V1 = c(-3, 0, 0, 4), treat = c(1, 1, 0, 0))
df <- cbind(df, V2 = predict(m, df))
ggplot(d, aes(x = V1, y = V2, color = factor(treat, labels = c('Control', 'Treatment')))) +
geom_point(shape = 21) +
geom_vline(aes(xintercept = 0), color = 'grey', size = 1, linetype = 'dashed') +
scale_color_brewer(NULL, type = 'qual', palette = 6) +
geom_path(data = df, size = 1)
For the edit with facets, if I understand what you want correctly, you can calculate a model for each group with lapply and predict for each group. Here I'm recombine with dplyr::bind_rows instead of do.call(rbind, ...) for the .id parameter to insert the group number from the list element name, though there are other ways to do the same thing.
df <- data.frame(V1 = c(-3, 0, 0, 4), treat = c('Treatment', 'Treatment', 'Control', 'Control'))
m_list <- lapply(split(d, d$group), function(x){lm(V2 ~ V1 + treat, data = x)})
df <- dplyr::bind_rows(lapply(m_list, function(x){cbind(df, V2 = predict(x, df))}), .id = 'group')
ggplot(d, aes(x = V1, y = V2, color = treat)) +
geom_point(shape = 21) +
geom_vline(aes(xintercept = 0), color = 'grey', size = 1, linetype = 'dashed') +
geom_path(data = df, size = 1) +
scale_color_brewer(NULL, type = 'qual', palette = 6) +
facet_wrap(~group)