making bargraph ggplot in ggplot loop - r

this question is an extension of a previous posting: ggplot and loops
I was using the example above to generate bar graphs in a loop. I modified the above example, to generate a bar graph and corresponding error bars. I somewhat succeeded, however the error bars do not populate according to the individual variable.
I would appreciate the help very much!
My modifications:
#make a dummy dataframe
D <- data.frame(
x1 = runif(20),
x2 = rnorm(20),
x1_se = runif(20, 0.01, 0.09),
x2_se = runif(20, -1, 1),
treatment = rep(c("control","test"), each = 10)
)
# for reference later
p_names <- c("treatment","x1","x2")
se_names <- c("treatment","x1_se","x2_se")
trt <- rep(c("control","test"), each = 10)
# subset the standard error into its own dataframe
se <- D[,se_names]
names(se) <- str_remove(names(se), "_se")
plots <- list()
# the loop
for(nm in p_names) {
#trt <- trt
plots[[nm]] <- ggplot(data= D, aes(x = trt, fill = trt)) +
geom_bar(aes_string(y = D[[nm]]), stat="identity", position = "dodge", color = "black") +
geom_errorbar(aes(ymin= D[[nm]] - se[[nm]],
ymax= D[[nm]] + se[[nm]]), position=position_dodge(.9)) + ylab(nm)
}
print(plots[["x1"]])
print(plots[["x2"]])
```

It doesn't make sense to have one s.e per observation, if you are trying to plot a mean and se barplot for each column, do the below:
p_names = c("x1","x2")
for(nm in p_names) {
plots[[nm]] <- ggplot(data= D,aes_string(x ="treatment",y=nm,fill="treatment"))+
stat_summary(fun.y=mean,color = "black",geom="bar") +
stat_summary(fun.data=mean_se,geom="errorbar",width=0.2)
}

Related

How to make for-loop work in order to automate plotting

I have
a data.frame df
df = data.frame(year=c(2018,2019,2020), value1=rnorm(3,1,0.5), value2=rnorm(3,2,0.5)
a ggplot-function called ScatterPlot (function code see below)
a for loop that I want to use to run the ggplot-function over my df
My intent is to plot (scatter) value1 over years and value2 over years somewhat automatically (using scatterplot function and my for loop).
For some reason, the for loop below only generates one plot (the last one in my df). Can someone tell me what I am missing?
for loop:
# Create the loop.vector (all the columns)
loop.vector <- ncol(df)-1
for (i in loop.vector) { # Loop over loop.vector
# store data in column.i as x
x <- df[i]
x = unlist(x) #necessary. otherwise ggplot will generate an error
plotname = colnames(df[i])
#plot
jpeg(filename=paste0("/R-Outputs/plots/",plotname,".jpeg"))
plot= ScatterPlot(df,df$year,"year", x, plotname)
print(plot)
dev.off()
}
Scatterplot Function (this works):
ScatterPlot <- function(df, x, x_var_label,y, y_var_label) {
# Input:
# df: a data frame
# x: a column from df in the form of a character vector
# y: a column from df in the form of a character vector
#
# Output:
# a ggplot2 plot
require(ggplot2)
x_title = x_var_label
y_title = y_var_label
time_labels = c("2018", "2019", "2020")
ggplot(data = df, aes(x = x, y = y)) +
geom_point(col="#69b3a2",fill="#69b3a2",alpha=0.5, size = 0) +
geom_line()+
geom_smooth(method = "lm", se = FALSE, size = 0.8, col="red") +
xlab(label = x_title) +
ylab(label = y_title) +
theme_bw()+
theme(axis.text.x=element_text(angle=45, hjust = 1))+
labs(title = paste0(y_title," over time"))+
scale_x_continuous("year", labels = as.character(time_labels),
breaks = as.integer((time_labels)))
}
You don't need to pass both values as well as column name. Pass only the column name in the function ScatterPlot.
library(ggplot2)
ScatterPlot <- function(df, x_var_label,y_var_label) {
# Input:
# df: a data frame
# x: a column from df in the form of a character vector
# y: a column from df in the form of a character vector
#
# Output:
# a ggplot2 plot
time_labels = c("2018", "2019", "2020")
ggplot(data = df, aes(x = .data[[x_var_label]], y = .data[[y_var_label]])) +
geom_point(col="#69b3a2",fill="#69b3a2",alpha=0.5, size = 0) +
geom_line()+
geom_smooth(method = "lm", se = FALSE, size = 0.8, col="red") +
xlab(label = x_var_label) +
ylab(label = y_var_label) +
theme_bw()+
theme(axis.text.x=element_text(angle=45, hjust = 1))+
labs(title = paste0(y_var_label," over time"))+
scale_x_continuous("year", labels = time_labels,
breaks = as.integer(time_labels))
}
To call this function in a loop something like this should work.
#column names to loop over
loop.vector <- names(df[-1])
plot <- vector('list', length(loop.vector))
for (i in seq_along(loop.vector)) { # Loop over loop.vector
jpeg(filename=paste0("/R-Outputs/plots/",loop.vector[i],".jpeg"))
plot[[i]] = ScatterPlot(df,"year", loop.vector[i])
print(plot[[i]])
dev.off()
}
We are also saving individual plots in a list which you can verify with plot[[1]], plot[[2]] etc.

Residual plot with ggplot with X-axis as "ranked" residuals

I'm trying to re-create a plot like this in ggplot:.
This graph takes the residuals from a regression output, and plots them in order (with the X-axis being a rank of residuals).
My best attempt at this was something like the following:
library(ggplot2)
library(modelr)
d <- d %>% add_residuals(mod1, var = "resid")
d$resid_rank <- rank(d$resid)
ggplot(data = d, aes(x = resid_rank, y = resid)) +
geom_bar(stat="identity") +
theme_bw()
However, this yields a completely blank graph. I tried something like this:
ggplot(data = d, aes(x = resid_rank, y = resid)) +
geom_segment(yend = 0, aes(xend=resid)) +
theme_bw()
But this yields the segments that go in the wrong direction. What is the right way to do this, and to color those lines by a third factor?
FAKE DATASET:
library(estimatr)
library(fabricatr)
#simulation
dat <- fabricate(
N = 10000,
y = runif(N, 0, 10),
x = runif(N, 0, 100)
)
#add an outlier
dat <- rbind(dat, c(300, 5))
dat <- rbind(dat, c(500, 3))
dat$y_log <- log(dat$y)
dat$x_log <- log(dat$x)
dat$y_log_s <- scale(log(dat$y))
dat$x_log_s <- scale(log(dat$x))
mod1 <- lm(y_log ~ x_log, data = dat))
I used the build in dataset from the help page on lm() to create this example. I also just directly used resid() to get the residuals. It's unclear where / why the colored bars would be different, but basically you'd need to add a column to your data.frame that specificies why they are red or blue, then pass that to fill.
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 3.4.4
#example from lm
ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
group <- gl(2, 10, 20, labels = c("Ctl","Trt"))
weight <- c(ctl, trt)
lm.D9 <- lm(weight ~ group)
resids <- data.frame(resid = resid(lm.D9))
#why are some bars red and some blue? No clue - so I'll pick randomly
resids$group <- sample(c("group 1", "group 2"), nrow(resids), replace = TRUE)
#rank
resids$rank <- rank(-1 * resids$resid)
ggplot(resids, aes(rank, resid, fill = group)) +
geom_bar(stat = "identity", width = 1) +
geom_hline(yintercept = c(-1,1), colour = "darkgray", linetype = 2) +
geom_hline(yintercept = c(-2,2), colour = "lightgray", linetype = 1) +
theme_bw() +
theme(panel.grid = element_blank()) +
scale_fill_manual(values = c("group 1" = "red", "group 2" = "blue"))
Created on 2019-01-24 by the reprex package (v0.2.1)

Split violin plot with points on top to indicate info

This is a followup post from here
and here
I have successfully implemented the split violin ggplot2 for my data (two median estimator densities, for two cases) that need to be compared. Now, since i would like to add some confidence interval. I m following the code posted in the links above:
EDIT: A reproducible example
tmp <- rnorm(1000,0,1)
tmp.2 <- rnorm(1000,0,1)
x.1 <- density(tmp)
y.1 <- density(tmp.2)
Here, i m making the densities, extracting the (x,y) pairs. Then i m getting the quantiles back,
# Make densities
densities <- as.data.frame(c(x.1$x,y.1$x))
colnames(densities) <- "loc"
densities$dens <- c(x.1$y,y.1$y)
densities$drop_case <- c(rep("B",512),rep("S",512))
densities$dens <- ifelse(densities$drop_case=="B",densities$dens*-1,densities$dens)
densities$dens <- ifelse(densities$drop_case=="S",densities$dens*1,densities$dens)
conf <- as.data.frame(c(quantile(tmp,c(0.025,0.975))[1],quantile(tmp,c(0.025,0.975))[2],quantile(tmp.2,c(0.025,0.975))[1],quantile(tmp.2,c(0.025,0.975))[2]))
colnames(conf) <- "intervals"
conf$drop_case <- c(rep("B",2),rep("S",2))
conf$length <- rep(1000,4)
Now here i am trying to extract the values inside the densities, as was noted in the linked posts
Find data points in densities
val.tmp <- rep(0,4)
val.tmp.2 <- rep(0,4)
for (i in 1:4) {
x.here <- densities$loc
y.here <- densities$dens
your.number<- conf$intervals[i]
pos.tmp <- which(abs(x.here-your.number)==min(abs(x.here-your.number)))
val.tmp[i] <- x.here[pos.tmp]
val.tmp.2[i] <- y.here[pos.tmp]
}
conf$positions <- val.tmp
conf$length <- val.tmp.2
conf$length <- ifelse(conf$drop_case=="B",conf$length*-1,conf$length)
conf$length <- ifelse(conf$drop_case=="S",conf$length*1,conf$length)
ggplot(densities,aes(dens, loc, fill = factor(drop_case)))+
geom_polygon()+
scale_x_continuous(breaks = 0, name = info$Name)+
ylab('Estimator Density') +
theme(axis.title.x = element_blank())+
geom_point(data = conf, aes(x = positions, y = length, fill = factor(drop_case), group = factor(drop_case))
,shape = 21, colour = "black", show.legend = FALSE)
Then unfortuantely I am facing the following, the points are not mapped on the densities but are rather mapped on the plane.
There is a bunch of little mistakes in the code. Firstly, within that for loop, you can't set x.here and y.here to all of the density and location values, since that includes both groups. Secondly, since the signs are already changed in densities there is no need to use those ifelse statements afterwards. Thirdly, you would only need the top ifelse anyway, since the bottom one does absolutely nothing. Finally, you had the x and y mappings in geom_point the wrong way around!
There is a bunch of other things one could change to make the code more understandable and pretty, but I'm on limited time, so I'll leave those for what they are.
Below the full adjusted code:
tmp <- rnorm(1000,0,1)
tmp.2 <- rnorm(1000,0,1)
x.1 <- density(tmp)
y.1 <- density(tmp.2)
# Make densities
densities <- as.data.frame(c(x.1$x,y.1$x))
colnames(densities) <- "loc"
densities$dens <- c(x.1$y,y.1$y)
densities$drop_case <- c(rep("B",512),rep("S",512))
densities$dens <- ifelse(densities$drop_case=="B",densities$dens*-1,densities$dens)
conf <- as.data.frame(c(quantile(tmp,c(0.025,0.975)), quantile(tmp.2,c(0.025,0.975))))
colnames(conf) <- "intervals"
conf$drop_case <- c(rep("B",2),rep("S",2))
conf$length <- rep(1000,4)
val.tmp <- rep(0,4)
val.tmp.2 <- rep(0,4)
for (i in 1:4) {
x.here <- densities$loc[densities$drop_case == conf$drop_case[i]]
y.here <- densities$dens[densities$drop_case == conf$drop_case[i]]
your.number<- conf$intervals[i]
pos.tmp <- which(abs(x.here-your.number)==min(abs(x.here-your.number)))
val.tmp[i] <- x.here[pos.tmp]
val.tmp.2[i] <- y.here[pos.tmp]
}
conf$positions <- val.tmp
conf$length <- val.tmp.2
ggplot(densities, aes(dens, loc, fill = drop_case)) +
geom_polygon()+
ylab('Estimator Density') +
theme(axis.title.x = element_blank())+
geom_point(data = conf, aes(x = length, y = positions, fill = drop_case),
shape = 21, colour = "black", show.legend = FALSE)
This results in:
I would personally prefer a plot with line segments:
ggplot(densities, aes(dens, loc, fill = factor(drop_case)))+
geom_polygon()+
ylab('Estimator Density') +
theme(axis.title.x = element_blank())+
geom_segment(data = conf, aes(x = length, xend = 0, y = positions, yend = positions))

R ggplot2::geom_density with a constant variable

I have recently came across a problem with ggplot2::geom_density that I am not able to solve. I am trying to visualise a density of some variable and compare it to a constant. To plot the density, I am using the ggplot2::geom_density. The variable for which I am plotting the density, however, happens to be a constant (this time):
df <- data.frame(matrix(1,ncol = 1, nrow = 100))
colnames(df) <- "dummy"
dfV <- data.frame(matrix(5,ncol = 1, nrow = 1))
colnames(dfV) <- "latent"
ggplot() +
geom_density(data = df, aes(x = dummy, colour = 's'),
fill = '#FF6666', alpha = 0.2, position = "identity") +
geom_vline(data = dfV, aes(xintercept = latent, color = 'ls'), size = 2)
This is OK and something I would expect. But, when I shift this distribution to the far right, I get a plot like this:
df <- data.frame(matrix(71,ncol = 1, nrow = 100))
colnames(df) <- "dummy"
dfV <- data.frame(matrix(75,ncol = 1, nrow = 1))
colnames(dfV) <- "latent"
ggplot() +
geom_density(data = df, aes(x = dummy, colour = 's'),
fill = '#FF6666', alpha = 0.2, position = "identity") +
geom_vline(data = dfV, aes(xintercept = latent, color = 'ls'), size = 2)
which probably means that the kernel estimation is still taking 0 as the centre of the distribution (right?).
Is there any way to circumvent this? I would like to see a plot like the one above, only the centre of the kerner density would be in 71 and the vline in 75.
Thanks
Well I am not sure what the code does, but I suspect the geom_density primitive was not designed for a case where the values are all the same, and it is making some assumptions about the distribution that are not what you expect. Here is some code and a plot that sheds some light:
# Generate 10 data sets with 100 constant values from 0 to 90
# and then merge them into a single dataframe
dfs <- list()
for (i in 1:10){
v <- 10*(i-1)
dfs[[i]] <- data.frame(dummy=rep(v,100),facet=v)
}
df <- do.call(rbind,dfs)
# facet plot them
ggplot() +
geom_density(data = df, aes(x = dummy, colour = 's'),
fill = '#FF6666', alpha = 0.5, position = "identity") +
facet_wrap( ~ facet,ncol=5 )
Yielding:
So it is not doing what you thought it was, but it is also probably not doing what you want. You could of course make it "translation-invariant" (almost) by adding some noise like this for example:
set.seed(1234)
noise <- +rnorm(100,0,1e-3)
dfs <- list()
for (i in 1:10){
v <- 10*(i-1)
dfs[[i]] <- data.frame(dummy=rep(v,100)+noise,facet=v)
}
df <- do.call(rbind,dfs)
ggplot() +
geom_density(data = df, aes(x = dummy, colour = 's'),
fill = '#FF6666', alpha = 0.5, position = "identity") +
facet_wrap( ~ facet,ncol=5 )
Yielding:
Note that there is apparently a random component to the geom_density function, and I can't see how to set the seed before each instance, so the estimated density is a bit different each time.

Bar plot of group means with lines of individual results overlaid

this is my first stack overflow post and I am a relatively new R user, so please go gently!
I have a data frame with three columns, a participant identifier, a condition (factor with 2 levels either Placebo or Experimental), and an outcome score.
set.seed(1)
dat <- data.frame(Condition = c(rep("Placebo",10),rep("Experimental",10)),
Outcome = rnorm(20,15,2),
ID = factor(rep(1:10,2)))
I would like to construct a bar plot with two bars with the mean outcome score for each condition and the standard deviation as an error bar. I would like to then overlay lines connecting points for each participant's score in each condition. So the plot displays the individual response as well as the group mean.If it is also possible I would like to include an axis break.
I don't seem to be able to find any advice in other threads, apologies if I am repeating a question.
Many Thanks.
p.s. I realise that presenting data in this way will not be to everyones tastes. It is for a specific requirement!
This ought to work:
library(ggplot2)
library(dplyr)
dat.summ <- dat %>% group_by(Condition) %>%
summarize(mean.outcome = mean(Outcome),
sd.outcome = sd(Outcome))
ggplot(dat.summ, aes(x = Condition, y = mean.outcome)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = mean.outcome - sd.outcome,
ymax = mean.outcome + sd.outcome),
color = "dodgerblue", width = 0.3) +
geom_point(data = dat, aes(x = Condition, y = Outcome),
color = "firebrick", size = 1.2) +
geom_line(data = dat, aes(x = Condition, y = Outcome, group = ID),
color = "firebrick", size = 1.2, alpha = 0.5) +
scale_y_continuous(limits = c(0, max(dat$Outcome)))
Some people are better with ggplot's stat functions and arguments than I am and might do it differently. I prefer to just transform my data first.
set.seed(1)
dat <- data.frame(Condition = c(rep("Placebo",10),rep("Experimental",10)),
Outcome = rnorm(20,15,2),
ID = factor(rep(1:10,2)))
dat.w <- reshape(dat, direction = 'wide', idvar = 'ID', timevar = 'Condition')
means <- colMeans(dat.w[, 2:3])
sds <- apply(dat.w[, 2:3], 2, sd)
ci.l <- means - sds
ci.u <- means + sds
ci.width <- .25
bp <- barplot(means, ylim = c(0,20))
segments(bp, ci.l, bp, ci.u)
segments(bp - ci.width, ci.u, bp + ci.width, ci.u)
segments(bp - ci.width, ci.l, bp + ci.width, ci.l)
segments(x0 = bp[1], x1 = bp[2], y0 = dat.w[, 2], y1 = dat.w[, 3], col = 1:10)
points(c(rep(bp[1], 10), rep(bp[2], 10)), dat$Outcome, col = 1:10, pch = 19)
Here is a method using the transfomations inside ggplot2
ggplot(dat) +
stat_summary(aes(x=Condition, y=Outcome, group=Condition), fun.y="mean", geom="bar") +
stat_summary(aes(x=Condition, y=Outcome, group=Condition), fun.data="mean_se", geom="errorbar", col="green", width=.8, size=2) +
geom_line(aes(x=Condition, y=Outcome, group=ID), col="red")

Resources