Looping ggplot categorical variables - r

I am a noob, so hope this makes sense...
Question/problem statement
I need to create a number of plots, where the only difference in each of the plots is the group used - each group contains categorical variables. I have got this to work by manually typing out all of the code.
Instead of manually writing each of the groups into R, I want to develop a loop to automate this plotting process.
Current manual method
This works, but is tedious and I want to automate through a loop - just an example with 2 of my 9 groups.
The only thing that changes in each is the factor and titles
# GOR
# dailyCV for each level of GOR, with flipped coordinates: jittered raw points,
# a boxplot built from min.mean.sd.max, the rounded mean as text, the text
# produced by give.n, and the mean as a dark red point.
ggplot(data = mergedbed, mapping = aes(x = factor(GOR), y = dailyCV)) +
  geom_jitter(
    position = position_jitter(width = 0.2),
    colour = "tomato", alpha = 0.5, size = 0.1
  ) +
  stat_summary(geom = "boxplot", fun.data = min.mean.sd.max, alpha = 0.5) +
  stat_summary(
    aes(label = round(..y.., digits = 1)),
    geom = "text", fun.y = mean,
    colour = "black", size = 3, hjust = 1.5, vjust = 0.5
  ) +
  stat_summary(geom = "text", fun.data = give.n, size = 3, hjust = -2, vjust = 1) +
  coord_flip() +
  stat_summary(geom = "point", fun.y = mean, colour = "darkred", size = 2) +
  labs(
    x = "GOR",
    y = "Co-efficient of variation (%)",
    title = "GOR vs dailyCV"
  )
# ACCOM_EHCS
# dailyCV for each level of ACCOM_EHCS, with flipped coordinates: jittered raw
# points, a boxplot built from min.mean.sd.max, the rounded mean as text, the
# text produced by give.n, and the mean as a dark red point.
ggplot(data = mergedbed, mapping = aes(x = factor(ACCOM_EHCS), y = dailyCV)) +
  geom_jitter(
    position = position_jitter(width = 0.2),
    colour = "tomato", alpha = 0.5, size = 0.1
  ) +
  stat_summary(geom = "boxplot", fun.data = min.mean.sd.max, alpha = 0.5) +
  stat_summary(
    aes(label = round(..y.., digits = 1)),
    geom = "text", fun.y = mean,
    colour = "black", size = 3, hjust = 1.5, vjust = 0.5
  ) +
  stat_summary(geom = "text", fun.data = give.n, size = 3, hjust = -2, vjust = 1) +
  coord_flip() +
  stat_summary(geom = "point", fun.y = mean, colour = "darkred", size = 2) +
  labs(
    x = "ACCOM_EHCS",
    y = "Co-efficient of variation (%)",
    title = "ACCOM_EHCS vs dailyCV"
  )
My attempt
My attempt here was to create a vector with each of the groups and then try to loop over it, but it doesn't work and I'm sure it's very wrong. This is my first time attempting to create a loop.
myvariables <- c("GOR","ACCOM_EHCS","DBL_GLAZ", "BUILDING_AGE", "HhdSize", "Inc_Group_7s", "Person_Under_5", "Person_Over_64", "thermal")
# Attempted loop: calls the plotting code once per column name in myvariables
# and returns the resulting plot objects as a list.
lapply(myvariables, function(cc){
# NOTE(review): aes_string() builds a complete aesthetic mapping rather than a
# column, so wrapping it in factor() inside aes() does not select the column
# named by cc -- presumably the reason this attempt fails; confirm.
p <- ggplot(aes(y = dailyCV, x = factor(aes_string(y = cc))), data = mergedbed) +
geom_jitter(alpha=1/2, color="tomato", position=position_jitter(width=.2), size=1/10) +
stat_summary(fun.data = min.mean.sd.max, geom = "boxplot", alpha = 0.5) +
stat_summary(fun.y=mean, colour="black", geom="text",
vjust=0.5, hjust=1.5, size=3, aes( label=round(..y.., digits=1))) +
stat_summary(fun.data = give.n, geom = "text", vjust=1, hjust=-2, size=3) +
coord_flip() +
stat_summary(fun.y = mean, geom="point",colour="darkred", size=2) +
# "???" is a placeholder: in the manual versions the axis label and the title
# carry the name of the variable being plotted (here held in cc).
xlab("???")+
ylab("Co-efficient of variation (%)")+
ggtitle("??? vs dailyCV")
# Return the plot so lapply() collects it.
p
})
Thank you in advance

Here is an example using the iris dataset and purrr:
library(tidyverse)
data(iris)
## create a grid with variable combinations
# Every (x, y) pairing of the iris column names other than Species.
# stringsAsFactors = FALSE keeps the names as character strings; spelled out
# as FALSE because the shorthand F is an ordinary variable that can be
# reassigned.
variables <- iris %>%
  select(everything(), -Species) %>%
  names() %>%
  expand.grid(x = ., y = ., stringsAsFactors = FALSE)
##create plotting function
# Scatter plot of two columns of `data`, chosen by name.
#   data: data frame passed to ggplot().
#   x, y: column names as strings; also used to build the "<x> vs <y>" title.
# Returns the ggplot object.
plot_data <- function(data, x, y) {
  plot_title <- paste(x, "vs", y)
  scatter <- ggplot(data, aes_string(x, y)) + geom_point()
  scatter + ggtitle(plot_title)
}
# One iris scatter plot per row of the variable grid.
map2(
  variables$x,
  variables$y,
  function(x_name, y_name) plot_data(iris, x_name, y_name)
)
This creates all variable combinations of plots and changes the title.

Related

ggplot2 - How to add additional text labels

I am having some trouble with ggplot and stat_summary.
Please consider following data:
# Preview mtcars, then work on a copy that carries an extra column hp2 equal
# to hp shifted up by 50.
head(mtcars)
data <- mtcars
data[["hp2"]] <- data[["hp"]] + 50
Please consider following code:
# Red line through the mean hp at each cyl value, with the rounded mean added
# as a text label.
# show.legend replaces the deprecated show_guide argument; group = 1 is set
# once, inside aes(), so all cyl values are joined by a single line.
ggplot(mtcars, aes(x = cyl, y = hp)) +
  stat_summary(aes(y = hp, group = 1), fun.y = mean, colour = "red", geom = "line") +
  stat_summary(
    aes(label = round(..y.., digits = 0)),
    fun.y = mean, colour = "red", geom = "text",
    show.legend = FALSE, vjust = -0.7
  )
The code will produce a line plot with the means of hp, and text labels for the means as well. If we would like to add another line/curve we simply have to add:
# Same plot with a second, blue line for the mean of hp2.
# The plot is built from `data` (the mtcars copy that has the hp2 column);
# mtcars itself has no hp2, so mapping y = hp2 against it cannot be resolved.
# show.legend replaces the deprecated show_guide argument.
ggplot(data, aes(x = cyl, y = hp)) +
  stat_summary(aes(y = hp, group = 1), fun.y = mean, colour = "red", geom = "line") +
  stat_summary(
    aes(label = round(..y.., digits = 0)),
    fun.y = mean, colour = "red", geom = "text",
    show.legend = FALSE, vjust = -0.7
  ) +
  stat_summary(aes(y = hp2), fun.y = mean, colour = "blue", geom = "line", group = 1)
Now comes the tricky part:
How to use stat_summary with geom="text" but for the hp2 i.e. how to technically force stat_summary to calculate means on hp2 and print the text labels? It seems that I can only use it for the "main" y.
This type of problem, that asks for graphs of related vector columns, is almost always a wide-to-long data format reshaping problem.
library(ggplot2)
# Stack hp and hp2 into one `value` column keyed by `variable`, keeping cyl as
# the identifier, so a single colour mapping draws both series.
data_long <- reshape2::melt(data[, c("cyl", "hp", "hp2")], id.vars = "cyl")
head(data_long)
ggplot(data = data_long, mapping = aes(x = cyl, y = value, colour = variable)) +
  stat_summary(geom = "line", fun.y = mean, show.legend = FALSE) +
  stat_summary(
    mapping = aes(label = round(..y.., digits = 0)),
    geom = "text", fun.y = mean,
    vjust = -0.7, show.legend = FALSE
  ) +
  scale_color_manual(values = c("red", "blue"))

Make overlapping histogram in with geom_histogram

I am trying to make an overlapping histogram like this:
# Attempt at overlaying three histograms: each geom_histogram() layer gets its
# own melted data frame and plots the bin counts as a percentage of that
# layer's total count, drawn semi-transparent (alpha = 0.2).
# NOTE(review): `aes = (x)` is not an argument of ggplot() that I can confirm;
# the mapping is already given via `mapping = aes(x = value)` -- verify.
ggplot(histogram, aes = (x), mapping = aes(x = value)) +
geom_histogram(data = melt(tpm_18_L_SD), breaks = seq(1,10,by = 1),
aes(y = 100*(..count../sum(..count..))), alpha=0.2) +
geom_histogram(data = melt(tpm_18_S_SD), breaks = seq(1,10,by = 1),
aes(y = 100*(..count../sum(..count..))), alpha=0.2) +
geom_histogram(data = melt(tpm_18_N_SD), breaks = seq(1,10,by = 1),
aes(y = 100*(..count../sum(..count..))), alpha=0.2) +
# One panel per value of `variable`, each with its own x scale.
facet_wrap(~variable, scales = 'free_x') +
ylim(0, 20) +
ylab("Percentage of Genes") +
xlab("Standard Deviation")
My code can only make them plot side by side and I would like to also make them overlap. Thank you! I based mine off of the original post where this came from but it did not work for me. It was originally 3 separate graphs which I combined with grid and ggarrange. It looks like this right now.
Here is the code of the three separate graphs.
# Histogram of the melted tpm_18_L_SD values, with bin counts expressed as a
# percentage of the total count and one panel per `variable`.
SD_18_L <- ggplot(melt(tpm_18_L_SD), aes(x = value)) +
  geom_histogram(
    mapping = aes(y = 100 * (..count.. / sum(..count..))),
    breaks = seq(from = 1, to = 10, by = 1)
  ) +
  facet_wrap(~variable, scales = "free_x") +
  ylim(0, 20) +
  labs(x = "Standard Deviation", y = "Percentage of Genes")
# Histogram of the melted tpm_18_S_SD values, with bin counts expressed as a
# percentage of the total count and one panel per `variable`.
SD_18_S <- ggplot(melt(tpm_18_S_SD), aes(x = value)) +
  geom_histogram(
    mapping = aes(y = 100 * (..count.. / sum(..count..))),
    breaks = seq(from = 1, to = 10, by = 1)
  ) +
  facet_wrap(~variable, scales = "free_x") +
  ylim(0, 20) +
  labs(x = "Standard Deviation", y = "Percentage of Genes")
# Histogram of the melted tpm_18_N_SD values, with bin counts expressed as a
# percentage of the total count and one panel per `variable`.
SD_18_N <- ggplot(melt(tpm_18_N_SD), aes(x = value)) +
  geom_histogram(
    mapping = aes(y = 100 * (..count.. / sum(..count..))),
    breaks = seq(from = 1, to = 10, by = 1)
  ) +
  facet_wrap(~variable, scales = "free_x") +
  ylim(0, 20) +
  labs(x = "Standard Deviation", y = "Percentage of Genes")
What my graphs look like now:
ggplot expects dataframes in a long format. I'm not sure what your data looks like, but you shouldn't have to call geom_histogram for each category. Instead, get all your data into a single dataframe (you can use rbind for this) in long format (what you're doing already with melt) first, then feed it into ggplot and map fill to whatever your categorical variable is.
Your call to facet_wrap is what puts them in 3 different plots. If you want them all on the same plot, take that line out.
An example using the iris data:
# Overlaid (not stacked) semi-transparent histograms of Sepal.Length, one fill
# colour per Species.
ggplot(data = iris, mapping = aes(x = Sepal.Length, fill = Species)) +
  geom_histogram(position = "identity", alpha = 0.6)
I decreased alpha in geom_histogram so you can see where colors overlap, and added position = "identity" so observations aren't being stacked. Hope that helps!

how to combine 2 DVs in one graph using ggplot

I am trying to combine 2 dependent variables (or 2 graphs) in one graph using ggplot function. All the suggestions I could find online were not really helpful in my case.
# First plot: mean of conf per age as points joined by a line, with
# mean_cl_boot error bars; y axis limited to 0..1.
Graph1 <- ggplot(mydata, aes(age, conf))
Graph1 + stat_summary(fun.y = mean, geom = "point") +
# group = 1 puts all age groups in one group so the line connects them.
stat_summary(fun.y = mean, geom = "line", aes(group = 1)) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
labs(x = "Age Group", y = "Accuracy (%)") + ylim(0, 1)
# Second plot: same layout for acc, with a dashed line.
Graph2 <- ggplot(mydata, aes(age, acc))
# NOTE(review): fun.y = percent(1) passes the result of calling percent(1)
# rather than a summary function; the other layers use mean -- confirm intent.
Graph2 + stat_summary(fun.y = percent(1), geom = "point") +
stat_summary(fun.y = mean, geom = "line", aes(group = 1), linetype = "dashed") +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
labs(x = "Age Group", y = "Accuracy (%)") + ylim(0, 1)
In addition to this, I will need to have the means and error bars not overlapping. Any advice would be greatly appreciated.
After further investigation I have found the following suggestion, which seems to be a great solution. However, I cannot install tidyr as it is incompatible with my current R version. I have tried different options to download the package, without success.
library(tidyr)
# Stack Home.Value and Land.Value into a single `value` column labelled by
# `type`, then draw one coloured line per type over Date.
home.land.byyear <- gather(
  housing.byyear,
  key = "type", value = "value",
  Home.Value, Land.Value
)
ggplot(data = home.land.byyear, mapping = aes(x = Date, y = value, colour = type)) +
  geom_line()
see http://tutorials.iq.harvard.edu/R/Rgraphics/Rgraphics.html

full text label on Boxplot, with added mean point

I am trying to get text labels similar to those in https://stats.stackexchange.com/questions/8206/labeling-boxplots-in-r, but I can't get it to work. An MWE similar to what I have is this:
# Example data: 5 columns, each 100 draws with replacement from 0..100.
# `replace` is spelled out in full instead of relying on partial matching of
# the abbreviation `rep`.
data <- data.frame(replicate(5, sample(0:100, 100, replace = TRUE)))
# Summary function for stat_summary(fun.data = ...).
#   x: numeric vector of the values in one group.
# Returns a one-row data frame with `y` (where the label is placed) and
# `label` (the text), both the mean of x rounded to 2 decimals.
# The mean is computed once with na.rm = TRUE so `y` and `label` always agree;
# computing `y` without na.rm made it NA whenever x contained missing values.
meanFunction <- function(x) {
  rounded_mean <- round(mean(x, na.rm = TRUE), 2)
  data.frame(y = rounded_mean, label = rounded_mean)
}
# Boxplot of every melted column, with the mean drawn as a dark red point and
# the text returned by meanFunction added as a label.
ggplot(data = melt(data), mapping = aes(x = variable, y = value)) +
  geom_boxplot(mapping = aes(fill = variable), width = 0.7) +
  stat_summary(geom = "point", fun.y = mean, colour = "darkred", size = 4) +
  stat_summary(geom = "text", fun.data = meanFunction, size = 4, vjust = 1.3)
That produces something like "A" in the attached image, and I am trying to get something like "B" for each of the boxes. Thanks.
Here is my attempt. First, I reshaped your data. Then, I produced your boxplot. I changed the size and colour of text for mean. Then, I looked into the data that ggplot used, which you can access using ggplot_build(objectname)$data[[1]]. You can see the numbers you need. I selected necessary variables and reshaped the data, which is df. Using df, you can annotate the numbers you want.
library(dplyr)
library(tidyr)
library(ggplot2)
# Fixed seed so the random example data is reproducible.
set.seed(123)
# 5 columns, each 100 draws with replacement from 0..100. `replace` is spelled
# out in full instead of relying on partial matching of `rep`.
mydf <- data.frame(replicate(5, sample(0:100, 100, replace = TRUE)))
mydf <- gather(mydf, variable, value)
# Summary function for stat_summary(fun.data = ...).
#   x: numeric vector of the values in one group.
# Returns a one-row data frame with `y` (where the label is placed) and
# `label` (the text), both the mean of x rounded to 2 decimals.
# The mean is computed once with na.rm = TRUE so `y` and `label` always agree;
# computing `y` without na.rm made it NA whenever x contained missing values.
meanFunction <- function(x) {
  rounded_mean <- round(mean(x, na.rm = TRUE), 2)
  data.frame(y = rounded_mean, label = rounded_mean)
}
# Boxplot per variable (filled by variable), with the mean drawn as a dark red
# point and the text returned by meanFunction added in white.
g <- ggplot(mydf, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(width = 0.5) +
  stat_summary(geom = "point", fun.y = mean, colour = "darkred", size = 4) +
  stat_summary(
    geom = "text", fun.data = meanFunction,
    colour = "white", size = 3, vjust = 1.3
  )
# Take the data ggplot computed for the first layer of g, keep the columns
# ymin through ymax plus x, stack them into type/value pairs and order by x.
df <- ggplot_build(g)$data[[1]] %>%
  select(ymin:ymax, x) %>%
  gather(key = type, value = value, -x) %>%
  arrange(x)
# Print each of those values as text, shifted 0.4 along x from its box.
g +
  annotate(
    geom = "text",
    x = df$x + 0.4, y = df$value, label = df$value,
    size = 3
  )
First, I would take your data and then calculate all the boxplot features yourself. Here's one way to do that
# Example data: 5 columns, each 100 draws with replacement from 0..100.
# `replace` is spelled out in full instead of relying on partial matching.
dd <- data.frame(replicate(5, sample(0:100, 100, replace = TRUE)))
# Per column: the five boxplot.stats() values followed by the mean.
# vapply() pins the result to exactly six numbers per column, so the shape
# cannot silently change the way it can with sapply().
box_summary <- vapply(
  dd,
  function(x) c(boxplot.stats(x)$stats, mean(x)),
  numeric(6)
)
# One row per original column, one named column per statistic.
tt <- data.frame(t(box_summary))
names(tt) <- c("ymin", "lower", "middle", "upper", "ymax", "mean")
# Keep the original column name as a factor for use on the x axis.
tt$var <- factor(rownames(tt))
I'm sure there are prettier ways to do that with dplyr, but the point is you'll need to calculate those values yourself so you know where to draw the labels. Then you can do
# Boxplots drawn from the precomputed statistics in tt (stat = "identity"),
# with each statistic printed next to its box and the mean shown as a point
# with its own label.
# formatC() is called with digits = 1 by name: its second positional argument
# is `width`, so passing 1 positionally did not limit the labels to one
# decimal place.
ggplot(tt) +
  geom_boxplot(
    aes(x = var, ymin = ymin, lower = lower, middle = middle, upper = upper, ymax = ymax),
    stat = "identity", width = .5
  ) +
  # as.numeric(var) + .3 shifts the labels along x, away from the box.
  geom_text(aes(x = as.numeric(var) + .3, y = middle, label = formatC(middle, digits = 1, format = "f")), hjust = 0) +
  geom_text(aes(x = as.numeric(var) + .3, y = lower, label = formatC(lower, digits = 1, format = "f")), hjust = 0) +
  geom_text(aes(x = as.numeric(var) + .3, y = upper, label = formatC(upper, digits = 1, format = "f")), hjust = 0) +
  geom_text(aes(x = as.numeric(var) + .3, y = ymax, label = formatC(ymax, digits = 1, format = "f")), hjust = 0) +
  geom_text(aes(x = as.numeric(var) + .3, y = ymin, label = formatC(ymin, digits = 1, format = "f")), hjust = 0) +
  geom_point(aes(x = var, y = mean)) +
  geom_text(aes(x = as.numeric(var), y = mean, label = formatC(mean, digits = 1, format = "f")), hjust = .5, vjust = 1.5)
to draw each of the labels

to show mean value in ggplot box plot

I need to be able to show the mean value in a ggplot box plot. The code below works for a point, but I need white dashed lines. Can anybody help?
x
Team Value
A 10
B 5
C 29
D 35
# Boxplot of Value per Team with the mean drawn as a red point.
# Each `+` sits at the END of its line: a line that starts with `+` is parsed
# as a new expression, so the layers would never be added to the plot.
ggplot(aes(x = Team, y = Value), data = x) +
  geom_boxplot(aes(fill = Team), alpha = .25, width = 0.5, position = position_dodge(width = .9)) +
  stat_summary(fun.y = mean, colour = "red", geom = "point")
Here's my way of adding mean to boxplots:
# Boxplot of engagementPercent per Type (Set2 fill palette); the mean is
# marked with a white "----" text label and both axis titles are removed.
ggplot(data = RQA, mapping = aes(x = Type, y = engagementPercent)) +
  geom_boxplot(mapping = aes(fill = Type), alpha = 0.6, size = 1) +
  scale_fill_brewer(palette = "Set2") +
  stat_summary(
    geom = "text", fun.y = "mean",
    label = "----", colour = "white", size = 10
  ) +
  labs(title = "Participation distribution by type") +
  theme(
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  )
# Boxplot of scorepercent per Type (Set2 fill palette); the mean is marked
# with a white-filled point of shape 23 and both axis titles are removed.
ggplot(data = df, mapping = aes(x = Type, y = scorepercent)) +
  geom_boxplot(mapping = aes(fill = Type), alpha = 0.6, size = 1) +
  scale_fill_brewer(palette = "Set2") +
  stat_summary(
    geom = "point", fun.y = "mean",
    shape = 23, size = 3, fill = "white"
  ) +
  labs(title = "score distribution by type") +
  theme(
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  )
I would caution against using text to do this and would use geom_line instead, as text is offset slightly and gives the wrong portrayal of the mean.
Hey user1471980, I think people are more inclined to help if you have a unique user name but then again you have a lot of points :)
this is a hack but does this help:
# Tiny example: two teams of three values each.
Team <- rep(c("a", "b"), each = 3)
Value <- c(1, 2, 3, 4, 5, 6)
x <- data.frame(Team, Value) # note means for a=2, mean for b=5
# Boxplot per team, with a white "----" text label placed by hand at each
# team's mean (x = 1, y = 2 and x = 2, y = 5).
ggplot(data = x, mapping = aes(x = Team, y = Value)) +
  geom_boxplot(
    mapping = aes(fill = Team),
    position = position_dodge(width = 0.9),
    alpha = 0.25, width = 0.5
  ) +
  annotate(
    geom = "text", x = 1, y = 2, label = "----",
    colour = "white", size = 7, fontface = "bold", angle = 0
  ) +
  annotate(
    geom = "text", x = 2, y = 5, label = "----",
    colour = "white", size = 7, fontface = "bold", angle = 0
  )

Resources