Plot Grouped bar graph with calculated standard deviation in ggplot - r

I feel like this should be really easy to do, but I'm having a really hard time figuring this out.
I have a data frame
type <- c("a","b","c","d","e")
x <- rnorm(5)
y <- rnorm(5)
z <- rnorm(5)
xsd <- sd(x)
ysd <- sd(y)
zsd <- sd(z)
df <- data.frame(type, x,y,z,xsd,ysd,zsd)
df
type x y z xsd ysd zsd
1 a -1.16788106 0.2260430 -1.16788106 0.8182508 0.7321015 0.9016335
2 b -0.09955193 -0.6647980 -0.09955193 0.8182508 0.7321015 0.9016335
3 c -0.87901053 -0.4269936 -0.87901053 0.8182508 0.7321015 0.9016335
4 d -0.87861339 -1.3669793 -0.87861339 0.8182508 0.7321015 0.9016335
5 e 0.84350228 0.4702580 0.84350228 0.8182508 0.7321015 0.9016335
and I need a grouped bar graph of the mean of x, y, and z by type with error bars showing the standard deviation for each variable. The standard deviation is in different columns xsd,ysdand zsd
I need to plot the mean in the y axis, type grouping the x, y, z variables in the x axis.
I tried using gather(), to rearrange the data, but I'm not having any success...

Let ggplot2 do the calculations for you:
install.packages("hmisc") # for mean_sdl
library(tidyverse)
type <- c("a","b","c","d","e")
x <- rnorm(5, 10, 5)
y <- rnorm(5, 8, 3)
z <- rnorm(5, 2, 4)
df <- data.frame(type,x,y,z)
df_long <- df %>%
gather(variable, value, x:z)
ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
stat_summary(fun.y = "mean", geom = "col") +
stat_summary(fun.data = mean_sdl, geom = "errorbar", width = .5, fun.args = list(mult = 1))

This example should help:
type <- c("a","b","c","d","e")
x <- rnorm(50,20, 5)
y <- rnorm(50, 25,1)
z <- rnorm(50, 40, 1)
df <- data.frame(type, x,y,z)
df
library(tidyverse)
df %>%
gather(x,value,-type) %>%
group_by(type, x) %>%
summarise(MEAN = mean(value),
SD = sd(value)) %>%
ggplot(aes(x, MEAN, fill=type))+
geom_bar(stat="identity", position = "dodge")+
geom_errorbar(aes(ymin=MEAN-SD, ymax=MEAN+SD), position = "dodge")

Related

How do I randomly classify my coordinate data in R

I have written a code that generates x and y data and am able to plot it.
# Number of observations
n <- 250
# x randomly drawn from a continuous uniform distribution with bounds [0,10]
x <- runif(min = 0, max = 1, n = sample(n))
# Error term from Normal distribution
error <- rnorm(n = n, mean = 0, sd = 2)
beta_0 <- 1
beta_1 <- -1
y <- beta_0*x + (beta_1*x - error)
library(tibble)
df <- tibble(x = x, y = y)
df
library(ggplot2)
ggplot(data = df, aes(x = x, y = y)) + geom_point()
labs(title = "y = f(x)")
I get an graph image like this:
I also get a data table like this of different coordinate data:
x
y.
0.139
-2.87
0.981
1.48
I would like to now randomly classify my data, such that my table looks like:
x
y.
Group1
Group2
0.139
-2.87
-1
1
0.981
1.48
1
-1
Where 1 represents that points membership to the group and -1 representing the point not being affiliated to the group. On the graph this would look like I had blue dots for Group1 membership vs red dots for Group2 membership.
Any help with this would be greatly appreciated.
Thank you.
To do it the way you suggested (with one column for group 1 and one column for group 2), you could do:
library(dplyr)
library(ggplot2)
df %>%
mutate(group1 = sample(c(-1, 1), n, TRUE),
group2 = -group1) %>%
ggplot(aes(x = x, y = y, color = factor(group1))) +
geom_point() +
scale_color_brewer('group', palette = 'Set1',
labels = c('Group 1', 'Group 2')) +
labs(title = "y = f(x)")
However, it seems a bit redundant to me having two mutually exclusive binary columns. You could just have a single column called group which is either group 1 or group 2:
df %>%
mutate(group = sample(c('Group 1', 'Group 2'), n, TRUE)) %>%
ggplot(aes(x = x, y = y, color = group)) +
geom_point() +
scale_color_brewer(palette = 'Set1') +
labs(title = "y = f(x)"

ggplot - use data passed to ggplot to calculate the mean of the data in subsequent geom calls [duplicate]

I was wondering why variable mean_y is not recognized by my
geom_hline(yintercept = unique(mean_y)) call?
library(tidyverse)
set.seed(20)
n_groups <- 2
n_in_group <- 20
sd_e = 2
groups <- gl(n_groups, n_in_group, labels = c("T","C"))
age <-rnorm(length(groups), 25, 3)
betas <- c(5,0,0,2)
dat <- data.frame(groups=groups,age=age)
X <- model.matrix(~ groups * age, data = dat)
lin_pred <- as.vector(X %*% betas)
dat$y <- rnorm(nrow(X), lin_pred, sd_e)
dat %>% group_by(groups) %>% mutate(mean_y = mean(y)) %>%
ungroup() %>%
ggplot()+aes(x = age, y = y) +
geom_point(aes(color=groups)) +
geom_hline(yintercept = unique(mean_y)) # Error in unique(mean_y) :
# object 'mean_y' not found
Variables need to be inside aes(), try:
geom_hline(aes(yintercept = mean_y))

data column not recognized in the ggplot geom_hline

I was wondering why variable mean_y is not recognized by my
geom_hline(yintercept = unique(mean_y)) call?
library(tidyverse)
set.seed(20)
n_groups <- 2
n_in_group <- 20
sd_e = 2
groups <- gl(n_groups, n_in_group, labels = c("T","C"))
age <-rnorm(length(groups), 25, 3)
betas <- c(5,0,0,2)
dat <- data.frame(groups=groups,age=age)
X <- model.matrix(~ groups * age, data = dat)
lin_pred <- as.vector(X %*% betas)
dat$y <- rnorm(nrow(X), lin_pred, sd_e)
dat %>% group_by(groups) %>% mutate(mean_y = mean(y)) %>%
ungroup() %>%
ggplot()+aes(x = age, y = y) +
geom_point(aes(color=groups)) +
geom_hline(yintercept = unique(mean_y)) # Error in unique(mean_y) :
# object 'mean_y' not found
Variables need to be inside aes(), try:
geom_hline(aes(yintercept = mean_y))

draw vertical lines in ggplot with faceting

I have line plots y vs x. y is sigmoid and varies from 0 to 1.
determine the value of x where y = 0.5 or very close by interpolation.
draw vertical line at x where y = 0.5
library(tidyverse)
# continuous variables
x <- seq(-5, 5, 0.1)
# compute y1
error_term <- runif(1, min = -2, max = 2)
y1 <- 1/(1 + exp(-x + error_term))
# compute y2
error_term <- runif(1, min = -2, max = 2)
y2 <- 1/(1 + exp(-x + error_term))
# merge y
y <- c(y1, y2)
x <- c(x, x)
# categorical variable
a <- c(rep(0, 101), rep(1, 101))
tbl <- tibble(x, a, y)
# TASK
# 1. determine values of x at which y = 0.5 for all categories and store them in variable x0
# 2. Use x0 to draw vertical lines in plots at x where y is 0.5
# ggplot
ggplot(data = tbl,
aes(x = x,
y = y)) +
geom_line() +
theme_bw() +
facet_grid(a ~ .)
This really isn't something built in to ggplot so you'll need to summarize the data yourself prior to plotting. You can write a helper function and then create the data you need for the lines
find_intersect <- function(x,y, target=0.5) {
optimize(function(z) (approxfun(x,y)(z)-target)^2, x)$minimum
}
line_data <- tbl %>%
group_by(a) %>%
summarize(xint=find_intersect(x,y))
Then plot with
ggplot(data = tbl,
aes(x = x,
y = y)) +
geom_line() +
theme_bw() +
geom_vline(aes(xintercept=xint), data=line_data) +
facet_grid(a ~ .)

ggplot2: How to get geom_text() to play nice with facet_grid()?

So I'm trying to plot a couple of curves using ggplot(), and I would like to have each curve sitting in its own plot in a facet_grid. All of this works fine.
The problem is that I'd also like to annotate the curve with the x value corresponding to the peak y value. I tried using geom_text(), and I tried implementing it as shown below, but it doesn't seem to quite work. It's clearly printing something onto the plot, but not the way I hoped it would; i.e., each plot has its corresponding x value printed on it at the location (x, max(y)).
I suspect I've not implemented the ifelse() correctly, but I'm not experienced enough with R to figure out what exactly the problem is.
Any suggestions on where I'm going wrong?
Output:
Data + code:
library('ggplot2')
x <- seq(5, 15, length=1000)
y <- dnorm(x, mean=10, sd=1)
z <- rep_len("z", length.out = 1000)
x1 <- seq(5, 15, length=1000)
y1 <- dnorm(x1, mean=10, sd=2)
z1 <- rep_len("z1", length.out = 1000)
x <- c(x, x1)
y <- c(y, y1)
z <- c(z, z1)
df <- data.frame(x, y, z)
ggplot(data = df, aes(x, y)) + geom_line() + facet_grid(.~z) + geom_text(data = df, aes(x, y, label = ifelse(y == max(y), as.numeric(x), '')), inherit.aes = FALSE, hjust = 0, vjust = 0)
Edit: the output I'm expecting is something like this:
You need to fix two things.
(1) calculate max per z
(2) avoid duplicate y_values
The following code should fix both:
library(dplyr)
df2 <- df %>%
distinct(y, .keep_all = TRUE) %>%
group_by(z) %>%
mutate(y_label = ifelse(y == max(y), as.numeric(x), ''))
as.data.frame(df2)
ggplot(data = df2, aes(x, y)) + geom_line() + facet_grid(.~z) + geom_text(aes(label = y_label), hjust = 0, vjust = 0)
You need to provide geom_text a data.frame with data for z and z1.
x y z
z 9.994995 0.3989373 z
z1 9.994995 0.1994705 z1
How to get that? Well, here's one way.
df.split <- split(df, f = df$z)
df.max <- sapply(df.split, FUN = function(x) which.max(x$y))
df.max <- mapply(function(x1, x2) x1[x2, ], x1 = df.split, x2 = df.max, SIMPLIFY = FALSE)
df.max <- do.call(rbind, df.max)
which you can then plot
ggplot(data = df, aes(x, y)) +
geom_line() +
geom_text(data = df.max, aes(x = x, y = y, label = round(y, 2))) +
facet_grid(. ~ z)
Get the means and maxes for each z:
Ys <- df %>% group_by(z) %>% summarise(maxY = max(y))
Xs <- df %>% group_by(z) %>% summarise(meanX = mean(x))
Plot with the geom_text
ggplot(data = df, aes(x, y)) +
geom_line() +
geom_text(data = left_join(Xs,Ys), aes(meanX, maxY, label = meanX)) +
facet_grid(.~z)
Or more succinctly
ggplot(data = df, aes(x, y)) +
geom_line() +
geom_text(data =
df %>%
group_by(z) %>%
summarise(maxY = max(y), meanX = mean(x)),
aes(meanX, maxY, label = meanX)) +
facet_grid(.~z)

Resources