ggplot and dplyr showing standard error - r

I have a plot where I plot multiple lines and for each point I'd like to show the error bars. I need to use
geom_errorbar(aes(ymax=ymax, ymin=ymin), width=0.25) + xlab('points')
My question is how best to create the ymax and ymin columns.
Currently the data frame looks like this
data1 <- data.frame(
group=c("A","A","A","A","B","B","B","B"),
x= c(1,2,3,4,5,6,7,8),
y = c(1,2,3,4,5,6,7,8),
z= c(10,20,30,40,50,60,70,80)) # sample data matrix
data2 = as.data.frame (data1 %>% group_by( group ) %>%
summarise(
MU_Y= mean(y),
upper_limit_Y =MU_Y+(1.96*sd(y, na.rm = TRUE)/sqrt(sum(!is.na(y)))),
lower_limit_Y = MU_Y-(1.96*sd(y, na.rm = TRUE)/sqrt(sum(!is.na(y)))),
MU_Z= mean(z),
upper_limit_Z =MU_Z+(1.96*sd(z, na.rm = TRUE)/sqrt(sum(!is.na(z)))),
lower_limit_Z = MU_Z-(1.96*sd(z, na.rm = TRUE)/sqrt(sum(!is.na(z))))
) %>%
gather(key =Metric, value = Value ,
#c(MU_Y,lower_limit_Y,upper_limit_Y,MU_Z, upper_limit_Z,lower_limit_Z) )
c(MU_Y,MU_Z) )
)
group upper_limit_Y lower_limit_Y upper_limit_Z lower_limit_Z Metric Value
1 A 3.765175 1.234825 37.65175 12.34825 MU_Y 2.5
2 B 7.765175 5.234825 77.65175 52.34825 MU_Y 6.5
3 A 3.765175 1.234825 37.65175 12.34825 MU_Z 25.0
4 B 7.765175 5.234825 77.65175 52.34825 MU_Z 65.0
ggplot(data2, aes(x = group, y= Value, group = Metric ))+
geom_line()+
geom_point()
I need to add 2 new columns to the data frame, ymin and ymax, that hold the appropriate lower/upper limits. The new columns should look like this (I didn't copy all decimal places):
ymin ymax
1.23.. 3.76..
5.23.. 7.76..
12.34.. 37.65..
52... 77.65...
then I'd be able to plot the lines and the error bars for each point.
ggplot(data2, aes(x = group, y= Value, group = Metric ))+
geom_line()+
geom_point() +
geom_errorbar(aes(ymax=ymax, ymin=ymin), width=0.25) +
xlab('points')

I think you have your gather in the wrong place.
# Reshape to long form first so every (group, Metric) pair is summarised by
# the same code, then compute the mean and its 95% confidence limits.
data2 <- data1 %>%
  pivot_longer(
    cols = -c(group, x),
    names_to = "Metric",
    values_to = "Value"
  ) %>%
  group_by(group, Metric) %>%
  summarise(
    # na.rm = TRUE on the mean too, so it agrees with SD and N below.
    MU = mean(Value, na.rm = TRUE),
    SD = sd(Value, na.rm = TRUE),
    N = sum(!is.na(Value)),
    # 1.96 standard errors on either side, matching the limits the question
    # computes (e.g. 1.23 / 3.76 for group A, metric y).
    upper_limit = MU + 1.96 * SD / sqrt(N),
    lower_limit = MU - 1.96 * SD / sqrt(N),
    # Return an ungrouped result so later verbs are not silently per-group.
    .groups = "drop"
  )
ggplot(data2, aes(x = group, y= MU, group = Metric ))+
geom_line()+
geom_point() +
geom_errorbar(aes(ymax=upper_limit, ymin=lower_limit), width=0.25) +
xlab('points')
Does this do what you want?

Related

Why isn't my custom errorbar function working in R?

I have tried to make a function to quickly make an error bar based on a grouping factor and a numerical value as defined below:
#### Function ####
# Bar chart of the per-group mean of `y`, with mean +/- sd error bars,
# grouped by `x`. Returns the ggplot object.
#
# NOTE: as written, `x` and `y` are ordinary arguments used by name inside
# the dplyr/ggplot2 calls. Calling with bare column names (x = Species)
# fails with "object 'Species' not found"; calling with full vectors
# (x = iris$Species, y = iris$Petal.Length) runs, but every bar comes out
# the same.
quick.error <- function(data,x,y){
d <- data
plot.d <- d %>%
# Unless `data` has a column literally named `x`, this `x` is the function
# argument rather than the column the caller meant.
mutate(x = as.factor(x)) %>%
group_by(x) %>%
summarise(
# Same caveat for `y`: it is not looked up as the caller's column.
sd = sd(y, na.rm = TRUE),
mean = mean(y, na.rm=TRUE)
) %>%
ggplot(aes(x,
mean,
fill=x)) +
geom_col(color = "black") +
# Error bars span one standard deviation either side of the mean.
geom_errorbar(aes(ymin = mean-sd,
ymax = mean+sd),
width = 0.2) +
theme(legend.position = "none")
return(plot.d)
}
However, when I try to run this with the iris dataset:
#### Test ####
quick.error(data=iris,
x=Species,
y=Petal.Length)
This gives me an error:
Error in `mutate()`:
! Problem while computing `x = as.factor(x)`.
Caused by error in `is.factor()`:
! object 'Species' not found
Running it explicitly with $ operators gives me a different issue:
#### Test ####
quick.error(data=iris,
x=iris$Species,
y=iris$Petal.Length)
As you can see here, it has made all the bars the same, I assume because it did not group the mean like it was supposed to:
How do I fix this problem?
As I indicate in my comment, this is a typical non-standard evaluation problem. Here's a revised function that I believe gives you what you want.
# Bar chart of the per-group mean of `y` with mean +/- sd error bars.
#
# data: a data frame.
# x:    unquoted name of the grouping column (converted to a factor in place).
# y:    unquoted name of the numeric column to summarise.
#
# Returns the ggplot object.
quick.error <- function(data, x, y) {
  # Embracing with {{ }} forwards the caller's column names into dplyr.
  group_stats <- data %>%
    mutate({{ x }} := as.factor({{ x }})) %>%
    group_by({{ x }}) %>%
    summarise(
      sd = sd({{ y }}, na.rm = TRUE),
      mean = mean({{ y }}, na.rm = TRUE)
    )

  # Error bars reach one standard deviation either side of the mean.
  spread_aes <- aes(ymin = mean - sd, ymax = mean + sd)

  ggplot(group_stats, aes({{ x }}, mean, fill = {{ x }})) +
    geom_col(color = "black") +
    geom_errorbar(spread_aes, width = 0.2) +
    theme(legend.position = "none")
}
quick.error(data=iris,
x=Species,
y=Petal.Length)
Passing unquoted column names to a function
... requires injection with the embracing operator {{ or, in more complex cases, the injection operator !!.
For more on that see e.g. this vignette.
Hence you could make your function work by wrapping x and y inside your function in {{:
# Bar chart of the per-group mean of `y` with mean +/- sd error bars.
#
# data: a data frame.
# x:    unquoted name of the grouping column; it is copied into a factor
#       column that is always called `x`.
# y:    unquoted name of the numeric column to summarise.
#
# Returns the ggplot object.
quick.error <- function(data, x, y) {
  # {{ }} injects the caller's column names; the result column is named `x`.
  with_factor <- mutate(data, x = as.factor({{ x }}))
  by_level <- group_by(with_factor, x)
  level_stats <- summarise(
    by_level,
    sd = sd({{ y }}, na.rm = TRUE),
    mean = mean({{ y }}, na.rm = TRUE)
  )

  bars <- geom_col(color = "black")
  whiskers <- geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.2
  )

  ggplot(level_stats, aes(x, mean, fill = x)) +
    bars +
    whiskers +
    theme(legend.position = "none")
}
library(ggplot2)
library(dplyr)
quick.error(
data = iris,
x = Species,
y = Petal.Length
)

In ggplot how do I plot the mean line for two groups in a scatterplot

I would like to show the mean of two groups in a scatterplot. I have sorted the data so the groups are next to each other. Group 1 is the first 11 records and group 2 is the next 133. How can I tell ggplot to draw one line across the range for the first group (House 1-11) and a second line for the second (House 12-144)?
Here is what I have so far:
And the code is here:
library(tidyverse)
library(tidymodels)
data(ames)
ames <- AmesHousing::make_ames()
set.seed(1)
split <- initial_split(ames, prop = 0.95, strata = "Sale_Price")
ames_plot <- testing(split)
model1 <- lm(Sale_Price ~ Central_Air, data = ames_plot)
p1 <- model1 %>%
broom::augment() %>%
arrange(Central_Air) %>%
mutate(House = row_number()) %>%
ggplot(aes(House, Sale_Price, color = Central_Air)) +
geom_point(size = 1, alpha = 0.3) +
geom_segment(aes(x = 1, y = .fitted, xend = 144, yend =.fitted)) +
scale_y_continuous(labels = scales::dollar)
p1
Using geom_smooth(formula = 'y ~ x', se = FALSE, method = "lm") instead of geom_segment() gets me close to what I want, but I want to show the actual predicted values coming from the lm().
It would be best just to summarize your data for that layer. For example
# Scatter of sale price by house index, plus one horizontal segment per
# Central_Air level drawn at that level's mean fitted value.
model1 %>%
  broom::augment() %>%
  arrange(Central_Air) %>%
  mutate(House = row_number()) %>%
  ggplot(aes(House, Sale_Price, color = Central_Air)) +
  geom_point(size = 1, alpha = .3) +
  geom_segment(
    # The layer gets its own one-row-per-level data: the first and last house
    # index of each level give the segment's horizontal extent.
    data = function(houses) {
      summarize(
        group_by(houses, Central_Air),
        seg_start = first(House),
        seg_end = last(House),
        .fitted = mean(.fitted),
        .groups = "drop_last"
      )
    },
    aes(x = seg_start, y = .fitted, xend = seg_end, yend = .fitted)
  ) +
  scale_y_continuous(labels = scales::dollar)

Dygraphs in R: Plot Ribbon and mean line of different groups

I recently started working with dygraphs in R, and wanted to achieve a ribbon line plot with it.
Currently, I have the below ggplot which displays a ribbon (for data from multiple batches over time) and its median for two groups. Below is the code for it.
ggplot(df,
aes(x=variable, y=A, color=`[category]`, fill = `[category]`)) +
stat_summary(geom = "ribbon", alpha = 0.35) +
stat_summary(geom = "line", size = 0.9) +
theme_minimal()+ labs(x="TimeStamp")
I could add the median solid line on the dygraph, but I'm unable to add the ribbon to it. Below is the dygraph and my code for it.
df_Medians<- df%>%
group_by(variable,`[category]`) %>%
summarise(A = median(A[!is.na(A)]))
median <- cbind(as.ts(df_Medians$A))
dygraph(median) %>%
dyRangeSelector()
Is there any way to plot something similar to the above ggplot with dygraphs? Thanks in advance.
See if the following serves your purpose:
ggplot code (for mean, replace median_se with mean_se in the stat_summary layers):
library(ggplot2)
ggplot(df,
aes(x=variable, y=A, color=category, fill = category)) +
stat_summary(geom = "ribbon", alpha = 0.35, fun.data = median_se) +
stat_summary(geom = "line", size = 0.9, fun.data = median_se) +
theme_minimal()
dygraph code (for mean, replace median_se with mean_se in the summarise step):
library(dplyr)
# The package providing dygraph() is called "dygraphs" (plural).
library(dygraphs)
# Calculate summary statistics for each category, and spread the results out
# so that each row corresponds to one position on the x-axis.
df_dygraph <- df %>%
  group_by(variable, category) %>%
  # median_se() returns a one-row data frame; wrap it in a list column so it
  # can be unnested into ymin / y / ymax columns.
  summarise(data = list(median_se(A))) %>%
  ungroup() %>%
  tidyr::unnest(data) %>%
  # Optional: replace category labels with 1, 2, ... so the widened column
  # names for the summary stats are standardized (ymin_1, y_1, ymax_1, ...).
  mutate(category = as.integer(factor(category))) %>%
  tidyr::pivot_wider(
    id_cols = variable,
    names_from = category,
    values_from = c(ymin, y, ymax)
  )
> head(df_dygraph)
# A tibble: 6 x 7
variable ymin_1 ymin_2 y_1 y_2 ymax_1 ymax_2
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 3817. 2712. 4560. 2918. 5304. 3125.
2 2 3848. 2712. 4564. 2918. 5279. 3125.
3 3 3847. 2826. 4564 2961 5281. 3096.
4 4 3722. 2827. 4331 2962. 4940. 3098.
5 5 3833. 2831. 4570. 2963 5306. 3095.
6 6 3835. 2831. 4572 2964 5309. 3097.
dygraph(df_dygraph, main = "Dygraph title") %>%
dySeries(c("ymin_1", "y_1", "ymax_1"), label = "Category 1") %>%
dySeries(c("ymin_2", "y_2", "ymax_2"), label = "Category 2") %>%
dyRangeSelector()
Code for median counterpart of mean_se:
# Median counterpart of ggplot2::mean_se().
#
# x:    numeric vector; NA values are dropped before anything is computed.
# mult: number of standard errors to extend on either side of the median.
#       The default of 1 reproduces the single-argument behaviour.
#
# Returns a one-row data frame with columns y (the median), ymin and ymax,
# suitable for stat_summary(fun.data = median_se).
median_se <- function(x, mult = 1) {
  x <- stats::na.omit(x)
  # Standard error of the mean, used as the half-width around the median.
  se <- mult * sqrt(stats::var(x) / length(x))
  med <- stats::median(x)
  # Built with base data.frame(): ggplot2:::new_data_frame() is an unexported
  # internal helper and is not available in every ggplot2 release.
  data.frame(y = med, ymin = med - se, ymax = med + se)
}
Sample data:
df <- diamonds %>%
select(price, cut) %>%
filter(cut %in% c("Fair", "Ideal")) %>%
group_by(cut) %>%
slice(1:1000) %>%
mutate(variable = rep(seq(1, 50), times = 20)) %>%
ungroup() %>%
rename(A = price, category = cut)

show 2 standard deviations on a ggplot2 control chart (in addition to the normal 3)

First I create the data:
library(ggplot2)
library(ggQC)
set.seed(5555)
Golden_Egg_df <- data.frame(month=1:12, egg_diameter = rnorm(n = 12, mean = 1.5, sd = 0.2))
Then I setup the base ggplot.
XmR_Plot <- ggplot(Golden_Egg_df, aes(x = month, y = egg_diameter)) +
geom_point() + geom_line()
I can create a simple control chart with the ggQC package, in the following manner.
XmR_Plot + stat_QC(method = "XmR")
I can facet the control chart to show different levels of standard deviation (in this example, between 1-3).
XmR_Plot + stat_qc_violations(method = "XmR")
What I want is to be able to see both 2 and 3 standard deviations on the same chart, not faceted. My imagined syntax would be
XmR_Plot + stat_QC(method = "XmR", stand.dev = c(2, 3))
or something like that. But it obviously does not work, how do I get multiple standard deviations to show on 1 chart? It'd look something like this:
[
I highly recommend calculating your summary statistics yourself. You'll get a lot more control over the plot!
library(ggplot2)
library(dplyr)
library(tidyr)
set.seed(5555)
golden.egg.df = data.frame(month=1:12,
egg_diameter = rnorm(n = 12,
mean = 1.5,
sd = 0.2)
)
# One row per horizontal reference line: the mean and the mean +/- 1, 2 and
# 3 standard deviations of egg_diameter, in tall (stat, value) form.
lines.df <- golden.egg.df %>%
  # Collapse straight to a single row of summary statistics
  summarise(
    mean = mean(egg_diameter),
    sd = sd(egg_diameter),
    plus_one = mean + sd,
    plus_two = mean + 2 * sd,
    plus_three = mean + 3 * sd,
    minus_one = mean - sd,
    minus_two = mean - 2 * sd,
    minus_three = mean - 3 * sd
  ) %>%
  # sd itself is not drawn, only the lines derived from it
  select(-sd) %>%
  # Make the table tall for plotting
  gather(key = stat, value = value) %>%
  # Strip everything up to the underscore, leaving how many SDs the line is
  # from the mean ("one", "two", "three"); "mean" has no underscore and is
  # kept as is
  mutate(linetype = gsub("[\\s\\S]+?_", "", stat, perl = TRUE))
ggplot(golden.egg.df,
aes(x = month, y = egg_diameter)) +
geom_hline(data = lines.df,
aes(yintercept = value, linetype = linetype)) +
geom_point() +
geom_line()

Changing line color in ggplot based on "several factors" slope

UPDATED:
I have the following data, for which I would like to draw a line between the groups, based on the slope across the 3 factors ("I", "II", "III").
set.seed(205)
dat = data.frame(t=rep(c("I","II","III"), each=10),
pairs=rep(1:10,3),
value=rnorm(30),
group=rep(c("A","B"), 15))
I have tried the following, but I cannot manage to change the color of the lines connecting "I" - "III" and "II" - "III":
# Colour each pair's line by the sign of its I -> II slope. Adjacent levels
# of `t` sit one unit apart on the x axis, so the slope is simply the
# difference in value. (The earlier form divided by value[t == "II"] and then
# subtracted value[t == "I"] because of misplaced parentheses.)
ggplot(
  dat %>%
    group_by(pairs) %>%
    mutate(slope = value[t == "II"] - value[t == "I"]),
  aes(t, value, group = pairs, linetype = group, colour = slope > 0)
) +
  geom_point() +
  geom_line()
This is a very similar issue to
Changing line color in ggplot based on slope
I hope I was able to explain my problem.
We can split apart the data, and get what you want:
# Slopes for the I -> II segment: drop level III, then take the difference
# of the two remaining values within each pair.
dat12 <- dat %>%
  filter(t != "III") %>%
  group_by(pairs) %>%
  mutate(slope = diff(value))

# Slopes for the II -> III segment: drop level I, then take the difference
# of the two remaining values within each pair.
dat23 <- dat %>%
  filter(t != "I") %>%
  group_by(pairs) %>%
  mutate(slope = diff(value))
ggplot()+
geom_line(data = dat12, aes(x = t, y = value, group = pairs, colour = slope > 0,
linetype = group))+
geom_line(data = dat23, aes(x = t, y = value, group = pairs, colour = slope > 0,
linetype = group))+
theme_bw()
Since the data in dat came sorted by t, I used diff to calculate the slope.

Resources