R pivot_longer and ggplot errorbar with two name/key columns - r

Let's assume we have the following artificial data:
df <- data.frame(Year = c(2015,2016,2017,2018),
GPP_mean = c(1700,1800,1750,1850),
Reco_mean = c(-1700,-1800,-1750,-1850),
GPP_min = c(1600,1700,1650,1750),
GPP_max = c(1800,1900,1850,1950),
Reco_min = c(-1600,-1700,-1650,-1750),
Reco_max = c(-1800,-1900,-1850,-1950))
I'd like to plot bars for each mean value and use the min/max columns for the errorbar.
This is what I've achieved so far:
df %>%
pivot_longer(cols = -Year,
names_to = c("variable", "stats"),
names_sep = "_")
Which gives us:
# A tibble: 24 x 4
Year variable stats value
<dbl> <chr> <chr> <dbl>
1 2015 GPP mean 1700
2 2015 Reco mean -1700
3 2015 GPP min 1600
4 2015 GPP max 1800
5 2015 Reco min -1600
6 2015 Reco max -1800
7 2016 GPP mean 1800
8 2016 Reco mean -1800
9 2016 GPP min 1700
10 2016 GPP max 1900
# … with 14 more rows
So far, so good (I guess?).
From here on, I have no clue of how I can tell ggplot to plot the mean values as the bars and use min/max for the errorbars. Any help appreciated, thanks.

additional solution using tidyverse
library(tidyverse)
# Split every column name at "_": the prefix (GPP/Reco) is stored in `index`,
# while the suffix (mean/min/max) becomes its own column through ".value".
out <- pivot_longer(
  df,
  cols = -Year,
  names_to = c("index", ".value"),
  names_sep = "_"
)
# Bars show the mean of each index; error bars span its min and max columns.
ggplot(out, aes(x = Year, y = mean, fill = index)) +
  geom_col() +
  geom_errorbar(aes(ymin = min, ymax = max), width = 0.5)

You should stick with your original data frame. There's no need to pivot longer for this:
# Plot straight from the wide data frame: the GPP_* columns drive the first
# bar/error-bar pair and the Reco_* columns drive the second pair.
ggplot(df, aes(Year, GPP_mean)) +
geom_col(fill = "forestgreen", colour = "black") +
geom_errorbar(aes(ymin = GPP_min, ymax = GPP_max), width = 0.5) +
geom_col(aes(y = Reco_mean), fill = "red", colour = "black", position = "dodge") +
# In the sample data the Reco values are negative, so Reco_max is the
# numerically lower bound and is therefore mapped to ymin.
geom_errorbar(aes(ymin = Reco_max, ymax = Reco_min), width = 0.5)

Related

Plot the means of multiple columns

I want to show different barplots for the years and gender with the mean values of the variables Q1 to Q5, which should look like a density.
I have data that looks like this:
data <- data.frame(userid = c(1,1,1,2,2,2,3,3,3),
year = c(2013,2014,2015,2013,2014,2015,2013,2014,2015),
gender = c(1,1,1,0,0,0,0,0,0),
Q1 = c(3,2,3,1,0,1,2,1,0),
Q2 = c(4,3,4,2,0,2,1,4,3),
Q3 = c(1,2,1,3,5,4,5,4,5),
Q4 = c(1,2,1,2,4,3,2,2,1),
Q5 = c(1,1,1,2,1,0,0,0,1))
My solution was to filter() for year and gender first and then use summarise(),
to get a vector of the means and put this into the barplot() function:
# Mean of every Q column for one gender/year combination, drawn as a bar plot.
data %>%
  filter(gender == 1, year == 2013) %>%
  select(-userid, -gender, -year) %>%
  # across() replaces the superseded summarise_all().
  summarise(across(everything(), mean)) %>%
  # unlist() keeps the column names, so each bar is labelled Q1..Q5
  # (as.numeric() would drop them).
  unlist() %>%
  barplot()
Instead of doing this for every combination of year and gender,
is there a more elegant way, using ggplot and facet_wrap()?
I may have misunderstood how you want the plot arranged, but if you want to show the mean score answer per year and gender group, you could do facets like this:
library(tidyverse)
# Average each question per year/gender cell, then draw one facet per cell
# (years in rows, genders in columns).
data %>%
  pivot_longer(starts_with("Q")) %>%
  group_by(year, gender, name) %>%
  # Drop the grouping explicitly: this avoids the regrouping message and hands
  # ggplot an ungrouped tibble.
  summarize(value = mean(value), .groups = "drop") %>%
  ggplot(aes(name, value)) +
  geom_col(fill = "deepskyblue4") +
  facet_grid(year ~ gender) +
  labs(x = "Question", y = "Average score") +
  theme_minimal(base_size = 16)
Maybe you want something like this, using facet_wrap and geom_col, where the mean is calculated with rowMeans:
library(dplyr)
library(ggplot2)
data %>%
mutate(mean = rowMeans(select(., starts_with("Q")), na.rm = TRUE)) %>%
ggplot(aes(x = year, y = mean, fill = factor(gender))) +
geom_col() +
labs(x = 'Year', y = 'Mean Q1 to Q5', fill = 'Gender') +
theme_bw() +
facet_wrap(~userid)
Created on 2022-10-28 with reprex v2.0.2
First, pivot your data from wide to long format, group by year, gender, and Q, and summarize to mean values.
library(tidyr)
library(dplyr)
library(ggplot2)
data_long <- data %>%
pivot_longer(Q1:Q5, names_to = "Q", values_to = "value") %>%
group_by(year, gender, Q) %>%
summarize(value = mean(value), .groups = "drop")
data_long
# A tibble: 30 × 4
year gender Q value
<dbl> <dbl> <chr> <dbl>
1 2013 0 Q1 1.5
2 2013 0 Q2 1.5
3 2013 0 Q3 4
4 2013 0 Q4 2
5 2013 0 Q5 1
6 2013 1 Q1 3
7 2013 1 Q2 4
8 2013 1 Q3 1
9 2013 1 Q4 1
10 2013 1 Q5 1
# … with 20 more rows
Then plot using ggplot2::facet_grid().
ggplot(data_long, aes(Q, value)) +
geom_col() +
facet_grid(year ~ gender)
aggregate then barplot.
# One panel per year plus a fourth panel that only holds the legend.
# par() returns the previous settings so they can be restored afterwards.
old_par <- par(mfrow = c(1, 4))
# A plain loop is used because the barplots are drawn purely for their side
# effect; sapply() would also collect and print the bar midpoints.
for (yr in unique(data$year)) {
  # Mean of Q1..Q5 per gender for this year; [-1] drops the gender column so
  # that only the means are plotted.
  means <- aggregate(cbind(Q1, Q2, Q3, Q4, Q5) ~ gender,
                     data[data$year == yr, ], FUN = mean)[-1]
  barplot(as.matrix(means), beside = TRUE, col = c(2, 4), main = yr)
}
plot.new()
legend("left", legend = c("m", "f"), col = c(2, 4), cex = 1.2, pch = 15, bty = "n")
par(old_par)
This approach does not require you to first calculate the mean, that is handled by stat_summary(), specifying fun = mean.
library(tidyverse)
data <- data.frame(userid = c(1,1,1,2,2,2,3,3,3),
year = c(2013,2014,2015,2013,2014,2015,2013,2014,2015),
gender = c(1,1,1,0,0,0,0,0,0),
Q1 = c(3,2,3,1,0,1,2,1,0),
Q2 = c(4,3,4,2,0,2,1,4,3),
Q3 = c(1,2,1,3,5,4,5,4,5),
Q4 = c(1,2,1,2,4,3,2,2,1),
Q5 = c(1,1,1,2,1,0,0,0,1))
data %>%
select(starts_with("Q")) %>% # Selects each column that starts with "Q"
pivot_longer(cols = everything()) %>% # Pivot to long format
ggplot(aes(x = name, y = value, fill = name)) +
stat_summary(geom = "bar", fun = "mean") + # Geom and function can be changed easily
theme_classic() +
labs(x = "Q", y = "Mean value", fill = NULL)
Created on 2022-10-28 by the reprex package (v2.0.1)

How to set specific date as the beginning date of the year

I want to plot the average annual value of the stream flow data using
WATER YEAR which starts at October and ends at September (say 10/01/1983 to 09/30/1984, this is defined as 1984 water year)
I tried to find solutions elsewhere but I have failed.
Now I'm using the following script to plot the annual average flow
library(tidyverse)
library(lubridate)
library(ggplot2)
#df <- read_csv('dataframe.csv')
df <- df %>%
  # Refer to the column directly; `df$date` would bypass the data mask.
  mutate(date = mdy(date))
# Collapse the daily flow to one mean value per calendar year.
df <- df %>%
  mutate(year = floor_date(date, "year")) %>%
  group_by(year) %>%
  summarize(avg = mean(flow))
y <- df$avg
# floor_date() already yields Date values, so no as.Date() conversion is needed
# (the former format string "Y" was not a valid "%Y" specification anyway).
x <- df$year
d <- data.frame(x = x, y = y)
# interpolate values from zero to y and create corresponding number of x values
vals <- lapply(d$y, function(y) seq(0, y, by = 0.1))
y <- unlist(vals)
mid <- rep(d$x, lengths(vals))
d2 <- data.frame(x = mid - 100,
xend = mid + 100,
y = y,
yend = y)
ggplot(data = d2, aes(x = x, xend = xend, y = y, yend = yend, color = y)) +
geom_segment(size = 2) +
scale_color_gradient2(low = "midnightblue", mid = "deepskyblue", high = "aquamarine",
midpoint = max(d2$y)/2)+
scale_x_date(date_breaks = "1 year",date_labels = "%Y", expand = c(0,0)) +
theme(axis.text.x = element_text(angle=90, vjust=.5))+
labs(x = "Years", y = "Mean Annual Flow (cms)")+
ggtitle("Mean Annual Flow, Rancho River at ELdorado (1983-2020)")+
theme(plot.title = element_text(hjust = 0.5))
For this I got the following results using calendar year
If I used Water Year there will be no results for 1983
The data frame can be found in the following link
https://drive.google.com/file/d/11PVub9avzMFhUz02cHfceGh9DrlVQDbD/view?usp=sharing
Kindly assist.
If a date falls on or after 10/01 of its calendar year, it belongs to the next year (in water years):
# A water year runs from October to September and is named after the calendar
# year in which it ends, so dates from October onwards count towards the
# following year. Comparing the month avoids building and re-parsing a
# "10/01/<year>" string for every row.
df %>%
  mutate(
    date = mdy(date),
    year = year(date) + (month(date) >= 10)
  )
# A tibble: 5,058 x 3
date flow year
<date> <dbl> <dbl>
1 1983-10-01 3.31 1984
2 1983-10-02 3.19 1984
3 1983-10-03 3.7 1984
4 1983-10-04 3.83 1984
5 1983-10-05 3.44 1984
6 1983-10-06 4.37 1984
7 1983-10-07 6.78 1984
8 1983-10-08 6.3 1984
9 1983-10-09 6.46 1984
10 1983-10-10 6.62 1984
# … with 5,048 more rows

Dygraphs in R: Plot Ribbon and mean line of different groups

I recently started working with dygraphs in R, and wanted to achieve a ribbon line plot with it.
Currently, I have the below ggplot which displays a ribbon (for data from multiple batches over time) and its median for two groups. Below is the code for it.
ggplot(df,
aes(x=variable, y=A, color=`[category]`, fill = `[category]`)) +
stat_summary(geom = "ribbon", alpha = 0.35) +
stat_summary(geom = "line", size = 0.9) +
theme_minimal()+ labs(x="TimeStamp")
I could add the median solid line on the dygraph, but I'm unable to add the ribbon to it. Below is the dygraph and my code for it.
df_Medians<- df%>%
group_by(variable,`[category]`) %>%
summarise(A = median(A[!is.na(A)]))
median <- cbind(as.ts(df_Medians$A))
dygraph(median) %>%
dyRangeSelector()
Is there any way to plot something similar to the above ggplot with dygraphs? Thanks in advance.
See if the following serves your purpose:
ggplot code (for mean, replace median_se with mean_se in the stat_summary layers):
library(ggplot2)
ggplot(df,
aes(x=variable, y=A, color=category, fill = category)) +
stat_summary(geom = "ribbon", alpha = 0.35, fun.data = median_se) +
stat_summary(geom = "line", size = 0.9, fun.data = median_se) +
theme_minimal()
dygraph code (for mean, replace median_se with mean_se in the summarise step):
library(dplyr)
library(dygraphs) # the package is called "dygraphs"; library(dygraph) fails
# Calculate summary statistics for each category, and spread the results out
# so that each row corresponds to one position on the x-axis.
# median_se() is the helper defined further below.
df_dygraph <- df %>%
  group_by(variable, category) %>%
  summarise(data = list(median_se(A))) %>%
  ungroup() %>%
  tidyr::unnest(data) %>%
  # optional: standardizes the column names for summary stats
  mutate(category = as.integer(factor(category))) %>%
  tidyr::pivot_wider(id_cols = variable, names_from = category,
                     values_from = c(ymin, y, ymax))
> head(df_dygraph)
# A tibble: 6 x 7
variable ymin_1 ymin_2 y_1 y_2 ymax_1 ymax_2
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 3817. 2712. 4560. 2918. 5304. 3125.
2 2 3848. 2712. 4564. 2918. 5279. 3125.
3 3 3847. 2826. 4564 2961 5281. 3096.
4 4 3722. 2827. 4331 2962. 4940. 3098.
5 5 3833. 2831. 4570. 2963 5306. 3095.
6 6 3835. 2831. 4572 2964 5309. 3097.
dygraph(df_dygraph, main = "Dygraph title") %>%
dySeries(c("ymin_1", "y_1", "ymax_1"), label = "Category 1") %>%
dySeries(c("ymin_2", "y_2", "ymax_2"), label = "Category 2") %>%
dyRangeSelector()
Code for median counterpart of mean_se:
# Median counterpart of ggplot2::mean_se().
#
# x: numeric vector; missing values are removed before computing.
# Returns a one-row data frame with columns `y` (the median), `ymin` and
# `ymax` (the median minus/plus one standard error of the mean).
median_se <- function(x) {
  x <- na.omit(x)
  # Standard error of the mean, used as the half-width around the median.
  se <- sqrt(var(x) / length(x))
  med <- median(x)
  # Plain data.frame() instead of the unexported ggplot2:::new_data_frame():
  # internal API may change or disappear between ggplot2 versions.
  data.frame(y = med, ymin = med - se, ymax = med + se)
}
Sample data:
df <- diamonds %>%
select(price, cut) %>%
filter(cut %in% c("Fair", "Ideal")) %>%
group_by(cut) %>%
slice(1:1000) %>%
mutate(variable = rep(seq(1, 50), times = 20)) %>%
ungroup() %>%
rename(A = price, category = cut)

R - (ggplot2 library) - Legends not showing on graphs

What I'm doing
I'm using a library for R called ggplot2, which allows for a lot of different options for creating graphics and other things. I'm using that to display two different data sets on one graph with different colours for each set of data I want to display.
The Problem
I'm also trying to get a legend to show up in my graph that will tell the user which set of data corresponds to which colour. So far, I've not been able to get it to show.
What I've tried
I've set its position to top/bottom/left/right to make sure nothing was setting its position to none by default, which would have hidden it.
The Code
# PDF/Plot generation
pdf("activity-plot.pdf")
ggplot(data.frame("Time"=times), aes(x=Time)) +
#Data Set 1
geom_density(fill = "#1A3552", colour = "#4271AE", alpha = 0.8) +
geom_text(x=mean(times)-1, y=max(density(times)$y/2), label="Mean {1} Activity", angle=90, size = 4) +
geom_vline(aes(xintercept=mean(times)), color="cyan", linetype="dashed", size=1, alpha = 0.5) +
# Data Set 2
geom_density(data=data.frame("Time"=timesSec), fill = "gray", colour = "orange", alpha = 0.8) +
geom_text(x=mean(timesSec)-1, y=max(density(timesSec)$y/2), label="Mean {2} Activity", angle=90, size = 4) +
geom_vline(aes(xintercept=mean(timesSec)), color="orange", linetype="dashed", size=1, alpha = 0.5) +
# Main Graph Info
labs(title="Activity in the past 48 hours", subtitle="From {DATE 1} to {DATE 2}", caption="{LOCATION}") +
scale_x_continuous(name = "Time of Day", breaks=seq(c(0:23))) +
scale_y_continuous(name = "Activity") +
theme(legend.position="top")
dev.off()
Result
As pointed out by @Ben, you should pass the color inside aes() in order for the legend to be displayed.
However, a better way to get a ggplot is to merge your two values "Time" and "Timesec" into a single dataframe and reshape your dataframe into a longer format. Here, to illustrate this, I created this dummy dataframe:
Time = sample(1:24, 200, replace = TRUE)
Timesec = sample(1:24, 200, replace = TRUE)
df <- data.frame(Time, Timesec)
Time Timesec
1 22 23
2 21 9
3 19 9
4 10 6
5 7 24
6 15 9
... ... ...
So, the first step is to reshape your dataframe into a longer format. Here, I'm using pivot_longer function from tidyr package:
library(tidyr)
library(dplyr)
df %>% pivot_longer(everything(), names_to = "var",values_to = "val")
# A tibble: 400 x 2
var val
<chr> <int>
1 Time 22
2 Timesec 23
3 Time 21
4 Timesec 9
5 Time 19
6 Timesec 9
7 Time 10
8 Timesec 6
9 Time 7
10 Timesec 24
# … with 390 more rows
To add geom_vline and geom_text based on the mean of your values, an easy way is to create a second dataframe that gathers the mean and the maximal density values needed for the plot:
library(tidyr)
library(dplyr)
df_lab <- df %>% pivot_longer(everything(), names_to = "var",values_to = "val") %>%
group_by(var) %>%
summarise(Mean = mean(val),
Density = max(density(val)$y))
# A tibble: 2 x 3
var Mean Density
<chr> <dbl> <dbl>
1 Time 11.6 0.0555
2 Timesec 12.1 0.0517
So, using df and df_lab, you can generate your entire plot. Here, we passed color and fill arguments into the aes and use scale_color_manual and scale_fill_manual to set appropriate colors:
library(dplyr)
library(tidyr)
library(ggplot2)
# Reshape to long format so that `var` can be mapped to fill/colour; mapping
# inside aes() is what makes ggplot draw a legend.
df %>%
  pivot_longer(everything(), names_to = "var", values_to = "val") %>%
  ggplot(aes(x = val, fill = var, colour = var)) +
  geom_density(alpha = 0.8) +
  scale_color_manual(values = c("#4271AE", "orange")) +
  scale_fill_manual(values = c("#1A3552", "gray")) +
  # The mean markers and labels come from df_lab, so they must not inherit the
  # aesthetics of the density layer.
  geom_vline(
    data = df_lab,
    aes(xintercept = Mean, color = var),
    linetype = "dashed", size = 1,
    inherit.aes = FALSE, show.legend = FALSE
  ) +
  geom_text(
    data = df_lab,
    aes(x = Mean - 0.5, y = Density / 2, label = var, color = var),
    angle = 90,
    inherit.aes = FALSE, show.legend = FALSE
  ) +
  labs(title="Activity in the past 48 hours", subtitle="From {DATE 1} to {DATE 2}", caption="{LOCATION}") +
  # seq(c(0:23)) silently evaluated to 1:24 (seq() on a longer vector acts like
  # seq_along()); the breaks are spelled out to match the 1-24 sample data.
  scale_x_continuous(name = "Time of Day", breaks = 1:24) +
  scale_y_continuous(name = "Activity") +
  theme(legend.position = "top")
Does it answer your question ?

Barplot for four variables side by side for each month (January to December)

I am a starter in R and would like to plot a bar chart of my rainfall and solar radiation data of two years side by side from January to December (attached data).
data to plot:
I am trying to plot the first row (January) but I am getting this error
Error in -0.01 * height : non-numeric argument to binary operator
How do I deal with that, and which script should I use to get my data plotted?
Regards,
Here is an example
library(tidyverse)
set.seed(123456)
df <- data.frame(Month = month.abb,
R_2014 = runif(n = 12, min = 0, max = 195),
R_2015 = runif(n = 12, min = 0, max = 295),
S_2014 = runif(n = 12, min = 3, max = 10),
S_2015 = runif(n = 12, min = 4, max = 10))
df
#> Month R_2014 R_2015 S_2014 S_2015
#> 1 Jan 155.56794 267.06645 6.344445 9.714178
#> 2 Feb 146.94519 259.85035 7.903533 9.229704
#> 3 Mar 76.29486 293.18178 9.159223 8.272923
#> 4 Apr 66.60356 264.30712 9.144556 7.632427
#> 5 May 70.45235 259.19979 8.977157 5.352593
#> 6 Jun 38.67722 58.29370 4.161913 8.437571
#> 7 Jul 104.29730 98.82311 6.660781 9.373255
#> 8 Aug 18.82262 229.27586 9.083897 5.766779
#> 9 Sep 192.63015 47.08010 4.618097 7.092115
#> 10 Oct 32.67605 23.79035 3.833566 6.607897
#> 11 Nov 155.60788 39.13185 8.767659 7.450991
#> 12 Dec 115.78983 50.71209 3.561939 8.445736
# convert from wide to long format
# separate columns to get variable and year
# NOTE(review): gather() is superseded in tidyr; pivot_longer() is the current
# equivalent, but it would change the row order shown in the output below.
df_long <- df %>%
gather(key, value, -Month) %>%
# split the former column names (e.g. "R_2014") at "_" into variable and Year
separate(key, into = c("variable", "Year"), "_") %>%
# fix the factor levels to month.abb so the months keep calendar order
mutate(Month = factor(Month, levels = month.abb))
head(df_long)
#> Month variable Year value
#> 1 Jan R 2014 155.56794
#> 2 Feb R 2014 146.94519
#> 3 Mar R 2014 76.29486
#> 4 Apr R 2014 66.60356
#> 5 May R 2014 70.45235
#> 6 Jun R 2014 38.67722
# facet by year
plt1 <- ggplot(df_long, aes(x = Month, y = value, fill = variable)) +
geom_col(position = "dodge") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
facet_wrap(~ Year)
plt1
# facet by variable
plt2 <- ggplot(df_long, aes(x = Month, y = value, fill = Year)) +
geom_col(position = "dodge") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
facet_wrap( ~ variable, scales = "free_y")
plt2
Created on 2018-06-01 by the reprex package (v0.2.0).

Resources