Plot the means of multiple columns - r

I want to show different barplots for the years and gender with the mean values of the variables Q1 to Q5, which should look like a density.
I have data that looks like this:
data <- data.frame(userid = c(1,1,1,2,2,2,3,3,3),
year = c(2013,2014,2015,2013,2014,2015,2013,2014,2015),
gender = c(1,1,1,0,0,0,0,0,0),
Q1 = c(3,2,3,1,0,1,2,1,0),
Q2 = c(4,3,4,2,0,2,1,4,3),
Q3 = c(1,2,1,3,5,4,5,4,5),
Q4 = c(1,2,1,2,4,3,2,2,1),
Q5 = c(1,1,1,2,1,0,0,0,1))
My solution was to filter() for year and gender first and then use summarise(),
to get a vector of the means and put this into the barplot() function:
data %>% filter(gender==1,year==2013) %>% select(-userid,-gender,-year) %>% summarise_all(mean) %>%
as.numeric() %>%
barplot()
Instead of doing this for every combination of year and gender,
is there a more elegant way, using ggplot and facet_wrap()?

I may have misunderstood how you want the plot arranged, but if you want to show the mean score answer per year and gender group, you could do facets like this:
library(tidyverse)
data %>%
pivot_longer(starts_with("Q")) %>%
group_by(year, gender, name) %>%
summarize(value = mean(value)) %>%
ggplot(aes(name, value)) +
geom_col(fill = 'deepskyblue4') +
facet_grid(year ~ gender) +
labs(x = 'Question', y = 'Average score') +
theme_minimal(base_size = 16)

Maybe you want something like this with facet_wrap and geom_col where the mean is calculate using rowMeans like this:
library(dplyr)
library(ggplot2)
data %>%
mutate(mean = rowMeans(select(., starts_with("Q")), na.rm = TRUE)) %>%
ggplot(aes(x = year, y = mean, fill = factor(gender))) +
geom_col() +
labs(x = 'Year', y = 'Mean Q1 to Q5', fill = 'Gender') +
theme_bw() +
facet_wrap(~userid)
Created on 2022-10-28 with reprex v2.0.2

First, pivot your data from wide to long format, group by year, gender, and Q, and summarize to mean values.
library(tidyr)
library(dplyr)
library(ggplot2)
data_long <- data %>%
pivot_longer(Q1:Q5, names_to = "Q", values_to = "value") %>%
group_by(year, gender, Q) %>%
summarize(value = mean(value), .groups = "drop")
data_long
# A tibble: 30 × 4
year gender Q value
<dbl> <dbl> <chr> <dbl>
1 2013 0 Q1 1.5
2 2013 0 Q2 1.5
3 2013 0 Q3 4
4 2013 0 Q4 2
5 2013 0 Q5 1
6 2013 1 Q1 3
7 2013 1 Q2 4
8 2013 1 Q3 1
9 2013 1 Q4 1
10 2013 1 Q5 1
# … with 20 more rows
Then plot using ggplot2::facet_grid().
ggplot(data_long, aes(Q, value)) +
geom_col() +
facet_grid(year ~ gender)

aggregate then barplot.
par(mfrow=c(1, 4))
sapply(unique(data$year), \(x) {
as.matrix(aggregate(cbind(Q1, Q2, Q3, Q4, Q5) ~ gender, data[data$year == x, ], FUN=mean)[-1]) |>
barplot(beside=TRUE, col=c(2, 4), main=x)
})
plot.new()
legend('left', legend=c('m', 'f'), col=c(2, 4), cex=1.2, pch=15, bty='n')

This approach does not require you to first calculate the mean, that is handled by stat_summary(), specifying fun = mean.
library(tidyverse)
data <- data.frame(userid = c(1,1,1,2,2,2,3,3,3),
year = c(2013,2014,2015,2013,2014,2015,2013,2014,2015),
gender = c(1,1,1,0,0,0,0,0,0),
Q1 = c(3,2,3,1,0,1,2,1,0),
Q2 = c(4,3,4,2,0,2,1,4,3),
Q3 = c(1,2,1,3,5,4,5,4,5),
Q4 = c(1,2,1,2,4,3,2,2,1),
Q5 = c(1,1,1,2,1,0,0,0,1))
data %>%
select(starts_with("Q")) %>% # Selects each column that starts with "Q"
pivot_longer(cols = everything()) %>% # Pivot to long format
ggplot(aes(x = name, y = value, fill = name)) +
stat_summary(geom = "bar", fun = "mean") + # Geom and function can be changed easily
theme_classic() +
labs(x = "Q", y = "Mean value", fill = NULL)
Created on 2022-10-28 by the reprex package (v2.0.1)

Related

Working with ggalluvial ggsankey library with missing combinations and dropouts

I'm trying to represent the movements of patients between several treatment groups measured in 3 different years. However, there're dropouts where some patients from 1st year are missing in the 2nd year or there are patients in the 2nd year who weren't in the 1st. Same for 3rd year. I have a label called "none" for these combinations, but I don't want it to be in the plot.
An example plot with only 2 years:
EDIT
I have tried with geom_sankey as well (https://rdrr.io/github/davidsjoberg/ggsankey/man/geom_sankey.html).
Although it is more accurate to what I'm looking for. I don't know how to omit the stratum groups without labels (NA). In this case, I'm using my full data, not a dummy example. I can't share it but I can try to create an example if needed. This is the code I've tried:
data = bind_rows(data_2015,data_2017,data_2019) %>%
select(sip, Year, Grp) %>%
mutate(Grp = factor(Grp), Year = factor(Year)) %>%
arrange(sip) %>%
pivot_wider(names_from = Year, values_from = Grp)
df_sankey = data %>% make_long(`2015`,`2017`,`2019`)
ggplot(df_sankey, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node,
color=factor(node) )) +
geom_sankey(flow.alpha = 0.5, node.color = 1) +
geom_sankey_label(size = 3.5, color = 1, fill = "white") +
scale_fill_viridis_d() +
scale_colour_viridis_d() +
theme_sankey(base_size = 16) +
theme(legend.position = "none") + xlab('')
Figure:
Any idea how to omit the missing groups every year as stratum (without omitting them in the alluvium) will be super helpful. Thanks!
Solved! The solution was much easier I though. I'll leave here the solution in case someone else struggles with a similar problem.
Create a wide table of counts per every group / cohort.
# Data with 3 cohorts for years 2015, 2017 and 2019
# Grp is a factor with 3 levels: 1 to 6
# sip is a unique ID
library(tidyverse)
data_wide = data %>%
select(sip, Year, Grp) %>%
mutate(Grp = factor(Grp, levels=c(1:6)), Year = factor(Year)) %>%
arrange(sip) %>%
pivot_wider(names_from = Year, values_from = Grp)
Using ggsankey package we can transform it as the specific type the package expects. There's already an useful function for this.
df_sankey = data %>% make_long(`2015`,`2017`,`2019`)
# The tibble accounts for every change in X axis and Y categorical value (node):
> head(df_sankey)
# A tibble: 6 × 4
x node next_x next_node
<fct> <chr> <fct> <chr>
1 2015 3 2017 2
2 2017 2 2019 2
3 2019 2 NA NA
4 2015 NA 2017 1
5 2017 1 2019 1
6 2019 1 NA NA
Looks like using the pivot_wider() to pass it to make_long() created a situation where each combination for every value was completed, including missings as NA. Drop NA values in 'node' and create the plot.
df_sankey %>% drop_na(node) %>%
ggplot(aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node,
color=factor(node) )) +
geom_sankey(flow.alpha = 0.5, node.color = 1) +
geom_sankey_label(size = 3.5, color = 1, fill = "white") +
scale_fill_viridis_d() +
scale_colour_viridis_d() +
theme_sankey(base_size = 16) +
theme(legend.position = "none") + xlab('')
Solved!

How to create layered line plots looping over a variable

City Year Month Deaths Guns Shootings
Miami 2010 1 69 73800 701
Miami 2010 2 99 85050 738
Miami 2010 3 122 92650 784
Houston 2010 1 98 92100 789
Houston 2010 2 146 103900 799
Houston 2010 3 162 136100 772
For each city, I want to create a layered line plot with the Month and Year on the x-axis and colored lines corresponding to deaths, guns, and shootings. But I don't see how to do this.
So far I tried
df <- shootings %>%
select(city, date, sales, volume, median, listings, inventory) %>%
gather(key = "variable", value = "value", -c(city,date))
head(df)
df<-df[df$city=='Miami',]
ggplot(df, aes(x = date, y = value)) +
geom_line(aes(color = variable, linetype = variable))
And that can give me the layered line plot for one specific city, but is there a way to loop this through all the cities in my dataframe? Also how do I have this show the month and year labeled on the x-axis?
Last...
when I do
library(ggplot2)
library(reshape2)
library(tidyverse)
for (city_name in df$city){
df %>% filter(city == city_name) %>%
pivot_longer(-c(city, year, month, date),
names_to = "Statistic") %>%
ggplot(aes(x = date, y = log(value))) +
geom_line(aes(color = Statistic,
linetype = Statistic,
group = Statistic))
}
I get no output. Why is this?
Using facet_wrap():
library(tidyverse)
df <- read.table(text = "City Year Month Deaths Guns Shootings
Miami 2010 1 69 73800 701
Miami 2010 2 99 85050 738
Miami 2010 3 122 92650 784
Houston 2010 1 98 92100 789
Houston 2010 2 146 103900 799
Houston 2010 3 162 136100 772 ",
header = TRUE)
df %>%
mutate(Month = factor(Month, labels = month.name[1:3])) %>%
pivot_longer(-c(City, Year, Month),
names_to = "Statistic") %>%
ggplot(aes(x = Month, y = value)) +
geom_line(aes(color = Statistic,
linetype = Statistic,
group = Statistic)) +
facet_wrap(City ~ Year)
# The difference between the number of guns and the number of
# death is pretty large; perhaps plot 'log(value)' instead?
df %>%
mutate(Month = factor(Month, labels = month.name[1:3])) %>%
pivot_longer(-c(City, Year, Month),
names_to = "Statistic") %>%
ggplot(aes(x = Month, y = log(value))) +
geom_line(aes(color = Statistic,
linetype = Statistic,
group = Statistic)) +
facet_wrap(~ City)
# For Month and Year on the x-axis, one option would be:
df %>%
mutate(Month = factor(Month, labels = month.abb[1:3])) %>%
mutate(Month_Year = paste(Month, Year)) %>%
mutate(Date = factor(Month_Year,
levels = paste(
month.abb,
2010:2010,
sep = " "
), ordered = TRUE)) %>%
select(-Month_Year) %>%
pivot_longer(-c(City, Year, Month, Date),
names_to = "Statistic") %>%
ggplot(aes(x = Date, y = log(value))) +
geom_line(aes(color = Statistic,
linetype = Statistic,
group = Statistic)) +
facet_wrap(~ City)
Created on 2022-07-03 by the reprex package (v2.0.1)

Dygraphs in R: Plot Ribbon and mean line of different groups

I recently started working dygraphs in R, and wanted to achieve a ribbon line plot with it.
Currently, I have the below ggplot which displays a ribbon (for data from multiple batches over time) and its median for two groups. Below is the code for it.
ggplot(df,
aes(x=variable, y=A, color=`[category]`, fill = `[category]`)) +
stat_summary(geom = "ribbon", alpha = 0.35) +
stat_summary(geom = "line", size = 0.9) +
theme_minimal()+ labs(x="TimeStamp")
I could add the median solid line on the dygraph, but I'm unable to add the ribbon to it. Below is the dygraph and my code for it.
df_Medians<- df%>%
group_by(variable,`[category]`) %>%
summarise(A = median(A[!is.na(A)]))
median <- cbind(as.ts(df_Medians$A))
dygraph(median) %>%
dyRangeSelector()
Is there anyway to plot something similar to the above ggplot on dygraphs? Thanks in advance.
See if the following serves your purpose:
ggplot code (for mean, replace median_se with mean_se in the stat_summary layers):
library(ggplot2)
ggplot(df,
aes(x=variable, y=A, color=category, fill = category)) +
stat_summary(geom = "ribbon", alpha = 0.35, fun.data = median_se) +
stat_summary(geom = "line", size = 0.9, fun.data = median_se) +
theme_minimal()
dygraph code (for mean, replace median_se with mean_se in the summarise step):
library(dplyr)
library(dygraph)
# calculate summary statistics for each category, & spread results out such that each row
# corresponds to one position on the x-axis
df_dygraph <- df %>%
group_by(variable, category) %>%
summarise(data = list(median_se(A))) %>%
ungroup() %>%
tidyr::unnest(data) %>%
mutate(category = as.integer(factor(category))) %>% # optional: standardizes the column
# names for summary stats
tidyr::pivot_wider(id_cols = variable, names_from = category,
values_from = c(ymin, y, ymax))
> head(df_dygraph)
# A tibble: 6 x 7
variable ymin_1 ymin_2 y_1 y_2 ymax_1 ymax_2
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 3817. 2712. 4560. 2918. 5304. 3125.
2 2 3848. 2712. 4564. 2918. 5279. 3125.
3 3 3847. 2826. 4564 2961 5281. 3096.
4 4 3722. 2827. 4331 2962. 4940. 3098.
5 5 3833. 2831. 4570. 2963 5306. 3095.
6 6 3835. 2831. 4572 2964 5309. 3097.
dygraph(df_dygraph, main = "Dygraph title") %>%
dySeries(c("ymin_1", "y_1", "ymax_1"), label = "Category 1") %>%
dySeries(c("ymin_2", "y_2", "ymax_2"), label = "Category 2") %>%
dyRangeSelector()
Code for median counterpart of mean_se:
median_se <- function(x) {
x <- na.omit(x)
se <- sqrt(var(x) / length(x))
med <- median(x)
ggplot2:::new_data_frame(list(y = med,
ymin = med - se,
ymax = med + se),
n = 1)
}
Sample data:
df <- diamonds %>%
select(price, cut) %>%
filter(cut %in% c("Fair", "Ideal")) %>%
group_by(cut) %>%
slice(1:1000) %>%
mutate(variable = rep(seq(1, 50), times = 20)) %>%
ungroup() %>%
rename(A = price, category = cut)

R pivot_longer and ggplot errorbar with two name/key columns

Let's assume we have the following artifical data:
df <- data.frame(Year = c(2015,2016,2017,2018),
GPP_mean = c(1700,1800,1750,1850),
Reco_mean = c(-1700,-1800,-1750,-1850),
GPP_min = c(1600,1700,1650,1750),
GPP_max = c(1800,1900,1850,1950),
Reco_min = c(-1600,-1700,-1650,-1750),
Reco_max = c(-1800,-1900,-1850,-1950))
I'd like to plot bars for each mean value and use the min/max columns for the errorbar.
This is what I've achieved so far:
df %>%
pivot_longer(cols = -Year,
names_to = c("variable", "stats"),
names_sep = "_")
Which gives us:
# A tibble: 24 x 4
Year variable stats value
<dbl> <chr> <chr> <dbl>
1 2015 GPP mean 1700
2 2015 Reco mean -1700
3 2015 GPP min 1600
4 2015 GPP max 1800
5 2015 Reco min -1600
6 2015 Reco max -1800
7 2016 GPP mean 1800
8 2016 Reco mean -1800
9 2016 GPP min 1700
10 2016 GPP max 1900
# … with 14 more rows
So far, so good (I guess?).
From here on, I have no clue of how I can tell ggplot to plot the mean values as the bars and use min/max for the errorbars. Any help appreciated, thanks.
additional solution using tidyverse
library(tidyverse)
out <- df %>%
pivot_longer(-Year, names_sep = "_", names_to = c("index", ".value"))
ggplot(out, aes(Year, mean, fill = index)) +
geom_col() +
geom_errorbar(aes(ymin = min, ymax = max), width = 0.5)
You should stick with your original data frame. There's no need to pivot longer for this:
ggplot(df, aes(Year, GPP_mean)) +
geom_col(fill = "forestgreen", colour = "black") +
geom_errorbar(aes(ymin = GPP_min, ymax = GPP_max), width = 0.5) +
geom_col(aes(y = Reco_mean), fill = "red", colour = "black", position = "dodge") +
geom_errorbar(aes(ymin = Reco_max, ymax = Reco_min), width = 0.5)

Mean across each element of a tibble list-column by group with purrr and dplyr

I'm trying to get used to using tidyverse. I don't know if my data is well suited for using functions like map(). I like the organization of list-columns so I am wondering how to use a combination of group_by(), summarize(), map(), and other functions to get this to work. I know how to use these functions with vector-columns but do not know how to approach this in the case of list-columns.
Sample data:
library(tidyverse)
set.seed(3949)
myList <- replicate(12, sample(1:20, size = 10), simplify = FALSE)
tibble(
group = rep(c("A", "B"), each = 6),
data = myList
)
Each vector in the list-column has ten elements which are values for a given trial. What I would like to do is group the tibble by group and then find the "column" mean and se of the expanded lists. In other words, it's like I'm treating the list columns as a matrix with each row of the tibble bound together. The output will have columns for the group and trials as well so it is in the correct format for ggplot2.
mean se group trial
1 6.000000 1.6329932 A 1
2 12.666667 2.3333333 A 2
3 12.333333 2.8007935 A 3
4 13.833333 1.8150605 A 4
5 8.166667 3.1028661 A 5
6 11.500000 2.9410882 A 6
7 13.666667 2.3758040 A 7
8 6.833333 1.7779514 A 8
9 11.833333 2.3009660 A 9
10 8.666667 1.7061979 A 10
11 8.333333 1.6865481 B 1
12 12.166667 2.6002137 B 2
13 10.000000 2.7080128 B 3
14 11.833333 3.1242777 B 4
15 4.666667 1.2823589 B 5
16 12.500000 3.0413813 B 6
17 6.000000 1.5055453 B 7
18 8.166667 1.6616591 B 8
19 11.000000 2.6708301 B 9
20 13.166667 0.9457507 B 10
Here is how I would normally do something like this:
set.seed(3949)
data.frame(group = rep(c("A", "B"), each = 6)) %>%
cbind(replicate(12, sample(1:20, size = 10)) %>% t()) %>%
split(.$group) %>%
lapply(function(x) data.frame(mean = colMeans(x[ ,2:11]),
se = apply(x[ ,2:11], 2, se))) %>%
do.call(rbind,.) %>%
mutate(group = substr(row.names(.), 1,1),
trial = rep(1:10, 2)) %>%
ggplot(aes(x = trial, y = mean)) +
geom_point() +
geom_line() +
facet_grid(~ group) +
scale_x_continuous(limits = c(1,10), breaks = seq(1, 10, 1)) +
geom_errorbar(aes(ymin = mean-se, ymax = mean+se), color = "black") +
theme_bw()
Is there are cleaner way to do this with the tidyverse functions?
I think that another way is to use nest() and map().
library(tidyverse)
library(plotrix) #For the std.error
# Your second sample dataset
set.seed(3949)
df <- data.frame(group = rep(c("A", "B"), each = 6)) %>%
cbind(replicate(12, sample(1:20, size = 10)) %>% t())
df %>%
nest(-group) %>%
mutate(mean = map(data, ~rowMeans(.)),
se = map(data, ~ plotrix::std.error(t(.))),
trial = map(data, ~ seq(1, nrow(.)))) %>%
unnest(mean, se, trial) %>%
ggplot(aes(x = trial, y = mean)) +
geom_point() +
geom_line() +
facet_grid(~ group) +
geom_errorbar(aes(ymin = mean-se, ymax = mean+se), color = "black") +
theme_bw()

Resources