Add means to histograms by group in ggplot2 - r

I am following this source to do histograms by group in ggplot2.
The sample data looks like this:
set.seed(3)
x1 <- rnorm(500)
x2 <- rnorm(500, mean = 3)
x <- c(x1, x2)
group <- c(rep("G1", 500), rep("G2", 500))
df <- data.frame(x, group = group)
And the code:
# install.packages("ggplot2")
library(ggplot2)
# Histogram by group in ggplot2
ggplot(df, aes(x = x, fill = group, colour = group)) +
geom_histogram(alpha = 0.5, position = "identity")
I know that adding a line like:
+geom_vline(aes(xintercept=mean(group),color=group,fill=group), col = "red")
Should allow me to get what I am looking for, but I am obtaining just an histogram with one mean, not a mean by group:
Do you have any suggestions?

I would compute the mean into the dataframe:
library(ggplot2)
library(dplyr)
df %>%
group_by(group) %>%
mutate(mean_x = mean(x))
output is:
# A tibble: 1,000 × 3
# Groups: group [2]
x group mean_x
<dbl> <chr> <dbl>
1 -0.962 G1 0.0525
2 -0.293 G1 0.0525
3 0.259 G1 0.0525
4 -1.15 G1 0.0525
5 0.196 G1 0.0525
6 0.0301 G1 0.0525
7 0.0854 G1 0.0525
8 1.12 G1 0.0525
9 -1.22 G1 0.0525
10 1.27 G1 0.0525
# … with 990 more rows
So do:
library(ggplot2)
library(dplyr)
df %>%
group_by(group) %>%
mutate(mean_x = mean(x)) %>%
ggplot(aes(x, fill = group, colour = group)) +
geom_histogram(alpha = 0.5, position = "identity") +
geom_vline(aes(xintercept = mean_x), col = "red")
Output is:

In addition to the previous suggestion, you can also use separately stored group means, i. e. two instead of nrow=1000 highly redundant values:
## a 'tidy' (of several valid ways for groupwise calculation):
group_means <- df %>%
group_by(group) %>%
summarise(group_means = mean(x, na.rm = TRUE)) %>%
pull(group_means)
## ... ggplot code ... +
geom_vline(xintercept = group_means)

A straightforward method without precomputation would be:
ggplot(df, aes(x = x, fill = group, colour = group)) +
geom_histogram(alpha = 0.5, position = "identity") +
geom_vline(xintercept = tapply(df$x, df$group, mean), col = "red")

Related

How to visualize multiple bar plots in one (or splitted) pdf

I'm using the tidyverse-ggplot2 combination to plot multiple bar plots. In one of my comparisons i would like to have even up to 300 single plots. I was wondering if there is a possibility to make sure that the plots will be visible in the pdf file and not look like the attached example
If possible I would prefer to have all the plots in one single pdf file, but if not, also multiple pages will be ok.
The command to plot the bar charts is
common %>%
as_tibble(rownames="gene") %>%
left_join(x= ., y = up[,1:2], by = c("gene" = "ensembl_gene_id") ) %>%
pivot_longer(starts_with("S"), names_to="sample", values_to="counts") %>%
left_join(groups, by="sample") %>%
group_by(mgi_symbol, group, cond, time) %>%
summarize(mean_count=mean(counts)) %>%
ggplot( aes(x = time, y = mean_count, fill=cond)) +
geom_bar(stat = "identity", position = position_dodge(width=0.9) ) +
scale_fill_manual(values=c("darkblue", "lightblue", "black")) +
facet_wrap(~mgi_symbol, scales = "free", ncol = 5) +
theme_bw()
I forgot to add the group table
groups <- tibble(
sample= colnames(normCounts),
group = rep(seq(1, ncol(normCounts)/3), each=3),
cond = rep(c("WT", "GCN2-KO", "GCN1-KO"), each = 12),
time = rep(rep(c("0h", "1h", "4h", "8h"), each=3), times = 3 )
)
thanks
Adding the command with the group_map was as such
common %>%
as_tibble(rownames="gene") %>%
left_join(x= ., y = up[,1:2], by = c("gene" = "ensembl_gene_id") ) %>%
pivot_longer(starts_with("S"), names_to="sample", values_to="counts") %>%
left_join(groups, by="sample") %>%
group_by(mgi_symbol, group, cond, time) %>%
summarize(mean_count=mean(counts)) %>%
group_map(function(g, ...)
ggplot(g, aes(x = time, y = mean_count, fill=cond)) +
geom_bar(stat = "identity", position = position_dodge(width=0.9) ) +
scale_fill_manual(values=c("darkblue", "lightblue", "black")) +
facet_wrap(~mgi_symbol, scales = "free", ncol = 5) +
theme_bw()
)
EDIT
This is how the data looks like in the input table (after summarizing the means)
df <-
common %>%
as_tibble(rownames="gene") %>%
left_join(x= ., y = up[,1:2], by = c("gene" = "ensembl_gene_id") ) %>%
pivot_longer(starts_with("S"), names_to="sample", values_to="counts") %>%
left_join(groups, by="sample") %>%
group_by(mgi_symbol, group, cond, time) %>%
summarize(mean_count=mean(counts)) %>%
ungroup()
df
#>`summarise()` regrouping output by 'mgi_symbol', 'group', 'cond' (override with `.groups` argument)
#> # A tibble: 1,212 x 5
#> mgi_symbol group cond time mean_count
#> <chr> <int> <chr> <chr> <dbl>
#> 1 0610031O16Rik 1 WT 0h 14.4
#> 2 0610031O16Rik 2 WT 1h 30.9
#> 3 0610031O16Rik 3 WT 4h 45.5
#> 4 0610031O16Rik 4 WT 8h 56.0
#> 5 0610031O16Rik 5 GCN2-KO 0h 18.9
#> 6 0610031O16Rik 6 GCN2-KO 1h 39.4
#> 7 0610031O16Rik 7 GCN2-KO 4h 13.9
#> 8 0610031O16Rik 8 GCN2-KO 8h 13.3
#> 9 0610031O16Rik 9 GCN1-KO 0h 12.3
#> 10 0610031O16Rik 10 GCN1-KO 1h 25.3
#> # … with 1,202 more rows
Start with some dummy data. This is the data after you've finished running left_join, pivot_longer, group_by, summarize.
library(tidyverse)
df <- tibble(
time = 1:5,
mean_count = 1:5,
cond = "x"
) %>%
expand_grid(mgi_symbol = c(letters, LETTERS))
Create a column group which represents what page the mgi_symbol belongs on.
plots_per_page <- 20
df <-
df %>%
mutate(group = (dense_rank(mgi_symbol) - 1) %/% plots_per_page)
Create all the plots with group_map.
plots <-
df %>%
group_by(group) %>%
group_map(function(g, ...) {
ggplot(g, aes(x = time, y = mean_count, fill=cond)) +
geom_bar(stat = "identity", position = position_dodge(width=0.9) ) +
scale_fill_manual(values=c("darkblue", "lightblue", "black")) +
facet_wrap(~mgi_symbol, scales = "free", ncol = 5) +
theme_bw()
})
Save as multiple pages using ggpubr
ggpubr::ggexport(
ggpubr::ggarrange(plotlist = plots, nrow = 1, ncol = 1),
filename = "plots.pdf"
)

Dygraphs in R: Plot Ribbon and mean line of different groups

I recently started working dygraphs in R, and wanted to achieve a ribbon line plot with it.
Currently, I have the below ggplot which displays a ribbon (for data from multiple batches over time) and its median for two groups. Below is the code for it.
ggplot(df,
aes(x=variable, y=A, color=`[category]`, fill = `[category]`)) +
stat_summary(geom = "ribbon", alpha = 0.35) +
stat_summary(geom = "line", size = 0.9) +
theme_minimal()+ labs(x="TimeStamp")
I could add the median solid line on the dygraph, but I'm unable to add the ribbon to it. Below is the dygraph and my code for it.
df_Medians<- df%>%
group_by(variable,`[category]`) %>%
summarise(A = median(A[!is.na(A)]))
median <- cbind(as.ts(df_Medians$A))
dygraph(median) %>%
dyRangeSelector()
Is there anyway to plot something similar to the above ggplot on dygraphs? Thanks in advance.
See if the following serves your purpose:
ggplot code (for mean, replace median_se with mean_se in the stat_summary layers):
library(ggplot2)
ggplot(df,
aes(x=variable, y=A, color=category, fill = category)) +
stat_summary(geom = "ribbon", alpha = 0.35, fun.data = median_se) +
stat_summary(geom = "line", size = 0.9, fun.data = median_se) +
theme_minimal()
dygraph code (for mean, replace median_se with mean_se in the summarise step):
library(dplyr)
library(dygraph)
# calculate summary statistics for each category, & spread results out such that each row
# corresponds to one position on the x-axis
df_dygraph <- df %>%
group_by(variable, category) %>%
summarise(data = list(median_se(A))) %>%
ungroup() %>%
tidyr::unnest(data) %>%
mutate(category = as.integer(factor(category))) %>% # optional: standardizes the column
# names for summary stats
tidyr::pivot_wider(id_cols = variable, names_from = category,
values_from = c(ymin, y, ymax))
> head(df_dygraph)
# A tibble: 6 x 7
variable ymin_1 ymin_2 y_1 y_2 ymax_1 ymax_2
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 3817. 2712. 4560. 2918. 5304. 3125.
2 2 3848. 2712. 4564. 2918. 5279. 3125.
3 3 3847. 2826. 4564 2961 5281. 3096.
4 4 3722. 2827. 4331 2962. 4940. 3098.
5 5 3833. 2831. 4570. 2963 5306. 3095.
6 6 3835. 2831. 4572 2964 5309. 3097.
dygraph(df_dygraph, main = "Dygraph title") %>%
dySeries(c("ymin_1", "y_1", "ymax_1"), label = "Category 1") %>%
dySeries(c("ymin_2", "y_2", "ymax_2"), label = "Category 2") %>%
dyRangeSelector()
Code for median counterpart of mean_se:
median_se <- function(x) {
x <- na.omit(x)
se <- sqrt(var(x) / length(x))
med <- median(x)
ggplot2:::new_data_frame(list(y = med,
ymin = med - se,
ymax = med + se),
n = 1)
}
Sample data:
df <- diamonds %>%
select(price, cut) %>%
filter(cut %in% c("Fair", "Ideal")) %>%
group_by(cut) %>%
slice(1:1000) %>%
mutate(variable = rep(seq(1, 50), times = 20)) %>%
ungroup() %>%
rename(A = price, category = cut)

Barplot Indicating the statistically significant difference

I need to draw a bar plot for significant SNP codes (categorical) against the corresponding phenotype, similar to these plots:
I tried many ways in R and got some results but I field to got my favorite result. Here are the codes and results:
### DATA
SNP_code <- as.factor(c("GG","GA","AA","GA","GA","GG","GG","GG","GG","GA","GA","AA","GA","GA","GA","GG","GG","GG","GG","AA","GG","GG","GG","GG","AA","GG","GG","GA","GG","AA","GA","GG","GG","GG","GG","GG","GG","AA","GG","GA","GG","GG","GA","GG","GG","GA","GG","GG","GA","GA","GG","GA","GG","GA","GA","GA","GA","GA","GA","GG","GG","GG","AA","GA","GA","GA","GA","GG","GA","GG","GG","GG","GA","GA","GA","GG","GG","GA","GG","AA","GG","GG","GG","AA"))
EBV <- c(0.06663,-0.03031,-0.122,-0.02021,-0.1157,-0.08131,-0.02034,-0.06324,0.06699,-0.062,0.02736,-0.1201,-0.04846,-0.06934,-0.06023,-0.009244,-0.05648,-0.01908,0.06728,-0.06517,0.08534,0.07618,-0.0814,0.06113,-0.0795,0.1055,0.08305,0.1209,-0.05314,-0.09431,0.05185,0.1347,0.1591,0.08777,0.08326,0.1612,0.09528,-0.1002,0.1561,-0.09327,0.09474,0.1356,0.06384,0.1585,0.03235,0.1081,0.1462,-0.04082,-0.05042,0.01793,-0.1157,-0.1165,-0.009399,-0.02311,-0.108,-0.1143,0.07219,0.01376,-0.05059,-0.052,0.08494,-0.0388,-0.06346,0.07789,0.02961,-0.1126,0.1102,0.133,-0.09317,-0.1181,0.1584,0.122,0.1019,-0.04074,-0.01178,0.09523,-0.03266,-0.01258,-0.0231,-0.08259,0.05823,-0.02894,-0.008242,0.07981)
LS <- c(2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,2,1,2,1,1,2,1,2,2,2,1,1,1,2,2,2,2,2,1,1,2,1,2,2,2,1,2,2,2,1,1,2,1,1,1,1,1,1,1,1,1,1,2,1,1,2,1,1,2,2,1,1,2,1,2,1,1,2,1,1,1,1,1,1,1,2)
IDs <- c(1033,1081,1106,1107,1120,1194,1199,1326,1334,1340,1345,1358,1398,1404,1405,1421,1457,1459,1464,1509,1529,1542,1550,2025,2030,2095,2099,2128,2141,2153,2167,2224,2232,2238,2244,2266,2271,2280,2283,2323,2326,2337,2369,2390,2391,2396,851012,851016,851021,851055,851063,851084,851105,851109,851146,851169,851176,851198,851205,851246,851266,851292,851332,851345,851488,851489,851509,851528,851531,851547,851562,851573,851574,851578,851584,851588,851592,851622,851651,851670,851672,851684,851690,861086)
sig_snp <- data.frame(IDs, SNP_code, EBV, LS)
### Variance analysis and Mean comparison
library(dplyr)
### for LS
group_by(sig_snp, SNP_code) %>%
summarise(
count = n(),
mean = mean(LS, na.rm = TRUE),
sd = sd(LS, na.rm = TRUE))
### for EBV
group_by(sig_snp, SNP_code) %>%
summarise(
count = n(),
mean = mean(EBV, na.rm = TRUE),
sd = sd(EBV, na.rm = TRUE))
# Compute the analysis of variance
Anova.fit <- aov(EBV ~ SNP_code, data = sig_snp)
summary(Anova.fit)
# Tukey multiple pairwise-comparisons
TukeyHSD(Anova.fit)
# or
library(multcomp)
summary(glht(Anova.fit, linfct = mcp(SNP_code = "Tukey")))
### Box plot for EBV (actually I need Barplot for LS and EBV)
library(ggplot2)
library(ggpval)
plot <- ggplot(sig_snp, aes(SNP_code, EBV)) +
geom_boxplot(fill=c("red","blue", "green"), color="black", width=.7); plot
add_pval(plot, pairs = list(c(1, 3)), test='wilcox.test')
add_pval(plot, pairs = list(c(2, 3)), test='wilcox.test')
add_pval(plot, pairs = list(c(1, 2)), test='wilcox.test')
"add_pval" only use "wilcox.test" and "t.test", but I perfer Tukey.
Any help is appreciated.
There is definitely room for improvement of the code that I posted below, but at least it gives you one example of the workflow you can use for getting your "favorite" barplot:
Part A: Barchart
1) We re-organise sig_snp in order to get a dataframe with the mean of each SNP in function of EBV or LS.
library(tidyverse)
DF1 <- sig_snp %>%
pivot_longer(., cols = c(EBV,LS), names_to = "Variable", values_to = "Values") %>%
group_by(SNP_code, Variable) %>%
summarise(Mean = mean(Values),
SEM = sd(Values) / sqrt(n()),
Nb = n()) %>%
rowwise() %>%
mutate(Labels = as.character(SNP_code)) %>%
mutate(Labels = paste(unlist(strsplit(Labels,"")),collapse = "/")) %>%
mutate(Labels = paste0(Labels,"\nn = ",Nb))
# A tibble: 6 x 6
SNP_code Variable Mean SEM Nb Labels
<fct> <chr> <dbl> <dbl> <int> <chr>
1 AA EBV -0.0719 0.0202 9 "A/A\nn = 9"
2 AA LS 1.11 0.111 9 "A/A\nn = 9"
3 GA EBV -0.0141 0.0134 31 "G/A\nn = 31"
4 GA LS 1.23 0.0763 31 "G/A\nn = 31"
5 GG EBV 0.0422 0.0126 44 "G/G\nn = 44"
6 GG LS 1.48 0.0762 44 "G/G\nn = 44"
The labels column will be re-used later for the labeling of x-axis.
2) Then, we are going to calculate the total mean (that will hep to draw the "Mean" bar) by doing:
library(tidyverse)
DF2 <- sig_snp %>%
pivot_longer(., cols = c(EBV,LS), names_to = "Variable", values_to = "Values") %>%
group_by(Variable) %>%
summarise(Mean = mean(Values),
SEM = sd(Values) / sqrt(n()),
Nb = n()) %>%
mutate(SNP_code = "All") %>%
select(SNP_code, Variable, Mean, SEM, Nb) %>%
rowwise() %>%
mutate(Labels = paste0("Mean\nn = ",Nb))
# A tibble: 2 x 6
SNP_code Variable Mean SEM Nb Labels
<chr> <chr> <dbl> <dbl> <int> <chr>
1 All EBV 0.00918 0.00944 84 "Mean\nn = 84"
2 All LS 1.35 0.0522 84 "Mean\nn = 84"
3) we are binding both DF1 and DF2 and we re-organize the levels of SNP_code in order to get the correct plotting order:
library(tidyverse)
DF <- bind_rows(DF1, DF2)
DF$Labels = factor(DF$Labels,levels= c("Mean\nn = 84",
"A/A\nn = 9",
"G/A\nn = 31",
"G/G\nn = 44" ))
4) Now, we can plot it:
library(ggplot2)
ggplot(DF, aes(x = SNP_code, y = Mean, fill = SNP_code))+
geom_bar(stat = "identity", show.legend = FALSE)+
geom_errorbar(aes(ymin = Mean-SEM, ymax = Mean+SEM), width = 0.2)+
facet_wrap(.~Variable, scales = "free")+
scale_x_discrete(name = "",labels = levels(DF$Labels))
Part B: Adding statistic on the barchart
For adding statistic, you can have the use of geom_signif function from ggsignif package that allow to add statistics from an external output.
1) First create the dataframe for the output of Tukey test on EBV:
Anova.fit <- aov(EBV ~ SNP_code, data = sig_snp)
t <- TukeyHSD(Anova.fit)
stat <- t$SNP_code
Stat_EBV <- stat %>% as.data.frame() %>%
mutate(Variable = "EBV") %>%
mutate(Group = rownames(stat)) %>%
rowwise() %>%
mutate(Group1 = unlist(strsplit(Group,"-"))[1]) %>%
mutate(Group2 = unlist(strsplit(Group,"-"))[2]) %>%
mutate(labels = round(`p adj`,4))
Stat_EBV$y_pos <- c(0.06,0.08,0.1)
2) same thing for the Tukey test of LS:
Anova.fit <- aov(LS ~ SNP_code, data = sig_snp)
t <- TukeyHSD(Anova.fit)
stat <- t$SNP_code
Stat_LS <- stat %>% as.data.frame() %>%
mutate(Variable = "LS") %>%
mutate(Group = rownames(stat)) %>%
rowwise() %>%
mutate(Group1 = unlist(strsplit(Group,"-"))[1]) %>%
mutate(Group2 = unlist(strsplit(Group,"-"))[2]) %>%
mutate(labels = round(`p adj`,4))
Stat_LS$y_pos = c(1.7,1.9,2.1)
3) Binding of both stats dataframes:
library(tidyverse)
STAT <- bind_rows(Stat_EBV,Stat_LS)
# A tibble: 6 x 10
diff lwr upr `p adj` Variable Group Group1 Group2 labels y_pos
<dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 0.0578 -0.0130 0.129 0.132 EBV GA-AA GA AA 0.132 0.06
2 0.114 0.0457 0.183 0.000431 EBV GG-AA GG AA 0.0004 0.08
3 0.0563 0.0125 0.100 0.00821 EBV GG-GA GG GA 0.0082 0.1
4 0.115 -0.303 0.532 0.790 LS GA-AA GA AA 0.790 1.7
5 0.366 -0.0373 0.770 0.0832 LS GG-AA GG AA 0.0832 1.9
6 0.251 -0.00716 0.510 0.0585 LS GG-GA GG GA 0.0585 2.1
4) Get the barchart and add the statistic results:
library(ggplot2)
library(ggsignif)
ggplot(DF, aes(x = SNP_code, y = Mean, fill = SNP_code))+
geom_bar(stat = "identity", show.legend = FALSE)+
geom_errorbar(aes(ymin = Mean-SEM, ymax = Mean+SEM), width = 0.2)+
geom_signif(inherit.aes = FALSE, data = STAT,
aes(xmin=Group1, xmax=Group2, annotations=labels, y_position=y_pos),
manual = TRUE)+
facet_wrap(.~Variable, scales = "free")+
scale_x_discrete(name = "",labels = levels(DF$Labels))
I hope it looks what you are expecting.

Add an arrow with slope and error to a geom_point plot

I have xy data from two groups, where each point also has corresponding xend and yend coordinates which indicate where an arrow starting at that point ends:
set.seed(1)
df <- data.frame(x=c(rnorm(50,-1,0.5),rnorm(50,1,0.5)),y=c(rnorm(50,-1,0.5),rnorm(50,1,0.5)),group=c(rep("A",50),rep("B",50)))
df$arrow.x.end <- c(df$x[1:50]+runif(50,0,0.25),df$x[51:100]-runif(50,0,0.25))
df$arrow.y.end <- c(df$y[1:50]+runif(50,0,0.25),df$y[51:100]-runif(50,0,0.25))
The arrows of group A generally point towards group B and vice versa:
library(ggplot2)
ggplot(df,aes(x=x,y=y,color=group))+geom_point()+theme_minimal()+
geom_segment(aes(x=x,y=y,xend=arrow.x.end,yend=arrow.y.end),arrow=arrow())+
theme(legend.position="none")
I'm looking for a way to plot the points with only two arrows, one per each group.
The arrows will start at centroids of each group, will have a slope which is the median slope of each group. Ideally, the arrows will also have the standard errors of the median slope of each group as polygons.
Here's what I doing so far:
library(dplyr)
slope.df <- df %>%
dplyr::group_by(group) %>%
dplyr::mutate(slope=(arrow.y.end-y)/abs((arrow.x.end-x)),length=sqrt((arrow.y.end-y)^2+(arrow.x.end-x)^2)) %>%
dplyr::summarise(slope.median=mean(slope),
slope.median.se=1.2533*(sd(slope)/sqrt(n())),
median.length=median(length),
x.start=median(x),y.start=median(y)) %>%
dplyr::mutate(x.end=x.start+sign(slope.median)*(median.length/sqrt(2))) %>%
dplyr::mutate(y.end=sign(slope.median)*((x.end-x.start)*slope.median))
Computing the slope of each arrow and its length. And then per each group the median slope, standard error of the median slope, and the median length. Right now I'm computing xend and yend of the median arrow as:
median.length^2 <- xend^2 + xend^2
But I use something else.
So plotting this:
ggplot(df,aes(x=x,y=y,color=group))+geom_point()+theme_minimal()+theme(legend.position="none")+
geom_segment(aes(x=x.start,y=y.start,xend=x.end,yend=y.end),arrow=arrow(),data=slope.df)
Gives:
Any advice if there's a better way of doing this and also how to add the standard error polygon?
calculate the mean of x and y for each periode
df2 <- df %>%
select( -c(4,5) ) %>%
mutate( period = 0 ) %>%
rbind( data.frame( x = df$arrow.x.end,
y = df$arrow.y.end,
group = c( rep( "A", 50 ),rep( "B" , 50 ) ),
period = 1)
) %>%
group_by( group, period ) %>%
summarise_all( mean )
# # A tibble: 4 x 4
# # Groups: group [2]
# group period x y
# <fct> <dbl> <dbl> <dbl>
# 1 A 0 -0.950 -1.08
# 2 A 1 -0.816 -0.942
# 3 B 0 1.06 1.04
# 4 B 1 0.940 0.905
plot, use stat_smooth to draw a line throigh the 'mean' of the clouds
ggplot( data = df2, aes( x = x, y = y, colour = group ) ) +
stat_smooth(se = TRUE, method = lm, fullrange = TRUE) +
geom_point( data = df, aes(x = x, y = y, colour = group, fill = group ) ) +
geom_point( data = df, aes(x = arrow.x.end, y = arrow.y.end, colour = group, fill = group), alpha = 0.5 )

Mean across each element of a tibble list-column by group with purrr and dplyr

I'm trying to get used to using tidyverse. I don't know if my data is well suited for using functions like map(). I like the organization of list-columns so I am wondering how to use a combination of group_by(), summarize(), map(), and other functions to get this to work. I know how to use these functions with vector-columns but do not know how to approach this in the case of list-columns.
Sample data:
library(tidyverse)
set.seed(3949)
myList <- replicate(12, sample(1:20, size = 10), simplify = FALSE)
tibble(
group = rep(c("A", "B"), each = 6),
data = myList
)
Each vector in the list-column has ten elements which are values for a given trial. What I would like to do is group the tibble by group and then find the "column" mean and se of the expanded lists. In other words, it's like I'm treating the list columns as a matrix with each row of the tibble bound together. The output will have columns for the group and trials as well so it is in the correct format for ggplot2.
mean se group trial
1 6.000000 1.6329932 A 1
2 12.666667 2.3333333 A 2
3 12.333333 2.8007935 A 3
4 13.833333 1.8150605 A 4
5 8.166667 3.1028661 A 5
6 11.500000 2.9410882 A 6
7 13.666667 2.3758040 A 7
8 6.833333 1.7779514 A 8
9 11.833333 2.3009660 A 9
10 8.666667 1.7061979 A 10
11 8.333333 1.6865481 B 1
12 12.166667 2.6002137 B 2
13 10.000000 2.7080128 B 3
14 11.833333 3.1242777 B 4
15 4.666667 1.2823589 B 5
16 12.500000 3.0413813 B 6
17 6.000000 1.5055453 B 7
18 8.166667 1.6616591 B 8
19 11.000000 2.6708301 B 9
20 13.166667 0.9457507 B 10
Here is how I would normally do something like this:
set.seed(3949)
data.frame(group = rep(c("A", "B"), each = 6)) %>%
cbind(replicate(12, sample(1:20, size = 10)) %>% t()) %>%
split(.$group) %>%
lapply(function(x) data.frame(mean = colMeans(x[ ,2:11]),
se = apply(x[ ,2:11], 2, se))) %>%
do.call(rbind,.) %>%
mutate(group = substr(row.names(.), 1,1),
trial = rep(1:10, 2)) %>%
ggplot(aes(x = trial, y = mean)) +
geom_point() +
geom_line() +
facet_grid(~ group) +
scale_x_continuous(limits = c(1,10), breaks = seq(1, 10, 1)) +
geom_errorbar(aes(ymin = mean-se, ymax = mean+se), color = "black") +
theme_bw()
Is there are cleaner way to do this with the tidyverse functions?
I think that another way is to use nest() and map().
library(tidyverse)
library(plotrix) #For the std.error
# Your second sample dataset
set.seed(3949)
df <- data.frame(group = rep(c("A", "B"), each = 6)) %>%
cbind(replicate(12, sample(1:20, size = 10)) %>% t())
df %>%
nest(-group) %>%
mutate(mean = map(data, ~rowMeans(.)),
se = map(data, ~ plotrix::std.error(t(.))),
trial = map(data, ~ seq(1, nrow(.)))) %>%
unnest(mean, se, trial) %>%
ggplot(aes(x = trial, y = mean)) +
geom_point() +
geom_line() +
facet_grid(~ group) +
geom_errorbar(aes(ymin = mean-se, ymax = mean+se), color = "black") +
theme_bw()

Resources