Using geom_smooth for fitting a glm to fractions - r

This post is somewhat related to this post.
Here I have xy grouped data where y are fractions:
library(dplyr)
library(ggplot2)
library(ggpmisc)
set.seed(1)
df1 <- data.frame(value = c(0.8,0.5,0.4,0.2,0.5,0.6,0.5,0.48,0.52),
age = rep(c("d2","d4","d45"),3),
group = c("A","A","A","B","B","B","C","C","C")) %>%
dplyr::mutate(time = as.integer(age)) %>%
dplyr::arrange(group,time) %>%
dplyr::mutate(group_age=paste0(group,"_",age))
df1$group_age <- factor(df1$group_age,levels=unique(df1$group_age))
What I'm trying to achieve is to plot df1 as a bar plot, like this:
ggplot(df1,aes(x=group_age,y=value,fill=age)) +
geom_bar(stat='identity')
But I want to fit to each group a binomial glm with a logit link function, which estimates how these fractions are affected by time.
Let's say I have 100 observations per each age (time) in each group:
df2 <- do.call(rbind,lapply(1:nrow(df1),function(i){
data.frame(age=df1$age[i],group=df1$group[i],time=df1$time[i],group_age=df1$group_age[i],value=c(rep(T,100*df1$value[i]),rep(F,100*(1-df1$value[i]))))
}))
Then the glm for each group (e.g., group A) is:
glm(value ~ time, dplyr::filter(df2, group == "A"), family = binomial(link='logit'))
So I would like to add to the plot above the estimated regression slopes for each group along with their corresponding p-values (similar to what I'm doing for the continuous df$value in this post).
I thought that using:
ggplot(df1,aes(x=group_age,y=value,fill=age)) +
geom_bar(stat='identity') +
geom_smooth(data=df2,mapping=aes(x=group_age,y=value,group=group),color="black",method='glm',method.args=list(family=binomial(link='logit')),size=1,se=T) +
stat_poly_eq(aes(label=stat(p.value.label)),formula=my_formula,parse=T,npcx="center",npcy="bottom") +
scale_x_log10(name="Age",labels=levels(df$age),breaks=1:length(levels(df$age))) +
facet_wrap(~group) + theme_minimal()
Would work but I get the error:
Error in Math.factor(x, base) : ‘log’ not meaningful for factors
Any idea how to get it right?

I believe this could help:
library(tidyverse)
library(broom)
df2$value <- as.numeric(df2$value)
#Estimate coefs
dfmodel <- df2 %>% group_by(group) %>%
do(fitmodel = glm(value ~ time, data = .,family = binomial(link='logit')))
#Extract coeffs
dfCoef = tidy(dfmodel, fitmodel)
#Create labels
dfCoef %>% filter(term=='(Intercept)') %>% mutate(Label=paste0(round(estimate,3),'(p=',round(p.value,3),')'),
group_age=paste0(group,'_','d4')) %>%
select(c(group,Label,group_age)) -> Labels
#Values
df2 %>% group_by(group,group_age) %>% summarise(value=sum(value)) %>% ungroup() %>%
group_by(group) %>% filter(value==max(value)) %>% select(-group_age) -> values
#Combine
Labels %>% left_join(values) -> Labels
Labels %>% mutate(age=NA) -> Labels
#Plot
ggplot(df2,aes(x=group_age,y=value,fill=age)) +
geom_text(data=Labels,aes(x=group_age,y=value,label=Label),fontface='bold')+
geom_bar(stat='identity')+
facet_wrap(.~group,scales='free')

Thanks to Pedro Aphalo this is nearly a complete solution:
Generate the data.frame with the fractions (here use time as an integer by deleting "d" in age rather than using time as the levels of age):
library(dplyr)
library(ggplot2)
library(ggpmisc)
set.seed(1)
df1 <- data.frame(value = c(0.8,0.5,0.4,0.2,0.5,0.6,0.5,0.48,0.52),
age = rep(c("d2","d4","d45"),3),
group = c("A","A","A","B","B","B","C","C","C")) %>%
dplyr::mutate(time = as.integer(gsub("d","",age))) %>%
dplyr::arrange(group,time) %>%
dplyr::mutate(group_age=paste0(group,"_",age))
df1$group_age <- factor(df1$group_age,levels=unique(df1$group_age))
Inflate df1 to 100 observations per each age in each group but specify value as an integer rather than a binary:
df2 <- do.call(rbind,lapply(1:nrow(df1),function(i){
data.frame(age=df1$age[i],group=df1$group[i],time=df1$time[i],group_age=df1$group_age[i],value=c(rep(1,100*df1$value[i]),rep(0,100*(1-df1$value[i]))))
}))
And now plot it using geom_smooth and stat_fit_tidy:
ggplot(df1,aes(x=time,y=value,group=group,fill=age)) +
geom_bar(stat='identity') +
geom_smooth(data=df2,mapping=aes(x=time,y=value,group=group),color="black",method='glm',method.args=list(family=binomial(link='logit'))) +
stat_fit_tidy(data=df2,mapping=aes(x=time,y=value,group=group,label=sprintf("P = %.3g",stat(x_p.value))),method='glm',method.args=list(formula=y~x,family=binomial(link='logit')),parse=T,label.x="center",label.y="top") +
scale_x_log10(name="Age",labels=levels(df2$age),breaks=unique(df2$time)) +
facet_wrap(~group) + theme_minimal()
Which gives (note that the scale_x_log10 is mainly a cosmetic approach to presenting the x-axis as time rather than levels of age):
The only imperfection is that the p-values seem to appear messed up.

Related

two-panel scatter plot in ggplot2

For my data.frame full below, I'm wondering how to create a two-panel geom_point such that on the first panel, we have ols.(Intercept) (x-axis) plotted against hlm.(Intercept), AND on the second panel, we have ols.ses (x-axis) plotted against hlm.ses?
library(lme4)
library(tidyverse)
hsb <- read.csv('https://raw.githubusercontent.com/rnorouzian/e/master/hsb.csv')
fit <- lmer(math~ses+(ses|sch.id), data= hsb)
ch <- unique(hsb$sch.id)
ols <- map_dfr(ch,~coef(lm(math~ses, data=hsb,subset=sch.id==.)))
mlm <- coef(fit)$sch
full <- cbind(ols=ols, hlm=mlm, sch.id=ch)
head(full, n = 1)
ols.(Intercept) ols.ses hlm.(Intercept) hlm.ses sch.id
1224 10.80513 2.508582 11.06002 2.504083 1224
One approach to achieve this is by making two separate plots and glue them together using e.g. patchwork:
library(lme4)
library(tidyverse)
library(patchwork)
hsb <- read.csv('https://raw.githubusercontent.com/rnorouzian/e/master/hsb.csv')
fit <- lmer(math~ses+(ses|sch.id), data= hsb)
ch <- unique(hsb$sch.id)
ols <- map_dfr(ch,~coef(lm(math~ses, data=hsb,subset=sch.id==.)))
mlm <- coef(fit)$sch
full <- cbind(ols=ols, mlm=mlm, sch.id=ch)
p1 <- ggplot(full, aes(`ols.(Intercept)`, `mlm.(Intercept)`)) +
geom_point()
p2 <- ggplot(full, aes(ols.ses, mlm.ses)) +
geom_point()
p1 + p2
And as a second approach with some data wrangling one can achieve a similar plot using facet_wrap:
library(lme4)
#> Loading required package: Matrix
library(tidyverse)
hsb <- read.csv('https://raw.githubusercontent.com/rnorouzian/e/master/hsb.csv')
fit <- lmer(math~ses+(ses|sch.id), data= hsb)
ch <- unique(hsb$sch.id)
ols <- map_dfr(ch,~coef(lm(math~ses, data=hsb,subset=sch.id==.)))
mlm <- coef(fit)$sch
full <- cbind(ols=ols, mlm=mlm, sch.id=ch)
full %>%
pivot_longer(- sch.id, names_to = "var", values_to = "value") %>%
separate(var, into = c("var1", "category"), sep = "\\.") %>%
pivot_wider(names_from = var1, values_from = value) %>%
ggplot(aes(ols, mlm)) +
geom_point() +
facet_wrap(~ category)
An option with facets. The solution from #stefan was really nice and quick. You could set an entire data pipeline by smartly separating your strings and then after reshaping you can have the desired variables in a format to be plotted using facet_wrap(). Here the code:
library(tidyverse)
#Plot
full %>% select(-sch.id) %>% pivot_longer(everything()) %>%
separate(name,c('V1','V2'),sep='\\.') %>%
arrange(V2,V1) %>%
group_by(V2,V1) %>% mutate(id=row_number()) %>%
pivot_wider(names_from = V1,values_from=value) %>% ungroup() %>%
select(-id) %>%
ggplot(aes(x=ols,y=mlm))+
geom_point()+
facet_wrap(.~V2,nrow = 1,scales = 'free')
Output:
Similar to the answer using patchwork, you can plot them as two separate ggplot() graphs and then put them side-by-side with the plot_grid() function from the cowplot package.
https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html

Ggplot - How to present the mean of a third varience?

Let's say I have this data frame:
The data frame
I want to make a graph which presents for each SES (Social Economy Status) what is the mean income for females and what is the mean income for males.
I have so far this code:
ggplot(incomeSorted, aes(GENDER)) +
scale_y_continuous("Mean")+
geom_bar(position = "dodge")+
facet_wrap("SES")
and this is the output:
How do I make the graph to present the mean of income instead of counting the number of females and males at each category?
Thanks ahead!
If you want to display mean income, you have to compute it. You can use dplyr and group_by() with summarise() to obtain the key variable and then plot. Here a code for the task:
library(ggplot2)
library(dplyr)
#Data
df <- data.frame(id=1:8,Gender=c(rep('Female',4),rep(c('Male','Female'),2)),
income=c(73,150,220.18,234,314.16,983.1,1001,1012),
SES=c('Bottom','Bottom','Middle','Middle','Middle',
'Upper','Upper','Upper'),
stringsAsFactors = F)
#Compute and plot
df %>% group_by(SES,Gender) %>%
summarise(MeanIncome=mean(income,na.rm=T)) %>%
ggplot(aes(x=Gender,y=MeanIncome)) +
scale_y_continuous("Mean")+
geom_bar(stat = 'identity')+
facet_wrap(.~SES)
Output:
Or you can avoid facets and displaying the plot with a fill variable like this:
#Code 2
df %>% group_by(SES,Gender) %>%
summarise(MeanIncome=mean(income,na.rm=T)) %>%
ggplot(aes(x=Gender,y=MeanIncome,fill=SES)) +
scale_y_continuous("Mean")+
geom_bar(stat = 'identity',position = position_dodge2(0.9,preserve = 'single'))
Output:

bicolor heatmap with factor levels

I have this dataframe:
set.seed(0)
df <- data.frame(id = factor(sample(1:100, 10000, replace=TRUE), levels=1:100),
year = factor(sample(1950:2019, 10000, replace=TRUE), levels=1950:2019)) %>% unique() %>% arrange(id, year)
And I'm looking to plot a heatmap graph where the ids are in the X-axis, years at the Y-axis, and the color is blue when the data point exists and the color is red when the data doesn't exist. I'm almost there, but I can't figure out to change the fill argument for the two colors:
ggplot(df, aes(id, year, fill= year)) +
geom_tile()
The objective to plot both variables as factors is to plot them even when some year doesn't have any id (and plotting its whole row as red).
EDIT:
Two things I forgot to add (hope it's not too late):
How to add alpha transparency to geom_tile() without messing it?
I need to sort the ids from maximum missings to minimum missings.
The complete() function from the tidyr package is useful for filling in missing combinations. First, you need to set a flag variable to indicate if the data is present or not, and then expand the data frame with the missing combinations and fill the new flag variable with 0:
df <- df %>%
mutate(flag = TRUE) %>%
complete(id, year, fill = list(flag = FALSE))
ggplot(df, aes(id, year, fill = flag)) +
geom_tile()
EDIT1: To add transparency, add alpha = 0.x within geom_tile(), where x is a value indicating the transparency. The lower the value, the more transparent.
EDIT2: To sort by missingness add the following code prior to the ggplot code:
# Determine the order of the IDs
df_order <- df %>%
group_by(id) %>%
summarize(sum = sum(flag)) %>%
arrange(desc(sum)) %>%
mutate(order = row_number()) %>%
select(id, order)
# Set the IDs in order on the chart
df <- df %>%
left_join(df_order) %>%
mutate(id = fct_reorder(id, order))
I think you need to do some pre-processing before plotting. Create a temporary variable (data_exist) which denotes data is present for that id and year. Then use complete to fill the missing years for each id and plot it.
library(tidyverse)
df %>%
mutate_all(~as.integer(as.character(.))) %>%
mutate(data_exist = 1) %>%
complete(id, year = min(year):max(year), fill = list(data_exist = 0)) %>%
mutate(data_exist = factor(data_exist)) %>%
ggplot() + aes(id, year, fill= data_exist) + geom_tile()
With expand.gridyou can create a dataframe with all combinations of ids and years, then left join on this combinations to see if you had them in df
all <- expand.grid(id=levels(df$id),year=levels(df$year)) %>%
left_join(df) %>%
mutate(present=ifelse(is.na(present),'0','1'))
ggplot(all, aes(as.numeric(id), as.numeric(year), fill= present)) +
geom_tile() +
scale_fill_manual(values=c('0'='red','1'='blue')) + # change default colors
theme(legend.position="None") # hide legend

Trying to filter rows by intervals and plotting number of rows obtained

Consider the column "disp" in mtcars. I am trying to divide disp into intervals so that I can count the number of observations in each interval. After doing this I want to plot the results as a ggplot geom_line
This is what I have tried:
library (tidyverse)
library (ggplot2)
a1 <- mtcars %>% arrange(desc(disp)) %>%
mutate(counts = cut_interval(disp, length = 5)) %>% group_by(counts) %>% mutate(nn = n())
a2 <- a1 %>% select(counts,nn) %>% unique()
ggplot(a2, aes(counts, nn)) +
geom_point(shape = 16, size = 1, show.legend = FALSE) +
theme_bw()
I get the intervals I need in a2. i can use it to plot a scatterplot but I can see that there is no proper scale. Is there any way to use these intervals to get a continuous scale and draw a lineplot of counts vs nn?
mtcars %>% ggplot(aes(x = disp)) + geom_histogram(binwidth = 1) + theme_bw()
Thanks so much Rui Barradas! I just needed a count plot so no need of doing extra stuff.

Using gganimate and ggplot for a boxplot: Cumulative not working

I'm trying to produce an animation for a simulation model, and I want to show how the distribution of results changes as the simulation runs.
I've seen gganimate used for scatter plots but not for boxplots (or ideally violin plots). Here I've provided a reprex.
When I use sim_category (which is a bucket for a certain number of simulation runs) I want the result to be cumulative of all previous runs to show the total distribution.
In this example (and my actual code), cumulative = TRUE does not do this. Why is this?
library(gganimate)
library(animation)
library(ggplot2)
df = as.data.frame(structure(list(ID = c(1,1,2,2,1,1,2,2,1,1,2,2),
value = c(10,15,5,10,7,17,4,12,9,20,6,17),
sim_category = c(1,1,1,1,2,2,2,2,3,3,3,3))))
df$ID <- factor(df$ID, levels = (unique(df$ID)))
df$sim_category <- factor(df$sim_category, levels = (unique(df$sim_category)))
ani.options(convert = shQuote('C:/Program Files/ImageMagick-7.0.5-Q16/magick.exe'))
p <- ggplot(df, aes(ID, value, frame= sim_category, cumulative = TRUE)) + geom_boxplot(position = "identity")
gganimate(p)
gganimate's cumulative doesn't accumulate the data, it just keeps gif frames in subsequent frames as they appear. To achieve what you want, you have to do the accumulation before building the plot, something along the following lines:
library(tidyverse)
library(gganimate)
df <- data_frame(
ID = factor(c(1,1,2,2,1,1,2,2,1,1,2,2), levels = 1:2),
value = c(10,15,5,10,7,17,4,12,9,20,6,17),
sim_category = factor(c(1,1,1,1,2,2,2,2,3,3,3,3), levels = 1:3)
)
p <- df %>%
pull(sim_category) %>%
levels() %>%
as.integer() %>%
map_df(~ df %>% filter(sim_category %in% 1:.x) %>% mutate(sim_category = .x)) %>%
ggplot(aes(ID, value, frame = factor(sim_category))) +
geom_boxplot(position = "identity")
gganimate(p)

Resources