Simulation with Tidyverse -- putting data into tibble format - r

I am trying to run simulations in R using the tidyverse. This code works, but doesn't scale well to more than a few variables.
Any thoughts on how to improve this? I've tried purrr but I didn't find any success.
The example below draws 5 values from a normal distribution and repeats this 3 times. How could I repeat it n times instead of 3?
# Number of draws per series; x indexes the draws 1..n.
n <- 5
x <- seq_len(n)
# Three independent series of standard-normal draws, generated in this order.
y1 <- rnorm(n)
y2 <- rnorm(n)
y3 <- rnorm(n)
# Wide layout: one column per series (column names are taken from the
# variable names).
df <- tibble(x, y1, y2, y3)
# Wide -> long: every y* column is stacked into `name` / `value` pairs.
df <- pivot_longer(df, cols = starts_with("y"))
# One line per series, coloured by series name.
ggplot(df) +
  geom_line(aes(x = x, y = value, group = name, color = name))

If we need to replicate, then
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
# n: number of draws per series; rpl: number of replicated series.
n <- 5
rpl <- 3

# Draw `rpl` independent series of `n` standard-normal values. The result is
# kept as a list (simplify = FALSE) so that each series becomes one column.
replicate(rpl, rnorm(n), simplify = FALSE) %>%
  # stats::setNames() is used instead of set_names(): none of the packages
  # attached for this snippet (dplyr, tidyr, stringr, ggplot2) exports
  # set_names(), so the original call fails unless purrr/rlang is loaded too.
  setNames(str_c("y", seq_along(.))) %>%
  as_tibble() %>%
  # x is the position of each draw within its series.
  mutate(x = row_number()) %>%
  # Wide -> long: series label goes to `name`, draw goes to `value`.
  pivot_longer(cols = starts_with("y")) %>%
  ggplot(aes(x = x, y = value, group = name, color = name)) +
  geom_line()

Related

two-panel scatter plot in ggplot2

For my data.frame full below, I'm wondering how to create a two-panel geom_point such that on the first panel, we have ols.(Intercept) (x-axis) plotted against hlm.(Intercept), AND on the second panel, we have ols.ses (x-axis) plotted against hlm.ses?
library(lme4)
library(tidyverse)
# Read the hsb data set from the remote CSV file.
hsb <- read.csv("https://raw.githubusercontent.com/rnorouzian/e/master/hsb.csv")
# Mixed model: intercept and `ses` slope vary by school (sch.id).
fit <- lmer(math ~ ses + (ses | sch.id), data = hsb)
# Distinct school ids, in order of first appearance.
ch <- unique(hsb$sch.id)
# One separate lm() per school; the coefficient vectors are row-bound.
ols <- map_dfr(ch, ~ coef(lm(math ~ ses, data = hsb, subset = sch.id == .x)))
# Per-school coefficients implied by the mixed model.
mlm <- coef(fit)$sch.id
# Side-by-side table; cbind() prefixes the columns with "ols." / "hlm.".
full <- cbind(ols = ols, hlm = mlm, sch.id = ch)
head(full, 1)
ols.(Intercept) ols.ses hlm.(Intercept) hlm.ses sch.id
1224 10.80513 2.508582 11.06002 2.504083 1224
One approach to achieve this is to make two separate plots and glue them together using e.g. patchwork:
library(lme4)
library(tidyverse)
library(patchwork)
# Read the hsb data set from the remote CSV file.
hsb <- read.csv("https://raw.githubusercontent.com/rnorouzian/e/master/hsb.csv")
# Mixed model with a random intercept and random `ses` slope per school.
fit <- lmer(math ~ ses + (ses | sch.id), data = hsb)
ch <- unique(hsb$sch.id)
# Per-school OLS coefficients, one row per school.
ols <- map_dfr(ch, ~ coef(lm(math ~ ses, data = hsb, subset = sch.id == .x)))
# Per-school coefficients from the mixed model.
mlm <- coef(fit)$sch.id
# Columns become ols.(Intercept), ols.ses, mlm.(Intercept), mlm.ses, sch.id.
full <- cbind(ols = ols, mlm = mlm, sch.id = ch)
# Panel 1: intercepts (OLS on x, mixed model on y).
p1 <- ggplot(full, aes(x = `ols.(Intercept)`, y = `mlm.(Intercept)`)) +
  geom_point()
# Panel 2: `ses` slopes (OLS on x, mixed model on y).
p2 <- ggplot(full, aes(x = ols.ses, y = mlm.ses)) +
  geom_point()
# patchwork combines the two plots into one figure.
p1 + p2
And as a second approach with some data wrangling one can achieve a similar plot using facet_wrap:
library(lme4)
#> Loading required package: Matrix
library(tidyverse)
# Read the hsb data set from the remote CSV file.
hsb <- read.csv("https://raw.githubusercontent.com/rnorouzian/e/master/hsb.csv")
# Mixed model with a random intercept and random `ses` slope per school.
fit <- lmer(math ~ ses + (ses | sch.id), data = hsb)
ch <- unique(hsb$sch.id)
# Per-school OLS coefficients, one row per school.
ols <- map_dfr(ch, ~ coef(lm(math ~ ses, data = hsb, subset = sch.id == .x)))
# Per-school coefficients from the mixed model.
mlm <- coef(fit)$sch.id
full <- cbind(ols = ols, mlm = mlm, sch.id = ch)
full %>%
  # Long format: one row per (school, coefficient column).
  pivot_longer(cols = -sch.id, names_to = "var", values_to = "value") %>%
  # "ols.ses" -> var1 = "ols", category = "ses" (split on the literal dot).
  separate(var, into = c("var1", "category"), sep = "\\.") %>%
  # Back to one column per estimator (ols, mlm) within each category.
  pivot_wider(names_from = var1, values_from = value) %>%
  ggplot(aes(x = ols, y = mlm)) +
  geom_point() +
  # One panel per coefficient.
  facet_wrap(~category)
An option with facets. The solution from @stefan was really nice and quick. You can set up an entire data pipeline by separating the column names into their two parts; after reshaping, the variables are in a format that can be plotted using facet_wrap(). Here is the code:
library(tidyverse)
#Plot
full %>%
  # Only the four coefficient columns are reshaped.
  select(-sch.id) %>%
  pivot_longer(everything()) %>%
  # "ols.ses" -> V1 = "ols" (estimator), V2 = "ses" (coefficient).
  separate(name, into = c("V1", "V2"), sep = "\\.") %>%
  arrange(V2, V1) %>%
  # Running index within each (coefficient, estimator) pair, so that
  # pivot_wider() can line the ols and mlm values up row by row.
  group_by(V2, V1) %>%
  mutate(id = row_number()) %>%
  pivot_wider(names_from = V1, values_from = value) %>%
  ungroup() %>%
  select(-id) %>%
  ggplot(aes(x = ols, y = mlm)) +
  geom_point() +
  # One panel per coefficient, each with its own axis ranges.
  facet_wrap(. ~ V2, nrow = 1, scales = "free")
Output:
Similar to the answer using patchwork, you can plot them as two separate ggplot() graphs and then put them side-by-side with the plot_grid() function from the cowplot package.
https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html

Using geom_smooth for fitting a glm to fractions

This post is somewhat related to this post.
Here I have xy grouped data where y are fractions:
library(dplyr)
library(ggplot2)
library(ggpmisc)
set.seed(1)
# Observed fraction for each (group, age) cell.
# stringsAsFactors = TRUE is set explicitly because the pipeline relies on
# `age` being a factor: as.integer(age) is meant to return the level codes
# (d2 = 1, d4 = 2, d45 = 3). Since R 4.0.0 the default is FALSE, and
# as.integer() on the strings "d2", "d4", "d45" would yield NA with a warning.
df1 <- data.frame(
  value = c(0.8, 0.5, 0.4, 0.2, 0.5, 0.6, 0.5, 0.48, 0.52),
  age = rep(c("d2", "d4", "d45"), 3),
  group = c("A", "A", "A", "B", "B", "B", "C", "C", "C"),
  stringsAsFactors = TRUE
) %>%
  dplyr::mutate(time = as.integer(age)) %>%
  dplyr::arrange(group, time) %>%
  dplyr::mutate(group_age = paste0(group, "_", age))
# Fix the level order to the row order so bars are drawn group by group.
df1$group_age <- factor(df1$group_age, levels = unique(df1$group_age))
What I'm trying to achieve is to plot df1 as a bar plot, like this:
# One bar per group/age cell; bar height is the stored fraction itself.
ggplot(df1, aes(x = group_age, y = value, fill = age)) +
  geom_col()
But I want to fit to each group a binomial glm with a logit link function, which estimates how these fractions are affected by time.
Let's say I have 100 observations per each age (time) in each group:
# Expand every row of df1 into 100 binary observations whose share of TRUE
# equals the row's fraction.
df2 <- do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  # round() matters: rep() truncates non-integer counts, and e.g.
  # 100 * (1 - 0.8) evaluates to 19.999999999999996, which would silently
  # produce 19 instead of 20 rows.
  n_true <- round(100 * df1$value[i])
  data.frame(
    age = df1$age[i],
    group = df1$group[i],
    time = df1$time[i],
    group_age = df1$group_age[i],
    value = rep(c(TRUE, FALSE), times = c(n_true, 100 - n_true))
  )
}))
Then the glm for each group (e.g., group A) is:
# Logistic regression of the binary outcome on time, for group A only.
glm(
  value ~ time,
  data = dplyr::filter(df2, group == "A"),
  family = binomial(link = "logit")
)
So I would like to add to the plot above the estimated regression slopes for each group along with their corresponding p-values (similar to what I'm doing for the continuous df$value in this post).
I thought that using:
# Attempted plot from the question; kept unchanged because it is the code
# that produces the error quoted below it.
ggplot(df1,aes(x=group_age,y=value,fill=age)) +
geom_bar(stat='identity') +
# Per-group logistic fit drawn from the inflated data (df2).
geom_smooth(data=df2,mapping=aes(x=group_age,y=value,group=group),color="black",method='glm',method.args=list(family=binomial(link='logit')),size=1,se=T) +
# NOTE(review): `my_formula` is not defined anywhere in this snippet.
stat_poly_eq(aes(label=stat(p.value.label)),formula=my_formula,parse=T,npcx="center",npcy="bottom") +
# x is mapped to group_age, which was converted to a factor; a log10 scale on
# a factor is presumably what triggers the 'log' error shown below.
# NOTE(review): `df` is not defined in this snippet (only df1 and df2 are).
scale_x_log10(name="Age",labels=levels(df$age),breaks=1:length(levels(df$age))) +
facet_wrap(~group) + theme_minimal()
Would work but I get the error:
Error in Math.factor(x, base) : ‘log’ not meaningful for factors
Any idea how to get it right?
I believe this could help:
library(tidyverse)
library(broom)
# The response is stored as logical; convert it to 0/1 so it can be both
# modelled and summed.
df2$value <- as.numeric(df2$value)

# Fit one logistic regression per group and stack the tidy coefficient tables.
# group_modify() + tidy() replaces do() followed by tidy(<rowwise df>, <col>):
# do() is superseded, and broom's tidier for rowwise data frames was
# deprecated and dropped from later broom releases.
dfCoef <- df2 %>%
  group_by(group) %>%
  group_modify(
    ~ tidy(glm(value ~ time, data = .x, family = binomial(link = "logit")))
  ) %>%
  ungroup()

# One text label per group: rounded estimate and p-value, anchored on the
# x position of the "d4" bar of that group.
# NOTE(review): this labels the intercept row; if the slope is wanted, the
# filter would be term == "time" instead. Confirm the intent.
Labels <- dfCoef %>%
  filter(term == "(Intercept)") %>%
  mutate(
    Label = paste0(round(estimate, 3), "(p=", round(p.value, 3), ")"),
    group_age = paste0(group, "_", "d4")
  ) %>%
  select(group, Label, group_age)

# Height of the tallest stacked bar in each group; used as the label's y.
values <- df2 %>%
  group_by(group, group_age) %>%
  summarise(value = sum(value)) %>%
  ungroup() %>%
  group_by(group) %>%
  filter(value == max(value)) %>%
  select(-group_age) %>%
  ungroup()

# Attach the y position to each label. `age` is added as NA because the plot
# maps fill = age globally and geom_text() inherits that mapping.
Labels <- Labels %>%
  left_join(values, by = "group") %>%
  mutate(age = NA)

# Stacked bars of the 0/1 observations, one panel per group, with the label.
ggplot(df2, aes(x = group_age, y = value, fill = age)) +
  geom_text(
    data = Labels,
    aes(x = group_age, y = value, label = Label),
    fontface = "bold"
  ) +
  geom_bar(stat = "identity") +
  facet_wrap(. ~ group, scales = "free")
Thanks to Pedro Aphalo this is nearly a complete solution:
Generate the data.frame with the fractions (here use time as an integer by deleting "d" in age rather than using time as the levels of age):
library(dplyr)
library(ggplot2)
library(ggpmisc)
set.seed(1)
# Observed fraction for each (group, age) cell. `time` is the numeric part of
# the age label ("d45" -> 45); `group_age` is the combined bar label.
df1 <- data.frame(
  value = c(0.8, 0.5, 0.4, 0.2, 0.5, 0.6, 0.5, 0.48, 0.52),
  age = rep(c("d2", "d4", "d45"), 3),
  group = c("A", "A", "A", "B", "B", "B", "C", "C", "C")
) %>%
  dplyr::mutate(
    time = as.integer(gsub("d", "", age)),
    group_age = paste0(group, "_", age)
  ) %>%
  dplyr::arrange(group, time)
# Level order follows the sorted row order (group first, then time).
df1$group_age <- factor(df1$group_age, levels = unique(df1$group_age))
Inflate df1 to 100 observations per each age in each group but specify value as an integer rather than a binary:
# Expand every row of df1 into 100 observations coded 1/0 whose share of 1s
# equals the row's fraction.
df2 <- do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  # round() matters: rep() truncates non-integer counts, and e.g.
  # 100 * (1 - 0.8) evaluates to 19.999999999999996, which would silently
  # produce 19 instead of 20 rows.
  n_one <- round(100 * df1$value[i])
  data.frame(
    age = df1$age[i],
    group = df1$group[i],
    time = df1$time[i],
    group_age = df1$group_age[i],
    value = rep(c(1, 0), times = c(n_one, 100 - n_one))
  )
}))
And now plot it using geom_smooth and stat_fit_tidy:
# Bars show the observed fractions (df1); the smooth and the p-value label
# come from a per-group logistic regression on the inflated 0/1 data (df2).
ggplot(df1, aes(x = time, y = value, group = group, fill = age)) +
  geom_bar(stat = "identity") +
  geom_smooth(
    data = df2,
    mapping = aes(x = time, y = value, group = group),
    color = "black",
    method = "glm",
    method.args = list(family = binomial(link = "logit"))
  ) +
  stat_fit_tidy(
    data = df2,
    mapping = aes(
      x = time, y = value, group = group,
      label = sprintf("P = %.3g", stat(x_p.value))
    ),
    method = "glm",
    method.args = list(formula = y ~ x, family = binomial(link = "logit")),
    parse = TRUE,
    label.x = "center",
    label.y = "top"
  ) +
  # Labels are taken with unique() rather than levels(): `age` is a plain
  # character column under R >= 4.0, for which levels() returns NULL and the
  # axis would lose its labels. age and time map one-to-one and are read in
  # the same row order, so labels and breaks stay aligned.
  scale_x_log10(
    name = "Age",
    labels = as.character(unique(df2$age)),
    breaks = unique(df2$time)
  ) +
  facet_wrap(~group) +
  theme_minimal()
Which gives (note that the scale_x_log10 is mainly a cosmetic approach to presenting the x-axis as time rather than levels of age):
The only imperfection is that the p-values seem to appear messed up.

Adding character values of a column in R

I have two columns i.e. square_id & Smart_Nsmart as given below.
I want to count(add) N's and S's against each square_id and ggplot the data i.e. plot square_id vs Smart_Nsmart.
square_id: 1 1 2 2 2 2 3 3 3 3
Smart_Nsmart: S N N N S S N S S S
We can use count and then use ggplot to plot the frequency. Here, we are plotting it with geom_bar (as it is not clear from the OP's post)
library(dplyr)
library(ggplot2)
# Tally each (square_id, Smart_Nsmart) pair, then draw the tallies as stacked
# bars: one bar per square_id, split by Smart_Nsmart.
df %>%
  count(square_id, Smart_Nsmart) %>%
  ggplot(aes(x = square_id, y = n, fill = Smart_Nsmart)) +
  geom_col()
The above answer is very smart. However, instead of count function, you can implement group_by and summarise just in case in future you want to apply some other functions to your code.
library(dplyr)
library(ggplot2)
# Small example: `a` plays the role of the id, `b` the category.
dff <- data.frame(
  a = c(1, 1, 1, 1, 2, 1, 2),
  b = c("C", "C", "N", "N", "N", "C", "N")
)
# Count the rows in each (a, b) cell; summarise() can later be extended with
# other per-cell statistics.
dff %>%
  group_by(a, b) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = a, y = n, fill = b)) +
  geom_col()

Two Variable side by side bar plot ggplot of categorical data

# Example data: five rows of two rating columns, parsed from inline text.
# The first line of the text supplies the column names.
demo <- read.table(
  text = "var1 Var2
Good Excellent
Subpar Good
Excellent Decent
Good Good
Subpar Subpar",
  header = TRUE
)
How would I create a side-by-side bar plot of Var1 and Var2, where the y-axis is the count of each distinct value?
For instance a bar under good comparing the number of good in var1 to var2?
The tidyverse is perfect for that:
library(tidyverse)
# Stack both rating columns into long format (`key` = source column,
# `value` = rating). pivot_longer() replaces the superseded gather().
demo %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  # Fix the display order of the ratings on the x-axis.
  mutate(
    value_ordered = factor(
      value,
      levels = c("Decent", "Good", "Subpar", "Excellent")
    )
  ) %>%
  # geom_bar() counts the rows per rating; the two columns are dodged.
  ggplot(aes(value_ordered, fill = key)) +
  geom_bar(position = "dodge")
Or bars with same width:
# Same plot with equal bar widths: count first, then add the (column, rating)
# combinations that never occur so every rating has a slot for both columns.
# as_tibble() replaces the deprecated as.tbl(); pivot_longer() replaces the
# superseded gather().
as_tibble(demo) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  # Frequency of each rating within each source column (result is ungrouped).
  count(key, value) %>%
  # Missing combinations get an explicit count of 0 rather than NA, so they
  # are drawn as zero-height bars instead of being dropped with a warning.
  complete(key, value, fill = list(n = 0L)) %>%
  mutate(
    value_ordered = factor(
      value,
      levels = c("Decent", "Good", "Subpar", "Excellent")
    )
  ) %>%
  ggplot(aes(value_ordered, n, fill = key)) +
  # geom_col() is the direct form of geom_bar(stat = "identity").
  geom_col(position = "dodge")
library(reshape)
library(ggplot2)
#sample data
# Example data: five rows of two rating columns, parsed from inline text.
demo <- read.table(
  text = "var1 var2
Good Excellent
Subpar Good
Excellent Decent
Good Good
Subpar Subpar",
  header = TRUE
)
# Pre-processing: tabulate each column, melt both tables to long form and
# outer-join them on the rating so every rating appears once.
df <- merge(
  melt(table(demo$var1)),
  melt(table(demo$var2)),
  by = "Var.1",
  all = TRUE
)
names(df) <- c("Words", "Var1", "Var2")
# Ratings absent from one of the columns come out of the join as NA.
df[is.na(df)] <- 0
# Back to long form: one row per (rating, source column).
df <- melt(df, id.vars = "Words")
names(df) <- c("Words", "Column_Name", "Count")
# Plot: bars dodged by source column, heights taken directly from Count.
ggplot(df, aes(x = Words, y = Count, fill = Column_Name)) +
  geom_col(position = "dodge")

Plot MNIST digits with ggplot2

I want to plot the MNIST digits using ggplot2.
I tried this but I'm getting the numbers rotated 90 degrees. The code below is to plot the 2nd number in the dataset which corresponds to a 2.
# Download the training data as a data frame.
trainData <- read.csv(file = url("https://drive.google.com/uc?export=download&id=0B4Tqe9kUUfrBSllGY29pWmdGQUE"))
# 28 x 28 grid of cell coordinates; y varies fastest.
df <- expand.grid(y = 0:27, x = 0:27)
# Values of row 2 with the first two columns dropped, flattened to a vector
# and assigned to the grid cells in order.
df[["col"]] <- unlist(trainData[2, -(1:2)])
ggplot(df, aes(x = x, y = y)) +
  geom_tile(aes(fill = col))
If possible, please consider in your solution that I plan to expand this to plotting a matrix of numbers using facet_grid or facet_wrap. I want to end up with a function to which I pass a vector of rows; the function will get those rows from the dataset and create a matrix of plots (one for each number).
Thanks!
mnist is a built-in dataset in the keras package.
Here is one example plot with ggplot2 and tidyverse functions:
To make geom_tile work, we need to transform the data a bit.
library(keras)
library(dplyr)
library(tibble)
library(tidyr)
library(stringr)
# ggplot() and geom_tile() are used below, but ggplot2 is not among the
# packages attached for this snippet, so it is attached here.
library(ggplot2)

mnist <- keras::dataset_mnist()

# Take one of the first 100 test images at random (a 28 x 28 slice) and
# reshape it to one row per pixel so geom_tile() can draw it.
mnist$test$x[sample(1:100, 1), 1:28, 1:28] %>%
  # as.data.frame() names the columns V1..V28 and the rows "1".."28";
  # it replaces the deprecated tibble::as_data_frame().
  as.data.frame() %>%
  rownames_to_column(var = "y") %>%
  # Long format; names_prefix strips the "V" so `x` holds the column number.
  # pivot_longer() replaces the superseded gather().
  pivot_longer(V1:V28, names_to = "x", names_prefix = "V", values_to = "val") %>%
  mutate(
    x = as.numeric(x),
    # Flip vertically: matrix row 1 should be drawn at the top of the plot.
    y = 28 - as.numeric(y)
  ) %>%
  ggplot(aes(x, y)) +
  geom_tile(aes(fill = val + 1)) +
  # Square pixels, no axes or legend.
  coord_fixed() +
  theme_void() +
  theme(legend.position = "none")

Resources