Montecarlo Simulation using R (how to "translate" Stata codes for R) - r

I need to run the same exercise (Monte Carlo simulations) from Stata to R.
The codes I have used in Stata are the codes bellow. How can I do this using R? (I have searched for many tutorials, but I still didn't manage to do it in R).
* Simulations (10, 100 and 1000 sample replications/iterations)
clear
drop _all
set obs 100
set seed 54231
gen x = ((rnormal()))*10 + 40
* Generating true_y, considering Beta = 0,035
gen true_y = 5+0.03500*x
save truth, replace
twoway scatter true_y x
program hetero1
version 13
args c
use truth, clear
gen y = true_y + rnormal()
regress y x
end
foreach i in 10 100 1000 {
simulate _b, reps (`i'): hetero1
sum _b_x
twoway histogram _b_x, fraction xline(+0.03500, lcolor(red)) xline(`r(mean)', lcolor(green)) fcolor(none) lcolor(gs0) legend(off) title(`i' Repetições)
graph save graf`i'.gph, replace
}
gr combine graf10.gph graf100.gph graf1000.gph
graph export "graf.png", as(png) replace
At the end, I need to obtain these 3 histograms (of estimated beta/coefficients), considering 10, 100 and 1000 sample replications. The red line refers to the "true" coefficient and the green one is the mean of the estimated coefficients - [see the image in the link]

This should do it:
# Simulations (10, 100 and 1000 sample replications/iterations)
library(ggplot2)
library(dplyr)
library(gridExtra)
n <- 100
set.seed(54231)
x <- rnorm(n)*10 + 40
# Generating true_y, considering Beta = 0,035
true_y <- 5+0.03500*x
plot(x, true_y)
b <- t(replicate(1110, coef(lm(true_y + rnorm(n) ~ x))))
b <- as.data.frame(b) %>%
rename("a" = "(Intercept)",
"b" = "x") %>%
mutate(
obs = 1:n(),
n = case_when(
obs %in% 1:10 ~ "N = 10",
obs %in% 11:110 ~ "N = 100",
TRUE ~ "N = 1000"),
n = factor(n, levels=c("N = 10", "N = 100", "N = 1000")))
b10 <- b %>% filter(n == "N = 10")
g1 <- ggplot() +
geom_histogram(data = b10, aes(x=b), bins=3, col="white") +
geom_vline(xintercept = 0.03500, col="red") +
geom_vline(data = b10 %>% summarise(b=mean(b)), aes(xintercept = b), col="green") +
facet_wrap(~n) +
theme_bw()
b100 <- b %>% filter(n == "N = 100")
g2 <- ggplot() +
geom_histogram(data = b100, aes(x=b), bins=10, col="white") +
geom_vline(xintercept = 0.03500, col="red") +
geom_vline(data = b100 %>% summarise(b=mean(b)), aes(xintercept = b), col="green") +
facet_wrap(~n) +
theme_bw()
b1000 <- b %>% filter(n == "N = 1000")
g3 <- ggplot() +
geom_histogram(data = b1000, aes(x=b), bins=25, col="white") +
geom_vline(xintercept = 0.03500, col="red") +
geom_vline(data = b1000 %>% summarise(b=mean(b)), aes(xintercept = b), col="green") +
facet_wrap(~n) +
theme_bw()
library(gridExtra)
grid.arrange(g1, g2, g3, nrow=2)

Related

How to generate a facet_grid where each facet has different interaction terms in R ggplot2?

There are four species A-D in a community influenced by temperature and moisture. Species A has an interaction term between temperature and moisture, everything else has a simple linear relationship with moisture (as assumed). I want to create a ggplot where B,C, and D have linear relationships but A shows an interaction. As oultlined here I will need to choose either mositure or temperature for the x axis, and represent the other variable with another aesthetic such as color.
Here is the code for the species
library(lme4)
library(ggplot2)
library(geomtextpath)
library(lme4)
##Creating the DataFrame
set.seed(111)
mean.temp <- rnorm(100, 20,1)
mean.moisture <- rnorm(100,30,1)
y.var <- rnorm(100, 2,1)
site <- rep(c("1","2","3","4"), times = 25)
species <- rep(c("A","B","C","D"), each = 25)
df <- data.frame(mean.temp, mean.moisture, y.var, site, species)
df$site <- as.factor(as.character(df$site))
df$species <- as.factor(as.character(df$species))
##Assume only species A has singificant interactions terms
dfA <- df[df$species == "A",]
modA <- lmer(dfA$y.var ~ dfA$mean.temp * dfA$mean.moisture + (1|dfA$site), data = dfA)
dfA.mod <- expand.grid(mean.temp = seq(min(dfA$mean.temp), max(dfA$mean.temp), length = 4),
mean.moisture = seq(min(dfA$mean.moisture), max(dfA$mean.moisture),
length = length(dfA$species)))
dfA.mod$y.var <- predict(modA, newdata = dfA.mod)
##Assume all other species have a linear relationship
dfD <- df[df$species == "D",]
modD <- lmer(y.var ~ mean.moisture + (1|site), data = dfD)
dfD$y.var <- predict(modD)
dfC <- df[df$species == "C",]
modC <- lmer(y.var ~ mean.moisture + (1|site), data = dfC)
dfC$y.var <- predict(modC)
dfB <- df[df$species == "B",]
modB <- lmer(y.var ~ mean.moisture + (1|site), data = dfB)
dfB$y.var <- predict(modB)
#Merge A,B,C and get rid of site, species etc..
df.ABC <- rbind(dfB, dfC, dfD)
df.ABC <- df.ABC[,c(1:3)]
#Merge predictions from all four species
df.pred <- rbind(dfA.mod,df.ABC)
If I plotted A alone it would look like
ggplot(dfA.mod, aes(x = mean.moisture, y = y.var, group = mean.temp)) +
geom_point(data = dfA, aes(shape = site, color = mean.temp)) +
geom_textline(aes(color = mean.temp, label = round(mean.temp, 2)), hjust = 0.95) + scale_color_gradient(low = 'navy', high = 'red4') + theme_light(base_size = 16) + facet_grid(.~species)
I want to use some such similar code for all the species with facet_wrap. Here is my attempt
#Plot the raw data coded by site, and the predictions
ggplot(df.pred, aes(x = mean.moisture, y = y.var, group = mean.temp)) +
geom_point(data = df, aes(shape = site, color = mean.temp)) +
geom_textline(aes(color = mean.temp, label = round(mean.temp, 2)), hjust = 0.95) + scale_color_gradient(low = 'navy', high = 'red4') + theme_light(base_size = 16) + facet_grid(.~species)
It's a mush. How can I change these plots to represent only a single line in B to C, but 4 lines in plot A

Multiple geom_smooth at differing thresholds

I would like to make a plot that has multiple geom_smooth(method="loess") lines for differing thresholds, but I'm having some issues.
Specifically, I want a geom_smooth() line for the all points >1 standard deviation (SD) or < -1 SD (which includes -/+2SD), one for <-2SD and >2SD, and one with all the points together. However, I'm running into an issue where it is only doing the smooth for the data within each category (i.e. greater than 1 SD but less than 2 SD.
I have made some toy data here:
#test data
a <- c(rnorm(10000, mean=0, sd = 1))
b <- c(rnorm(10000, mean=0, sd = 1))
test <- as.data.frame(cbind(a,b))
test3$Thresholds <- cut(test$a, breaks = c(-Inf,-2*sd(test$a),-sd(test$a),0,sd(test$a), 2*sd(test$a), Inf),
labels = c("2_SD+", "1_SD", "0_SD","0_SD", "1_SD", "2_SD+"))
plot <- ggplot(test3, aes(x=b, y=a, color=Thresholds, alpha = 0.25, legend = F)) + geom_point() + geom_smooth(method="loess")
This creates the following plot:
Does anyone have any suggestions?
If you want smoothing done for different quantities of x and y you have to manipulate the data component...
library(ggplot2)
library(dplyr)
#test data
a <- c(rnorm(10000, mean=0, sd = 1))
b <- c(rnorm(10000, mean=0, sd = 1))
test <- as.data.frame(cbind(a,b))
test$Thresholds <- cut(test$a, breaks = c(-Inf,-2*sd(test$a),-sd(test$a),0,sd(test$a), 2*sd(test$a), Inf),
labels = c("2_SD+", "1_SD", "0_SD","0_SD", "1_SD", "2_SD+"))
ggplot(test, aes(x=b, y=a)) +
geom_point() +
# just 2
geom_smooth(data = test %>% filter(Thresholds == "2_SD+"), method="loess") +
# 1 and 2
geom_smooth(data = test %>% filter(Thresholds == "1_SD" | Thresholds == "2_SD+" ), method="loess", color = "yellow") +
#all
geom_smooth(data = test, method="loess", color = "red")
#> `geom_smooth()` using formula 'y ~ x'
#> `geom_smooth()` using formula 'y ~ x'
#> `geom_smooth()` using formula 'y ~ x'

Change scale in geom_qq

I'd like to get the numeric values of a variable (rather than z-score) in the x-axis using ggplot and geom_qq
library("ggplot2")
coin_prob <- 0.5 # this is a fair coin
tosses_per_test <- 5000 # we want to flip a coin 5000 times
no_of_tests <- 1000
outcomes <- rbinom(n = no_of_tests,
size = tosses_per_test,
prob = coin_prob)/tosses_per_test
outcomes.df <- data.frame("results"= outcomes)
ggplot(outcomes.df, aes(sample = results)) +
geom_qq() +
geom_qq_line(color="red") +
labs(x="Theoretical Data", title = "Simulated Coin toss", subtitle = "5000 tosses repeated 1000 times", y="Sample Outcomes")
The default in ggplot for the x-axis seems to be z-scores rather than raw theoretical values. I can hack around like this to get the "real" x axis
p <- ggplot(outcomes.df, aes(sample = results)) + geom_qq()
g <- ggplot_build(p)
raw_qs <- g$data[[1]]$theoretical*sd(outcomes.df$results) + mean(outcomes.df$results)
ggplot(outcomes.df, aes(sample = results)) +
geom_qq() +
geom_qq_line(color="red") +
labs(x="Theoretical Data", title = "Simulated Coin toss", subtitle = "5000 tosses repeated 1000 times", y="Sample Outcomes") +
scale_x_continuous(breaks=seq(-3,3,1), labels = round((seq(-3,3,1)*sd(outcomes.df$results) + mean(outcomes.df$results)),2))
But there's got to be something simpler
Set the parameters of the distribution such that the theoretical quantiles match the distribution to which you're comparing.
library("ggplot2")
coin_prob <- 0.5 # this is a fair coin
tosses_per_test <- 5000 # we want to flip a coin 5000 times
no_of_tests <- 1000
outcomes <- rbinom(
n = no_of_tests,
size = tosses_per_test,
prob = coin_prob) / tosses_per_test
## set dparams in _qq calls
## so that we're not comparing against standard normal distn.
ggplot(mapping = aes(sample = outcomes)) +
geom_qq(dparams = list(mean = mean(outcomes), sd = sd(outcomes))) +
geom_qq_line(
dparams = list(mean = mean(outcomes), sd = sd(outcomes)),
color = "red"
) +
labs(
x = "Theoretical Data",
title = "Simulated Coin toss",
subtitle = "5000 tosses repeated 1000 times",
y = "Sample Outcomes"
)
You can also change the distribution entirely.
For example, to compare against uniform quantiles (eg, p-values)
pvals <- replicate(1000, cor.test(rnorm(100), rnorm(100))$p.value)
ggplot(mapping = aes(sample = pvals)) +
geom_qq(distribution = stats::qunif) +
geom_qq_line(
distribution = stats::qunif,
color = "red"
) +
labs(
x = "Uniform quantiles",
title = "p-values under the null",
subtitle = "1,000 null correlation tests",
y = "Observed p-value"
)

Underscore plot in R

Introduction and Current Work Done
[Note: For those interested, I have provided code at the end for reproducing my example.]
I have some data and I have conducted an ANOVA analysis and obtained Tukey's pairwise comparisons:
model1 = aov(trt ~ grp, data = df)
anova(model1)
> TukeyHSD(model1)
diff lwr upr p adj
B-A 0.03481504 -0.40533118 0.4749613 0.9968007
C-A 0.36140489 -0.07874134 0.8015511 0.1448379
D-A 1.53825179 1.09810556 1.9783980 0.0000000
C-B 0.32658985 -0.11355638 0.7667361 0.2166301
D-B 1.50343674 1.06329052 1.9435830 0.0000000
D-C 1.17684690 0.73670067 1.6169931 0.0000000
I can also plot Tukey's pairwise comparisons
> plot(TukeyHSD(model1))
We can see from Tukey's confidence intervals and the plot that A-B, B-C and A-C are not significantly different.
Problem
I have been asked to create something called an "underscore plot" which is described as follows:
We plot the group means on the real line and we draw a line segment between group means to indicate that there is no significant difference between those two particular groups.
Obtaining the means is not difficult:
> aggregate(df$trt ~ df$grp, FUN = mean)
df$grp df$trt
1 A 2.032086
2 B 2.066901
3 C 2.393491
4 D 3.570338
Desired Output
Using the data in this example, the desired plot should appear like the one below:
There is a line segment between the groups that are not significantly different (i.e. a line segment between A-B, B-C and A-C as indicated by Tukey's).
Note: Please note that the plot above is not to scale and it was created in keynote for illustrative purposes only.
Is there a way to get the "underscore plot" described above using R (using either base R or a library such as ggplot2)?
Edit
Here is the code that I used to create the example above:
library(data.table)
set.seed(3)
A = runif(20, 1,3)
A = data.frame(A, rep("A", length(A)))
B = runif(20, 1.25,3.25)
B = data.frame(B, rep("B", length(B)))
C = runif(20, 1.5,3.5)
C = data.frame(C, rep("C", length(C)))
D = runif(20, 2.75,4.25)
D = data.frame(D, rep("D", length(D)))
df = list(A, B, C, D)
df = rbindlist(df)
colnames(df) = c("trt", "grp")
Here's a ggplot version of the underscore plot. We'll load the tidyverse package, which loads ggplot2, dplyr and a few other packages from the tidyverse. We create a data frame of coefficients to plot the group names, coefficient values, and vertical segments and a data frame of non-significant pairs for generating the horizontal underscores.
library(tidyverse)
model1 = aov(trt ~ grp, data=df)
# Get coefficients and label coefficients with names of levels
coefs = coef(model1)
coefs[2:4] = coefs[2:4] + coefs[1]
names(coefs) = levels(model1$model$grp)
# Get non-significant pairs
pairs = TukeyHSD(model1)$grp %>%
as.data.frame() %>%
rownames_to_column(var="pair") %>%
# Keep only non-significant pairs
filter(`p adj` > 0.05) %>%
# Add coefficients to TukeyHSD results
separate(pair, c("pair1","pair2"), sep="-", remove=FALSE) %>%
mutate(start = coefs[match(pair1, names(coefs))],
end = coefs[match(pair2, names(coefs))]) %>%
# Stagger vertical positions of segments
mutate(ypos = seq(-0.03, -0.04, length=3))
# Turn coefs into a data frame
coefs = enframe(coefs, name="grp", value="coef")
ggplot(coefs, aes(x=coef)) +
geom_hline(yintercept=0) +
geom_segment(aes(x=coef, xend=coef), y=0.008, yend=-0.008, colour="blue") +
geom_text(aes(label=grp, y=0.011), size=4, vjust=0) +
geom_text(aes(label=sprintf("%1.2f", coef)), y=-0.01, size=3, angle=-90, hjust=0) +
geom_segment(data=pairs, aes(group=pair, x=start, xend=end, y=ypos, yend=ypos),
colour="red", size=1) +
scale_y_continuous(limits=c(-0.05,0.04)) +
theme_void()
Base R
d1 = data.frame(TukeyHSD(model1)[[1]])
inds = which(sign(d1$lwr) * (d1$upr) <= 0)
non_sig = lapply(strsplit(row.names(d1)[inds], "-"), sort)
d2 = aggregate(df$trt ~ df$grp, FUN=mean)
graphics.off()
windows(width = 400, height = 200)
par("mai" = c(0.2, 0.2, 0.2, 0.2))
plot(d2$`df$trt`, rep(1, NROW(d2)),
xlim = c(min(d2$`df$trt`) - 0.1, max(d2$`df$trt`) + 0.1), lwd = 2,
type = "l",
ann = FALSE, axes = FALSE)
segments(x0 = d2$`df$trt`,
y0 = rep(0.9, NROW(d2)),
x1 = d2$`df$trt`,
y1 = rep(1.1, NROW(d2)),
lwd = 2)
text(x = d2$`df$trt`, y = rep(0.8, NROW(d2)), labels = round(d2$`df$trt`, 2), srt = 90)
text(x = d2$`df$trt`, y = rep(0.75, NROW(d2)), labels = d2$`df$grp`)
lapply(seq_along(non_sig), function(i){
lines(cbind(d2$`df$trt`[match(non_sig[[i]], d2$`df$grp`)], rep(0.9 - 0.01 * i, 2)))
})

ggplotly: how to return dates with stat_smooth?

I am trying to plot cumulative sums of groups over time, and add corresponding linear prediction lines for each group. The plot turns out well, however, I cannot read the dates of the prediction slopes on the x-axis as these are numbers.
How can I change the code below so that stat_smooth returns dates instead of numbers?
g <- ggplot(aes(x = created_at_day, y = sum, color = groups), data= df) +
geom_line() +
stat_smooth(aes(x = as.Date(created_at_day), y = sum, color = groups), method = "lm", fullrange = TRUE, se = FALSE, size = 0.1) +
xlab('Date') +
ylab('cumulative sum of patients') +
expand_limits(x = as.Date(c("2017-11-13", "2018-04-01"))) +
ggtitle('Number of patients included per practice') +
theme_bw()
g
ggplotly(g)
EDIT: Here is some code to generate the plot above.
set.seed(101)
created_at_day= sample(seq(as.Date('2017-01-01'), as.Date('2018-01-01'), by = "day"), 300)
set.seed(101)
groups= sample(1:3,300, replace= TRUE)
df = data.frame(created_at_day, groups)
group1= seq(1, table(groups)[[1]]*1.5, 1.5)
group2= seq(0.8, table(groups)[[2]]*0.8, 0.8)
group3= seq(1, table(groups)[[3]]*1.8, 1.8)
df_g1= subset(df, groups == 1)
df_g1 = arrange(df_g1, created_at_day)
df_g1= cbind(df_g1, sum= group1)
df_g2= subset(df, groups == 2)
df_g2 = arrange(df_g2, created_at_day)
df_g2= cbind(df_g2, sum= group2)
df_g3= subset(df, groups == 3)
df_g3 = arrange(df_g3, created_at_day)
df_g3= cbind(df_g3, sum= group3)
df= rbind(df_g1, df_g2, df_g3)
df$groups= as.factor(df$groups)

Resources