Plotting from a for loop - r

set.seed(666999)
PLOT <- function(n){
n_samples<-10
#Creating matrix to store generated data
data_matrix<-matrix(ncol =n_samples, nrow= n)
for(j in 1:n_samples){
#Generating 10 standard normal samples of size n
data_matrix[ , j]<-rnorm(n=n, mean = 0, sd = 1)
}
for(k in 1:n_samples){
sam_20<-data_matrix[ , k]
# extracting each of the kth samples
#Ploting
Q_Qplot<-qqnorm( sam_20 )
Q_QplotL<-qqline( sam_20 )
#Q_Qplot<-gg_qqplot(sam_20, ylab="Sample Quantiles",
#xlab = "Theoritical Quantiles",
#main= bquote("Q-Q plot for sample size of "
# ~ n == ~ .(n)))#quantile-quantile plot
}
# return(n)
#return(Q_Qplot)
#return(Q_QplotL)
}
#layout_matrix_1 <- matrix(1:10, ncol = 5) # Define position matrix
#layout(layout_matrix_1)
PLOT(100)
PLOT(50)
PLOT(20)
PLOT(10)
For each sample size, I am generating ten plots. I want the layout for each of my 10 plots to fit well on a page with reasonable height and width. For some reason, I can't get it to work the way I wanted. I need some assistance perhaps with ggplot2 which has a better aesthetic view. Thanks

An approach with tidyverse
library(tidyverse)
#Values to simulate
sample_size <- c(10,20,50,100)
sample_num <- 1:10
#Create data.frame
expand_grid(sample_size,sample_num) %>%
#Map values to simulate cenarios
mutate(y = map(sample_size,sim_data)) %>%
unnest(cols = c(y)) %>%
ggplot(aes(sample = y))+
#qqplot
stat_qq()+
#qqline with color blue
stat_qq_line(col = "blue")+
#facet sample_size x sample_num
facet_grid(cols = vars(sample_size),rows = vars(sample_num))

Related

Standalone legend in ggpairs

How can I include a legend inside one of the empty panels of the following matrix plot?
I have color coded different regression lines in the plots. I need a legend based on color.
I believe this answer comes closest to answer my question, yet I do not know how exactly to modify my code to get a legend based on color for different regression lines.
As for the background of the code, I am trying to study different robust and non-robust regression methods applied to multivariate data with and without outliers.
library(ggplot2)
library(GGally)
library(MASS)
library(robustbase)
## Just create data -- you can safely SKIP this function.
##
## Take in number of input variables (k), vector of ranges of k inputs
## ranges = c(min1, max1, min2, max2, ...) (must have 2k elements),
## parameters to create data (must be consistent with the number of
## input variables plus one), parameters are vector of linear
## coefficients (b) and random seed (seed), number of observations
## (n), vector of outliers (outliers)
##
## Return uncontaminated dataframe and contaminated dataframe
create_data <- function(k, ranges, b, seed = 6, n,
outliers = NULL) {
x <- NULL # x: matrix of input variables
for (i in 1:k) {
set.seed(seed^i)
## x <- cbind(x, runif(n, ranges[2*i-1], ranges[2*i]))
x <- cbind(x, rnorm(n, ranges[2*i-1], ranges[2*i]))
}
set.seed(seed - 2)
x_aug = cbind(rep(1, n), x)
y <- x_aug %*% b
y_mean = mean(y)
e <- rnorm(n, 0, 0.20 * y_mean) # rnorm x
y <- y + e
df <- data.frame(x = x, y = y)
len <- length(outliers)
n_rows <- len %/% (k+1)
if (!is.null(outliers)) {
outliers <- matrix(outliers, n_rows, k+1, byrow = TRUE)
df_contamin <- data.frame(x = rbind(x, outliers[,1:k]), y = c(y, outliers[,k+1]))
} else {
df_contamin <- df
}
dat <- list(df, df_contamin)
}
# plot different regression models (some are robust) for two types of
# data (one is contaminated with outliers)
plot_models <- function(data, mapping, data2) {
cb_palette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
## 1.grey, 2.light orange, 3.light blue, 4.green, 5.yellow, 6.blue, 7.red, 8.purple
plt <- ggplot(data = data, mapping = mapping) +
geom_point() +
theme_bw() +
geom_smooth(method = lm, formula = y ~ x, data = data2, color = cb_palette[3], se = FALSE) +
geom_smooth(method = lm, formula = y ~ x, color = cb_palette[7], se = FALSE) +
geom_smooth(method = rlm, formula = y ~ x, color = cb_palette[4], se = FALSE) +
geom_smooth(method = lmrob, formula = y ~ x, color = cb_palette[1], se = FALSE)
plt
}
# trim the upper and right panels of plots
trim_gg <- function(gg) {
n <- gg$nrow
gg$nrow <- gg$ncol <- n-1
v <- 1:n^2
gg$plots <- gg$plots[v > n & v%%n != 0]
gg$xAxisLabels <- gg$xAxisLabels[-n]
gg$yAxisLabels <- gg$yAxisLabels[-1]
gg
}
dat <- create_data(3, c(1, 10, 1, 10, 1, 10), c(5, 8, 6, 7), 6, 20, c(30, 30, 50, 400))
df <- dat[[1]]
df_contamin <- dat[[2]]
## Note that plot_models is called here
g <- ggpairs(df_contamin, columns = 1:4, lower = list(continuous = wrap(plot_models, data2 = df)), diag = list(continuous = "blankDiag"), upper = list(continuous = "blank")) #, legend = lgd)
gr <- trim_gg(g)
print(gr)
Created on 2019-10-09 by the reprex package (v0.3.0)
Sorry for the long code, but most probably only the plot_models function and the line where ggpairs is called need to be modified.
I want to get a legend in the blank upper half of the plots. It may be done by somehow tweaking the plot_models function, setting the mapping in ggpairs to color using ggplot2::aes_string, and using getPlot and putPlot of the GGally package. But I can't wrap my head around how to do it exactly.

How to get ggplot2 to draw multiple simulated trajectories in same plot?

I want to draw multiple simulated paths from any distribution (lognormal in the present case) on the same plot using ggplot2?
Using print(ggplot()) inside a for- loop does not show the paths all together.
library(ggplot2)
t <- 1000 # length of a simulation
time <- seq(0,t-1,by = 1) # make vector of time points
s <- cumsum(rlnorm(t, meanlog = 0, sdlog = 1)) # simulate trajectory of lognormal variable
df <- data.frame(cbind(time,s)) # make dataframe
colnames(df) <- c("t","s") # colnames
ggplot(df, aes(t,s )) + geom_line() # Get one trajectory
Now i want (say) 100 such paths in the same plot;
nsim <- 100 # number of paths
for (i in seq(1,nsim, by =1)) {
s <- cumsum(rlnorm(t, meanlog = 0, sdlog = 1))
df <- data.frame(cbind(time,s))
colnames(df) <- c("t","s")
print(ggplot(df, aes(t,s, color = i)) + geom_line())
}
The above loop obviously cannot do the job.
Any way to visualize such simulations using simple R with ggplot?
Instead of adding each line iteratively, you could iteratively simulate in a loop, collect all results in a data.frame, and plot all lines at once.
library(ggplot2)
nsim <- 100
npoints <- 1000
sims <- lapply(seq_len(nsim), function(i) {
data.frame(x = seq_len(npoints),
y = cumsum(rlnorm(npoints, meanlog = 0, sdlog = 1)),
iteration = i)
})
sims <- do.call(rbind, sims)
ggplot(sims, aes(x, y, colour = iteration, group = iteration)) +
geom_line()
Created on 2019-08-13 by the reprex package (v0.3.0)
In ggplot one method to achieve such methods is to add extra layers to the plot at each iteration. Doing so, a simple change of the latter code should be sufficient.
library(ggplot2)
nsim <- 100 # number of paths
dat <- vector("list", nsim)
p <- ggplot()
t <- 1000 # length of a simulation
time <- seq(0, t-1, by = 1)
for (i in seq(nsim)) {
s <- cumsum(rlnorm(t, meanlog = 0, sdlog = 1))
dat[[i]] <- data.frame(t = time, s = s)
p <- p + geom_line(data = dat[[i]], mapping = aes(x = t, y = s), col = i)
}
p #or print(p)
Note how I initiate the plot, similarly to how I initiate a list to contain the data frames prior to the loop. The loop then builds the plot step by step, but it is not visualized before i print the plot after the for loop. At which point every layer is evaluated (thus it can take a bit longer than standard R plots.)
Additionally as I want to specify the colour for each specific line, the col argument has to be moved outside the aes.

How to plot vector of bootstrapped slopes in ggplot2?

I've been using ggplot2 to plot the results of bootstrapping various statistical outputs such as correlation coefficients. Most recently, I bootstrapped the slope of a linear regression model. Here's how that looks using the plot() function from the graphics package:
plot(main="Relationship Between Eruption Length at Wait Time at \n
Old Faithful With Bootstrapped Regression Lines",
xlab = "Eruption Length (minutes)",
ylab = "Wait Time (minutes)",
waiting ~ eruptions,
data = faithful,
col = spot_color,
pch = 19)
index <- 1:nrow(faithful)
for (i in 1:10000) {
index_boot <- sample(index, replace = TRUE) #getting a boostrap sample (of indices)
faithful_boot <- faithful[index_boot, ]
# Fitting the linear model to the bootstrapped data:
fit.boot <- lm(waiting ~ eruptions, data = faithful_boot)
abline(fit.boot, lwd = 0.1, col = rgb(0, 0.1, 0.25, alpha = 0.05)) # Add line to plot
}
fit <- lm(waiting ~ eruptions, data=faithful)
abline(fit, lwd = 2.5, col = "blue")
That works, but it depends on a workflow where we first create a plot, then add the lines in a loop. I'd rather create a list of slopes with a function and then plot all of them in ggplot2.
For example, the function might look something like this:
set.seed(777) # included so the following output is reproducible
n_resample <- 10000 # set the number of times to resample the data
# First argument is the data; second is the number of resampled datasets
bootstrap <- function(df, n_resample) {
slope_resample <- matrix(NA, nrow = n_resample) # initialize vector
index <- 1:nrow(df) # create an index for supplied table
for (i in 1:n_resample) {
index_boot <- sample(index, replace = TRUE) # sample row numbers, with replacement
df_boot <- df[index_boot, ] # create a bootstrap sample from original data
a <- lm(waiting ~ eruptions, data=df_boot) # compute linear model
slope_resample[i] <- slope <- a$coefficients[2] # take the slope
}
return(slope_resample) # Return a vector of differences of proportion
}
bootstrapped_slopes <- bootstrap(faithful, 10000)
But how to get geom_line() or geom_smooth() to take the data from bootstrapped_slopes? Any assistance is much appreciated.
EDIT: More direct adaptation from the OP
For plotting, I presume you want both the slopes and the intercepts, so here's a modified bootstrap function:
bootstrap <- function(df, n_resample) {
# Note 2 dimensions here, for slope and intercept
slope_resample <- matrix(NA, 2, nrow = n_resample) # initialize vector
index <- 1:nrow(df) # create an index for supplied table
for (i in 1:n_resample) {
index_boot <- sample(index, replace = TRUE) # sample row numbers, with replacement
df_boot <- df[index_boot, ] # create a bootstrap sample from original data
a <- lm(waiting ~ eruptions, data=df_boot) # compute linear model
slope_resample[i, 1] <- slope <- a$coefficients[1] # take the slope
slope_resample[i, 2] <- intercept <- a$coefficients[2] # take the intercept
}
# Return a data frame with all the slopes and intercepts
return(as.data.frame(slope_resample))
}
Then run it and plot the lines from that data frame:
bootstrapped_slopes <- bootstrap(faithful, 10000)
library(dplyr); library(ggplot2)
ggplot(faithful, aes(eruptions, waiting)) +
geom_abline(data = bootstrapped_slopes %>%
sample_n(1000), # 10k lines look about the same as 1k, just darker and slower
aes(slope = V2, intercept = V1), #, group = id),
alpha = 0.01) +
geom_point(shape = 19, color = "red")
Alternative solution
This could also be done using modelr and broom to simplify some of the bootstrapping. Based on the main help example for modelr::bootstrap, we can do the following:
library(purrr); library(modelr); library(broom); library(dplyr)
set.seed(777)
# Creates bootstrap object with 10k extracts from faithful
boot <- modelr::bootstrap(faithful, 10000)
# Applies the linear regression to each
models <- map(boot$strap, ~ lm(waiting ~ eruptions, data = .))
# Extracts the model results into a tidy format
tidied <- map_df(models, broom::tidy, .id = "id")
# We just need the slope and intercept here
tidied_wide <- tidied %>% select(id, term, estimate) %>% spread(term, estimate)
ggplot(faithful, aes(eruptions, waiting)) +
geom_abline(data = tidied_wide %>%
sample_n(1000), # 10k lines look about the same as 1k, just darker and slower
aes(slope = eruptions, intercept = `(Intercept)`, group = id), alpha = 0.05) +
geom_point(shape = 19, color = "red") # spot_color wasn't provided in OP

How to assign colors to groups in survival graphs

I would like to have full control on the colors used to display group graphs using ggsurv.
You'll find below a toy example to reproduce what I've seen (mostly taken from there):
require(data.table)
# Function to create synthetic survival data
simulWeib <- function(N, lambda, rho, beta, rateC)
{
# covariate --> N Bernoulli trials
x <- sample(x=c(0, 1), size=N, replace=TRUE, prob=c(0.5, 0.5))
# Weibull latent event times
v <- runif(n=N)
Tlat <- (- log(v) / (lambda * exp(x * beta)))^(1 / rho)
# censoring times
C <- rexp(n=N, rate=rateC)
# follow-up times and event indicators
time <- pmin(Tlat, C)
status <- as.numeric(Tlat <= C)
# data set
data.frame(id=1:N,
time=time,
status=status,
x=x)
}
set.seed(1234)
nbGroups <- 7
dat <- list()
for(k in 1:nbGroups)
{
dat.onegp <- simulWeib(N=10, lambda=0.01, rho=1, beta=-0.6, rateC=0.001)
# fit <- coxph(Surv(time, status) ~ x, data=dat.onegp)
dat.onegp <- mutate(dat.onegp, Group = paste0("G",k))
dat[[k]] <- dat.onegp
}
dat.df <- rbindlist(dat)
dat.df.survCurv <- survfit( Surv(dat.df$time, dat.df$status) ~ dat.df$Group )
# Vector with colors to be used
cols = colorRampPalette(brewer.pal(9, "Set1"))(nbGroups)
ggsurv(dat.df.survCurv, size.est = 1 ) +
guides(linetype = FALSE) +
scale_colour_manual(name = "Exp. groups", breaks = sort(dat.df$Group), values = cols)
Running this twice will give two different sets of color-group assignment, and I don't want that. I need groups to always be displayed with the same color, for consistency with other graphs in a report.
NB: I have found out that the order in which colors are displayed is linked with the survival data, but I can't figure out how to force color assignment.
Any help appreciated!
Found it in this post !
The solution is to use limits instead of breaks
ggsurv(dat.df.survCurv, size.est = 1 ) +
guides(linetype = FALSE) +
scale_colour_manual(name = "Exp. groups", limits = sort(dat.df$Group), values = cols)

Bootstrap correlation CI with increasing sample size

I would like to demonstrate how the width of a 95% confidence interval around a correlation changes with increasing sample size, from n = 10 to n=100 in increments of 5 samples per round. I would assume we can use a bootstrap function to do this and replicate each round 1000 times. How can this be done in R?
See:
http://www.nicebread.de/at-what-sample-size-do-correlations-stabilize/
We can use the diamonds data:
data(diamonds)
x <- diamonds$price
y <- diamonds$carat
You can add the chart and axis titles yourself, but this code does what I think you're looking for using ggplot2 and the 'psychometric' package:
library(ggplot2)
library(psychometric)
corSamp <- function(x) {
# return the correlation between price and carat on diamonds for a given sample size
index <- sample(1:nrow(diamonds), x)
carat <- diamonds$carat[index]
price <- diamonds$price[index]
return(cor(carat, price))
}
cors <- sapply(seq(5,100,5), corSamp)
lower <- sapply(1:20, function(i) return(CIr(r = cors[i], n = seq(5,100,5)[i], level = 0.95)[1]))
upper <- sapply(1:20, function(i) return(CIr(r = cors[i], n = seq(5,100,5)[i], level = 0.95)[2]))
myData <- data.frame(cbind(cors, lower, upper, seq(5,100,5)))
myPlot <- ggplot(myData, aes(x = V4, y = cors)) + geom_line() + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.5)
Here V4 is the sample size.
You can loop through your sample sizes with sapply, and for each sample size draw 1000 random samples of the appropriate size, reporting the average width of the confidence interval:
set.seed(144)
ci.widths <- sapply(seq(10, 100, 5), function(x) mean(replicate(1000, {
r <- sample(nrow(diamonds), x, replace=TRUE)
diff(cor.test(diamonds$price[r], diamonds$carat[r])$conf.int)
})))
plot(seq(10, 100, 5), ci.widths, xlab="Sample size", ylab="CI width")

Resources