R, Simulation, p-value, histogram - r

library(tidyverse)
library(broom)
library(dplyr)
# create a tibble with an id column for each simulation and x wrapped in list()
sim <- tibble(id = 1:1000,
x = list(rbinom(1000,1,0.5))) %>%
# to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
# `~ ... ` is similar to `function(.x) {...}`
# `.x` represents the variable you are using map on
mutate(z = map(x, ~ log(1.3) * .x),
pr = map(z, ~ 1 / (1 + exp(-.x))),
y = map(pr, ~ rbinom(1000, 1, .x)),
k = map2(x, y, ~ glm(.y ~ .x, family="binomial")),
# use broom::tidy to get the model summary in form of a tibble
sum = map(k, broom::tidy)) %>%
# select id and sum and unnest the tibbles
select(id, sum) %>%
unnest(cols = c(sum)) %>%
# drop the intercepts and every .x with a p < 0.05
filter(term !="(Intercept)",
p.value < 0.05)
sim
j=exp(sim %>% select("estimate"))
OR=as.numeric(unlist(j))
mean(OR)
hist(OR,main=NULL,freq=T,breaks=10)
abline(v=mean(OR),lwd=4,col=1)
The question here: now I extract all the value which p<0.05, now I using the code "hist(OR,main=NULL,freq=T,breaks=10)" to make a histogram for the odds ratio. The new thing I want to do is make another histogram(like without any condition for p-value) overlapping the original one, then I could compare the histogram with the different p-value in one plot, which code can work with that?

This solution repeats the question's code but
stop the pipe right after unnest(cols = c(sum));
create a simOR like you have continued the pipe and a simAll but this time not filtering the p-values.
First the question's code. Note that if package tidyverse is loaded there is no need to load package dplyr.
I also set the RNG seed to make the results reproducible.
library(tidyverse)
library(broom)
# create a tibble with an id column for each simulation and x wrapped in list()
set.seed(2020)
sim <- tibble(id = 1:1000,
x = list(rbinom(1000,1,0.5))) %>%
# to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
# `~ ... ` is similar to `function(.x) {...}`
# `.x` represents the variable you are using map on
mutate(z = map(x, ~ log(1.3) * .x),
pr = map(z, ~ 1 / (1 + exp(-.x))),
y = map(pr, ~ rbinom(1000, 1, .x)),
k = map2(x, y, ~ glm(.y ~ .x, family="binomial")),
# use broom::tidy to get the model summary in form of a tibble
sum = map(k, broom::tidy)) %>%
# select id and sum and unnest the tibbles
select(id, sum) %>%
unnest(cols = c(sum))
Now create the two data sets to be plotted.
simOR <- sim %>%
# drop the intercepts and every .x with a p < 0.05
filter(term !="(Intercept)", p.value < 0.05)
j <- exp(simOR %>% select("estimate"))
OR <- as.numeric(unlist(j))
mean(OR)
And the data set with all the rows, dropping only the intercepts.
simAll <- sim %>%
filter(term !="(Intercept)")
j <- exp(simAll %>% select("estimate"))
All <- as.numeric(unlist(j))
mean(All)
Now plot the histograms (not overlapped).
op <- par(mfrow = c(2, 1))
hist(OR, main = NULL, freq = TRUE, breaks = 10)
abline(v = mean(OR), lwd = 4, col = 1)
hist(All, main = NULL, freq = TRUE, breaks = 10)
abline(v = mean(All), lwd = 4, col = 1)
par(op)

Related

Multiple linear regression by group in a rolling window in R

My dataframe looks like this:
Date = c(rep(as.Date(seq(15000,15012)),2))
Group = c(rep("a",13),rep("b",13))
y = c(seq(1,26,1))
x1 = c(seq(0.01,0.26,0.01))
x2 = c(seq(0.02,0.26*2,0.02))
df = data.frame(Group,Date,y,x1,x2)
head(df,3)
Group
Date
y
x1
x2
a
2011-01-26
1
0.01
0.02
a
2011-01-27
2
0.02
0.04
a
2011-01-28
3
0.03
0.06
And I would like to do multiple regression by group (y as the dependent variable and x1, x2 as the independent variables) in a rolling window i.e. 3.
I have tried to achieve this using packages tidyverse and zoo with following codes but failed.
## define multi-var-linear regression function and get the residual
rsd <- function(df){
lm(formula = y~x1+x2, data = as.data.frame(df), na.action = na.omit) %>%
resid() %>%
return()
}
## apply it by group with rolling window
x <- df %>% group_by(Group) %>%
rollapplyr(. , width = 3, FUN = rsd)
The output of this code is not what I acutually want.
Does anyone know how to do multiple regression by group in a rolling window?
Thanks in advance, Giselle
Thank Grothendieck and Marcus for your codes!
It really helped me a lot:)
I now appened them here:
# Grothendieck method
rsd <- function(df){
lm(formula = y~x1+x2, data = as.data.frame(df), na.action = na.omit) %>%
resid() %>%
return()
}
width <- 5
df_m2 <-
df %>%
group_by(Group) %>%
group_modify(~ {
cbind(., rollapplyr(.[c("y", "x1", "x2")], width, rsd, fill = NA,
by.column = FALSE))
}) %>%
ungroup %>%
select(c("Group","Date","5")) %>%
dplyr::rename(residual_m2 = "5")
# Marcus method
output <- data.frame()
for (i in unique(df$Group)) {
a = df%>% subset(Group==i)
a[,"residual"] = NA
max = nrow(a)
if(max<5){
next
}
for (j in seq(5,max,by=1)) {
b = a %>% slice((j-4):j)
lm_ = lm(y~x1+x2, data = b)
a[j,]$residual = residuals(lm_)[5]
}
output <-
output %>%
rbind(a)
}
Use group_modify and use rollapplyr with the by.column = FALSE argument so that rsd is applied to all columns at once rather than one at a time.
Note that if you use width 3 with two predictors and an intercept the residuals will necessarily be all zero so we changed the width to 5.
library(dplyr, exclude = c("lag", "filter"))
library(zoo)
width <- 5
df %>%
group_by(Group) %>%
group_modify(~ {
cbind(., rollapplyr(.[c("y", "x1", "x2")], width, rsd, fill = NA,
by.column = FALSE))
}) %>%
ungroup
A good old-fashioned for-loop here could be:
for (i in unique(df$Group)){
for (j in (seq(15000,15012, 3))){
lm_ <- lm(formula = df[df$Group== i & df$Date %in% c(j, j+1, j+2), 3] ~ df[df$Group== i & df$Date %in% c(j, j+1, j+2), 4] + df[df$Group== i & df$Date %in% c(j, j+1, j+2), 5], na.action = na.omit)
print(paste('Group', i, 'Dates from', j, 'to', j+3, residuals(lm_)))
}
}

Using map() in a split-apply-combine to run multiple regressions with regression weights

Say I have some data that looks like this:
N <- 200
X <- sample(letters[1:5],N, replace = T)
Y <- rnorm(N)
W <- abs(rnorm(N))
my_data <- tibble(X, Y, W)
I want to run an intercept-only regression on each subset of my data defined by X. To do so, I use nest(), mutate() and map() like so:
my_data %>%
group_by(X) %>%
nest() %>%
mutate(fit = map(data, ~lm(Y ~ 1, data = .x)))
While this code works, when I try to incorporate regression weights, like this:
my_data %>%
group_by(X) %>%
nest() %>%
mutate(fit = map(data, ~lm(Y ~ 1, data = .x, weights = W)))
I get the following error:
Error: Problem with `mutate()` input `fit`.
x missing or negative weights not allowed
ℹ Input `fit` is `map(data, ~lm(Y ~ 1, data = .x, weights = W))`.
ℹ The error occurred in group 1: X = "a".
Run `rlang::last_error()` to see where the error occurred.
Where am I going wrong?
(Disclaimer: I am new to the tidyverse so am likely doing something dumb)
Write it this way:
my_data %>%
group_by(X) %>%
nest() %>%
mutate(fit = map(data, ~lm(Y ~ 1, data = .x, weights = .x$W)))
The point is that at that point W is not an existing column anymore.
It exists only inside data. With map you are looping over data which at that point is a list of dataframes.
Therefore to call W, you need to call it through $
Also, define W as:
W <- abs(rnorm(N))
Because you can't have negative weights.
Alternatively, you can do it as follow:
my_data %>%
group_by(X) %>%
summarise(fit = list(lm(Y ~ 1, weights = W)))
It will give you the same result. [almost: because X will be sorted]
Check this out:
fit1 <- my_data %>%
arrange(X) %>%
group_by(X) %>%
nest() %>%
mutate(fit = map(data, ~lm(Y ~ 1, data = .x, weights = .x$W))) %>%
pull(fit)
fit2 <- my_data %>%
group_by(X) %>%
summarise(fit = list(lm(Y ~ 1, weights = W))) %>%
pull(fit)
identical(map(fit1, coef), map(fit2, coef))
#> TRUE
If you just need the coefficients, you can do it this way:
my_data %>%
group_by(X) %>%
summarise(fit = coef(lm(Y ~ 1, weights = W)))

Summary statistics for simple linear regressions stored in a list using lapply

I have a series of simple linear regressions of the form y ~ x1, y ~x2, y~ x3 etc.
I have been able to run all my linear regressions and have stored the output, but I am having trouble to access the summary statistics in a vectorized way e.g. adjusted R squared for each model.
I could do it via a for loop and iterate through each model, but I think there must be easier way to perhaps use lapply (or sapply?) and get the result quicker.
A reproducible example is below:
library(tidyverse)
library(broom)
set.seed(6)
DF <- data.frame(Y=rnorm(50, 100, 3),
X1=rnorm(50, 100, 3),
X2=rnorm(50, 100, 3),
X3=rnorm(50, 100, 3),
X4=rnorm(50, 100, 3))
DF_longer = pivot_longer(DF, -Y, names_to = "variable", values_to = "value", values_ptypes = list(val = 'numeric'))
lm1 = DF_longer %>% group_by(variable) %>% do(tidy(lm(Y ~ value, data=.)))
lm2 = DF_longer %>% group_by(variable) %>% do(mod = lm(Y ~ value, data=.))
The part that I would like to optimise is the following where I would like to store the adjusted R squared for each model in a vector without a for loop.
lm2_data = summary(lm2$mod[[1]])
lm2_data$adj.r.squared
lm2_data = summary(lm2$mod[[2]])
lm2_data$adj.r.squared
lm2_data = summary(lm2$mod[[3]])
lm2_data$adj.r.squared
lm2_data = summary(lm2$mod[[4]])
lm2_data$adj.r.squared
Here's an example using the mtcars data:
regModels <- c("mpg ~ am", "mpg ~ am + wt", "mpg ~ wt + am + disp")
results <- lapply(regModels,function(x){
y <- summary(lm(x,data = mtcars))$adj.r.squared
})
names(results) <- regModels
results
...and the output:
> results
$`mpg ~ am`
[1] 0.3384589
$`mpg ~ am + wt`
[1] 0.7357889
$`mpg ~ wt + am + disp`
[1] 0.757583
>
Using data from the original post...
library(tidyverse)
library(broom)
set.seed(6)
DF <- data.frame(Y=rnorm(50, 100, 3),
X1=rnorm(50, 100, 3),
X2=rnorm(50, 100, 3),
X3=rnorm(50, 100, 3),
X4=rnorm(50, 100, 3))
DF_longer = pivot_longer(DF, -Y, names_to = "variable", values_to = "value", values_ptypes = list(val = 'numeric'))
lm1 = DF_longer %>% group_by(variable) %>% do(tidy(lm(Y ~ value, data=.)))
lm2 = DF_longer %>% group_by(variable) %>% do(mod = lm(Y ~ value, data=.))
adjRsquared <- lapply(lm2$mod,function(x){
y <- summary(x)$adj.r.squared
})
names(adjRsquared) <- lm2$variable
adjRsquared
...and the output:
> adjRsquared
$X1
[1] -0.007637371
$X2
[1] 0.007729944
$X3
[1] 0.04993542
$X4
[1] -0.02026235
The sapply() function executes a function on each element of a list and gives you a vector:
sapply(lm2$mod, function(x) summary(x)$adj.r.squared)
In base R you may use reformulate to get a formula vector fov, over which you do the regressions and simultaneously extract the adj. R2 in an sapply.
fov <- lapply(names(DF)[2:5], reformulate, "Y")
res <- sapply(fov, function(x) summary(lm(x, data=DF))$adj)
res
# [1] -0.007637371 0.007729944 0.049935424 -0.020262353
In a single step do:
sapply(lapply(names(DF)[2:5], reformulate, "Y"), function(x) summary(lm(x, data=DF))$adj)
# [1] -0.007637371 0.007729944 0.049935424 -0.020262353
Note: You stated you want a vector, if you want a list just replace the sapply with lapply.

Apply different data to a function in R

I have the following data frame:
library(tidyverse)
set.seed(1234)
df <- data.frame(
x = seq(1, 100, 1),
y = rnorm(100)
)
Where I apply a smooth spline using different knots:
nknots <- seq(4, 15, 1)
output <- map(nknots, ~ smooth.spline(x = df$x, y = df$y, nknots = .x))
What I need to do now is to apply the same function using 2-point and 3-point averages:
df_2 <- df %>%
group_by(., x = round(.$x/2)*2) %>%
summarise_all(funs(mean))
df_3 <- df %>%
group_by(., x = round(.$x/3)*3) %>%
summarise_all(funs(mean))
In summary, I need to apply the function I used in output with the following data frames:
df
df_2
df_3
Of course, this is a minimal example, so I am looking for a efficient way of doing it. Preferably with the purrr package.
Using lapply, and the library zoo to calculate the moving average in a more simple and elegant manner:
library(zoo)
lapply(1:3,function(roll){
dftemp <- as.data.frame(rollmean(df,roll))
map(nknots, ~ smooth.spline(x = dftemp$x, y = dftemp$y, nknots = .x))
})
Here's one possible solution:
library(tidyverse)
set.seed(1234)
df <- data.frame(x = seq(1, 100, 1),
y = rnorm(100))
# funtion to get v-point averages
GetAverages = function(v) {
df %>%
group_by(., x = round(.$x/v)*v) %>%
summarise_all(funs(mean)) }
# specify nunber of knots
nknots <- seq(4, 15, 1)
dt_res = tibble(v=1:3) %>% # specify v-point averages
mutate(d = map(v, GetAverages)) %>% # get data for each v-point
crossing(., data.frame(nknots=nknots)) %>% # combine each dataset with a knot
mutate(res = map2(d, nknots, ~smooth.spline(x = .x$x, y = .x$y, nknots = .y))) # apply smooth spline
You can use dt_res$res[dt_res$v == 1] to see all results for your original daatset, dt_res$res[dt_res$v == 2] to see results for your 2-point estimate, etc.

Fastest way to get residuals from linear, poisson, and negative binomial regressions across many responses

I have a numeric, a count, and an over-dispersed count large matrices:
set.seed(1)
numeric.mat <- matrix(rnorm(10000*6000),10000,6000)
count.mat <- matrix(rpois(10000*6000,10),10000,6000)
dispersed.count.mat <- matrix(rnegbin(10000*6000,10,2),10000,6000)
And one corresponding factors data.frame (can be a matrix too):
factors.df <- data.frame(f1 = sample(LETTERS[1:3], 10000, replace = T),
f2 = sample(LETTERS[4:5], 10000, replace = T))
The number of factors is pretty small (in this case only 2 but won't be more than 5 for real data), and the number of levels in each (they're all categorical) is also small (also up to 5).
I'd like to obtain the residuals for fitting a linear, poisson, and negative binomial regression models to each of the columns in each of the matrices, respectively.
So for a single column:
data.df <- factors.df %>%
dplyr::mutate(numeric.y = numeric.mat[,1],
count.y = count.mat[,1],
dispersed.count.y = dispersed.count.mat[,1])
I'd use:
lm(numeric.y ~ f1+f2, data = data.df)$residuals
residuals(object = glm(count.y ~ f1+f2, data = data.df, family = "poisson"), type = 'pearson')
residuals(object = glm.nb(formula = model.formula, data = regression.df), type = 'pearson')
For the three regression models.
Is there a faster way of obtaining these residuals other than, for example, using do.call, for each. E.g.:
do.call(cbind,
lapply(1:ncol(numeric.mat),
function(i)
lm(numeric.y ~ f1+f2,
data = dplyr::mutate(factors.df,
numeric.y = numeric.mat[,i])
)$residuals
))
I'd slightly readjust how the workflow runs and allow it to be easily run in parallel.
# Use variables to adjust models, makes it easier to change sizes
iter <- 60
iter_samps <- 1000
factors_df <- data.frame(f1 = sample(LETTERS[1:3], iter_samps, replace = T),
f2 = sample(LETTERS[4:5], iter_samps, replace = T))
# using a data.frame in a longer format to hold the data, allows easier splitting
data_df <- rep(list(factors_df), iter) %>%
bind_rows(.id = "id") %>%
mutate(numeric_y = rnorm(iter_samps * iter),
count_y = rpois(iter_samps * iter, 10),
dispersed_count_y = MASS::rnegbin(iter_samps * iter, 10, 2))
# creating function that determines residuals
model_residuals <- function(data) {
data$lm_resid <- lm(numeric_y ~ f1+f2, data = data)$residuals
data$glm_resid <- residuals(object = glm(count_y ~ f1+f2, data = data, family = "poisson"), type = 'pearson')
return(data)
}
# How to run the models not in parallel
data_df %>%
split(.$id) %>%
map(model_residuals) %>%
bind_rows()
To run the models in parallel you can use multidplyr to do all the annoying work
library("multidplyr")
test = data_df %>%
partition(id) %>%
cluster_library("tidyverse") %>%
cluster_library("MASS") %>%
cluster_assign_value("model_residuals", model_residuals) %>%
do(results = model_residuals(.)) %>%
collect() %>%
.$results %>%
bind_rows()

Resources