Apply different data to a function in R - r

I have the following data frame:
library(tidyverse)
set.seed(1234)
df <- data.frame(
x = seq(1, 100, 1),
y = rnorm(100)
)
Where I apply a smooth spline using different knots:
nknots <- seq(4, 15, 1)
output <- map(nknots, ~ smooth.spline(x = df$x, y = df$y, nknots = .x))
What I need to do now is to apply the same function using 2-point and 3-point averages:
df_2 <- df %>%
group_by(., x = round(.$x/2)*2) %>%
summarise_all(funs(mean))
df_3 <- df %>%
group_by(., x = round(.$x/3)*3) %>%
summarise_all(funs(mean))
In summary, I need to apply the function I used in output with the following data frames:
df
df_2
df_3
Of course, this is a minimal example, so I am looking for a efficient way of doing it. Preferably with the purrr package.

Using lapply, and the library zoo to calculate the moving average in a more simple and elegant manner:
library(zoo)
lapply(1:3,function(roll){
dftemp <- as.data.frame(rollmean(df,roll))
map(nknots, ~ smooth.spline(x = dftemp$x, y = dftemp$y, nknots = .x))
})

Here's one possible solution:
library(tidyverse)
set.seed(1234)
df <- data.frame(x = seq(1, 100, 1),
y = rnorm(100))
# funtion to get v-point averages
GetAverages = function(v) {
df %>%
group_by(., x = round(.$x/v)*v) %>%
summarise_all(funs(mean)) }
# specify nunber of knots
nknots <- seq(4, 15, 1)
dt_res = tibble(v=1:3) %>% # specify v-point averages
mutate(d = map(v, GetAverages)) %>% # get data for each v-point
crossing(., data.frame(nknots=nknots)) %>% # combine each dataset with a knot
mutate(res = map2(d, nknots, ~smooth.spline(x = .x$x, y = .x$y, nknots = .y))) # apply smooth spline
You can use dt_res$res[dt_res$v == 1] to see all results for your original daatset, dt_res$res[dt_res$v == 2] to see results for your 2-point estimate, etc.

Related

Using map to fit model to multiple datasets in a list

I am trying to sample from a dataset, select columns, and fit a model to each sample.
library(bartMachine)
library(tidyverse)
#dataset
dat <- data.frame( x=rnorm(20, mean=5, sd=1), z=rnorm(20, mean=2, sd=0.5), y=rnorm(20, mean=6, sd=1), weight = rnorm(20, mean=1, sd=0.2) )
#variable names
var1 <- dat %>% select(x, z) %>% names()
var2 <- dat %>% select(y) %>% names()
out <- tibble(run = 1:20) %>%
mutate(
data = map(run, ~ sample_n(dat, size = 5, weight = weight, replace = TRUE)),
mdl = map(data[[run]], ~bartMachine(X=as.data.frame(.x) %>% select(all_of(var1)),
y=as.data.frame(.x) %>% select(all_of(var2)) %>% unlist(), seed = 1)),
smry = map(mdl, ~ summary(.x))
)
Credit to #r2evans of helping with the above code
I get this error:
Error in `mutate()`:
! Problem while computing `mdl = map(...)`.
Caused by error in `data[[run]]`:
! recursive indexing failed at level 3
Run `rlang::last_error()` to see where the error occurred.
Since out$data[[1]] gives me the first dataset in the list, why is data[[run]] not working?

R, Simulation, p-value, histogram

library(tidyverse)
library(broom)
library(dplyr)
# create a tibble with an id column for each simulation and x wrapped in list()
sim <- tibble(id = 1:1000,
x = list(rbinom(1000,1,0.5))) %>%
# to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
# `~ ... ` is similar to `function(.x) {...}`
# `.x` represents the variable you are using map on
mutate(z = map(x, ~ log(1.3) * .x),
pr = map(z, ~ 1 / (1 + exp(-.x))),
y = map(pr, ~ rbinom(1000, 1, .x)),
k = map2(x, y, ~ glm(.y ~ .x, family="binomial")),
# use broom::tidy to get the model summary in form of a tibble
sum = map(k, broom::tidy)) %>%
# select id and sum and unnest the tibbles
select(id, sum) %>%
unnest(cols = c(sum)) %>%
# drop the intercepts and every .x with a p < 0.05
filter(term !="(Intercept)",
p.value < 0.05)
sim
j=exp(sim %>% select("estimate"))
OR=as.numeric(unlist(j))
mean(OR)
hist(OR,main=NULL,freq=T,breaks=10)
abline(v=mean(OR),lwd=4,col=1)
The question here: now I extract all the value which p<0.05, now I using the code "hist(OR,main=NULL,freq=T,breaks=10)" to make a histogram for the odds ratio. The new thing I want to do is make another histogram(like without any condition for p-value) overlapping the original one, then I could compare the histogram with the different p-value in one plot, which code can work with that?
This solution repeats the question's code but
stop the pipe right after unnest(cols = c(sum));
create a simOR like you have continued the pipe and a simAll but this time not filtering the p-values.
First the question's code. Note that if package tidyverse is loaded there is no need to load package dplyr.
I also set the RNG seed to make the results reproducible.
library(tidyverse)
library(broom)
# create a tibble with an id column for each simulation and x wrapped in list()
set.seed(2020)
sim <- tibble(id = 1:1000,
x = list(rbinom(1000,1,0.5))) %>%
# to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
# `~ ... ` is similar to `function(.x) {...}`
# `.x` represents the variable you are using map on
mutate(z = map(x, ~ log(1.3) * .x),
pr = map(z, ~ 1 / (1 + exp(-.x))),
y = map(pr, ~ rbinom(1000, 1, .x)),
k = map2(x, y, ~ glm(.y ~ .x, family="binomial")),
# use broom::tidy to get the model summary in form of a tibble
sum = map(k, broom::tidy)) %>%
# select id and sum and unnest the tibbles
select(id, sum) %>%
unnest(cols = c(sum))
Now create the two data sets to be plotted.
simOR <- sim %>%
# drop the intercepts and every .x with a p < 0.05
filter(term !="(Intercept)", p.value < 0.05)
j <- exp(simOR %>% select("estimate"))
OR <- as.numeric(unlist(j))
mean(OR)
And the data set with all the rows, dropping only the intercepts.
simAll <- sim %>%
filter(term !="(Intercept)")
j <- exp(simAll %>% select("estimate"))
All <- as.numeric(unlist(j))
mean(All)
Now plot the histograms (not overlapped).
op <- par(mfrow = c(2, 1))
hist(OR, main = NULL, freq = TRUE, breaks = 10)
abline(v = mean(OR), lwd = 4, col = 1)
hist(All, main = NULL, freq = TRUE, breaks = 10)
abline(v = mean(All), lwd = 4, col = 1)
par(op)

Rolling window slider::slide() with grouped data

In the following example I try to compute the first coefficient from a linear model for time t = 1 until t. It's an expanding rolling window.
It works well with ungrouped data, but when grouped by case, I get the error Error: Columncoef1must be length 10 (the group size) or one, not 30.
How can I handle grouped data?
library(dplyr)
library(slider)
get_coef1 <- function(data) {
coef1 <- lm(data = data, r1 ~ r2 + r3) %>%
coef() %>%
.["r2"] %>%
unname()
return(coef1)
}
data <- tibble(t = rep(1:10, 3),
case = c(rep("a", 10), rep("b", 10), rep("c", 10)),
r1 = rnorm(30),
r2 = rnorm(30),
r3 = rnorm(30))
data %>%
# ungroup() %>%
group_by(case) %>%
mutate(coef1 = slider::slide_dbl(., ~get_coef1(.x),
.before = Inf, .complete = T))
You have to first tidyr::nest the cases. Within the nested tibbles (accessed via purrr::map) you can then apply slide (same technique as with purrr::map). The important point is that you do not want to slide across cases, but only within cases.
library(dplyr)
library(tidyr)
library(purrr)
library(slider)
get_coef1 <- function(data) {
coef1 <- lm(data = data, r1 ~ r2 + r3) %>%
coef() %>%
.["r2"] %>%
unname()
return(coef1)
}
data <- tibble(t = rep(1:10, 3),
case = c(rep("a", 10), rep("b", 10), rep("c", 10)),
r1 = rnorm(30),
r2 = rnorm(30),
r3 = rnorm(30))
data %>%
# ungroup() %>%
group_by(case) %>% nest() %>%
mutate(rollreg = map(data, ~ .x %>% mutate(coef1 = slider::slide_dbl(., ~get_coef1(.x), .before = Inf, .complete = TRUE)))) %>%
select(-data) %>% unnest(rollreg)
I have been trying for a while to use the new dplyr::nest_by() from dplyr 1.0.0 trying to use summarise in combination with the rowwise cases but couldn't get that to work.
I realize this is an old post, but for the sake of completeness, I offer another solution. Is this what you're looking for? Two subtle changes to the arguments to slide_dbl. The code runs.
data %>%
# ungroup() %>%
group_by(case) %>%
mutate(coef1 = slider::slide_dbl(.x = cur_data(), # use cur_data() instead of .; arg .x
.f = ~get_coef1(.x), # arg .f
.before = Inf, .complete = T))
See the slider() documentation for underlying reasons.

How to get formulas of multiple regressions by vectorizing

Suppose I have the following code that makes multiple regressions and stores the lm and lm with stepwise selection models in tibbles:
library(dplyr)
library(tibble)
library(MASS)
set.seed(1)
df <- data.frame(A = sample(3, 10, replace = T),
B = sample(100, 10, replace = T),
C = sample(100, 10, replace = T))
df <- df %>% arrange(A)
formula_df <- as.tibble(NA)
aic_df <- as.tibble(NA)
for (i in unique(df$A)){
temp <- df %>% filter(A == i)
formula_df[i, 1] <- temp %>%
do(model = lm(B ~ C, data = .))
aic_df[i, 1] <- temp %>%
do(model = stepAIC(formula_df[[1,1]], direction = "both", trace = F))
}
Is it possible to vectorize to make it faster, for example using the *pply functions? The loop becomes extremely slow when the data gets larger. Thank you in advance.
You could try something like:
model <- df %>% group_by(A) %>%
summarise(formula_model = list(lm(B ~ C))) %>%
mutate(aic_model = list(stepAIC(.[[1,2]], direction = "both", trace = F)))

rowwise filtering in dplyr

I wanted to use dplyr instead of apply,1 in order to filter a dataset rowwise according to a logical expression, ie for this example I´d like to remove all rows that have one or more values of 99.
However, I was surprised by the poor performance in dplyr. Any ideas if I can speed this up in dplyr? Also, I would have thought that the rowwise function would pipe the individual rows, but apparently not (see below). How can I use the rowwise function?
library(tidyverse)
s <- tibble(rows = seq(from = 250, to = 5000, by = 250)) #my original dataset has 400K rows...
s$num <- map(s$rows, ~ rnorm(.x * 6))
s$num <-
map(s$num, ~ replace(.x, sample(1:length(.x), size = length(.x) / 20), 99))
s$mat <- map(s$num, ~ as_data_frame(matrix(.x, ncol = 6)))
help_an <- function(vec) {
browser()
return(!any(vec == 99))
}
help_dp_t <- function(df) {
clo1 <- proc.time()
a <- as_data_frame(t(df)) %>% summarise_all(help_an)
df2 <- filter(df, t(a)[, 1])
b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
return(b)
}
s$dplyr <- map(s$mat, ~ dplyr::mutate(help_dp_t(.x)))
help_lap <- function(df) {
clo1 <- proc.time()
a_base <- df[apply(df, 1, function(x)
! any(x == 99)), ]
b <- tibble(time = (proc.time() - clo1)[3], df = list(a_base))
return(b)
}
s$lapply <- map(s$mat, ~ mutate(help_lap(.x)))
s$equal_dplyr_lapply <-
map2_lgl(s$dplyr, s$lapply, ~ all.equal(.x$df, .y$df))
s$dplyr_time <- map_dbl(s$dplyr, "time")
s$lapply_time <- map_dbl(s$lapply, "time")
ggplot(gather(s, ... = c(7, 8)), aes(x = rows, y = value, color = key)) +
geom_line()
I tried the following with rowwise, but the rowwise pipe does not send a vector, but the entire df to the help_an function.
help_dp_r <- function(df) {
clo1 <- proc.time()
df2 <-
df %>% rowwise() %>% mutate(cond = help_an(.)) ### . is not passed on as a vector, but the entire df??
b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
}
s$dplyr_r <- map(s$mat, ~ dplyr::mutate(help_dp_r(.x)))

Resources