Using group_by in function - r

I have a function written to calculate the confidence interval of a ratio of averages between two vectors using jackknife standard errors
jackknife_CI = function(x, y, alpha = .05) {
xl = (sum(x,na.rm=T) - x) / (length(x) - 1)
yl = (sum(y,na.rm=T) - y) / (length(y) - 1)
n = length(x) + length(y)
jack_se = (sd(c(xl / mean(y,na.rm=T), mean(x,na.rm=T) / yl),na.rm=T) * (n - 1)) / sqrt(n)
mean(x, na.rm = T) / mean(y, na.rm = T) + jack_se * qnorm(c(alpha/2,1-alpha/2))
}
I want to then use it with the ToothGrowth dataset in the following way:
df1 =
ToothGrowth %>%
filter(supp == "OJ") %>%
rename(len_x = len) %>%
select(dose,len_x)
df2 =
ToothGrowth %>%
filter(supp == "VC") %>%
rename(len_y = len) %>%
select(dose, len_y)
df = cbind(df1,df2)
df = df[,-3]
jack_CI = df %>% group_by(dose) %>% jackknife_CI(x = len_x, y = len_y)
My problem is that the last line results in the error:
Error in jackknife_CI(., x = len_x, y = len_y) : object 'len_x' not found
How do I get around this?

The last line need to be:
jack_CI = jackknife_CI(x = df$len_x, y = df$len_y)
The way you are running it is being interpreted as follows:
jack_CI = jackknife_CI(group_by(df, dose), x = len_x, y = len_y)
Which is causing a couple issues:
jackknife_CI is not expecting the first argument to be the dataframe. (because of pipe operator)
len_x and len_y are not recognized outside of the dataframe.
If you want to run the function on each group you can do:
df %>% group_by(dose) %>%
do({
ci <- jackknife_CI(.$len_x, .$len_y)
tibble(low = ci[1], hi = ci[2])
})
I use do because the function returns two values. Otherwise you would be able to just use summarize. Each group is being passed to do which is then returning a tibble (note the last line in do) which are then being stacked to return the result. I am referring to each group inside of do with .$variable_name where the dot references the value being passed (in this case the dataframe for each group)

Related

argument "" is missing, with no default when using map_dfc in R

I am trying to modify Monte Carlo code from Reproduceable Finance with R to handle withdrawals and inflation of these withdrawals.
to do so I have changed a simulation accumulate function to subtract a withdrawl amount that has been increased do to inflation prior to calling the function.
simulation_accum_withdrawls <- function(init_value, N, mean, stdev, wAmt) {
tibble(c(init_value, 1 + rnorm(N, mean, stdev),wAmt)) %>%
`colnames<-`("returns") %>%
mutate(growth =
accumulate(returns, function(x, y, wAmt) x * y - wAmt)) %>%
select(growth)
}
the function is run from the following code
monte_carlo_sim <-
map_dfc(starts, simulation_accum_withdrawls,
N = sim_duration_months,
mean = geo_mean_port_return,
stdev = geo_stddev_port_return,
wAmt = inflated_withdrawls)
which throws off the following error.
Error in map():
ℹ In index: 1.
Caused by error in mutate():
! Problem while computing growth = accumulate(returns, function(x, y, wAmt) x * y - wAmt).
Caused by error in fn():
! argument "wAmt" is missing, with no default
Backtrace:
purrr::map_dfc(...)
purrr::accumulate(returns, function(x, y, wAmt) x * y - wAmt)
purrr:::reduce_impl(...)
fn(out, elt, ...)
Error in map(.x, .f, ...) :
The inflated_withdrawls data frame does exist and is not empty i.e. see below
str(inflated_withdrawls)
'data.frame': 38 obs. of 1 variable:
$ wAmt: num 1805391 0 0 0 0 ...
Any advice or insight on how to debug this would be appreciated.
Thanks
Here is an executable example per the request.
library(tidyverse)
library(highcharter)
library(tidyquant)
library(timetk)
library(broom)
library(highcharter)
library(purrr)
library(knitr)
library(readxl)
accum_inflation <- function(init_value, inflation=0.0) {
tibble(c(init_value, 1 + inflation)) %>%
`colnames<-`("acc_inf") %>%
mutate(acc_inf =
accumulate(acc_inf, function(x, y) x * y ))
}
accum_wd <- function(wAmt, acc_inf=0.0) {
growth = wAmt * acc_inf
}
simulation_accum_1 <- function(init_value, N, mean, stdev,inf_wd) {
tibble(c(init_value, 1 + rnorm(N, mean, stdev), inf_wd)) %>%
`colnames<-`("returns") %>%
mutate(growth =
accumulate(returns, function(x, y) x * y - inf_wd)) %>%
select(growth)
}
simulation_accum_withdrawls <- function(init_value, N, mean, stdev, wAmt=0) {
tibble(c(init_value, 1 + rnorm(N, mean, stdev),wAmt)) %>%
`colnames<-`("returns") %>%
mutate(growth =
accumulate(returns, function(x, y, wAmt) x * y + wAmt)) %>%
select(growth)
}
test_df <- data.frame(wAmt =c(100000,-180000,-180000,-180000),
date=seq.Date(as.Date("2024-01-01"), by = "year", length.out = 4)
)
test_df <- test_df %>% pad_by_time(date, .by = "month", .pad_value = 0)
inf_df <- rnorm(nrow(test_df),0.03/12,0.042/12)
inflated_returns <- accum_inflation(1, inf_df)
edatep1 <- ymd(edate)+months(1)
test_df<- rows_append(test_df,tibble(wAmt = 0, date = edatep1))
inflated_withdrawls <- accum_wd(test_df[1], inflated_returns)
nav=1000000
sim_duration_months = 40
total_sims = 10
geo_mean_port_return = 0.00443402454379282
geo_stddev_port_return = 0.0237813751473552
sims <- total_sims
starts <-
rep(nav, sims) %>%
set_names(paste("sim", 1:sims, sep = ""))
monte_carlo_sim <-
map_dfc(starts, simulation_accum_withdrawls,
N = sim_duration_months,
mean = geo_mean_port_return,
stdev = geo_stddev_port_return,
wAmt = inflated_withdrawls)

How do I subset a column of the current group in dplyr?

I am having difficulty parameterizing some code I wrote. It works fine when not in a function like this:
new_df = group_by(groupby1, groupby2) %>%
mutate(new_value=
slider_helper(
slide(cur_data()[, c('string1', 'string2'], ~.x, .before = Inf, .after = -1),
cur_data()$string2,
'string1',
beta
But when I try to create a function where you can pass strings for the variables to group by and the variables to slide over like so:
my_fun <- function(df, groupby1, groupby2, string1, string2, beta) {
return(df %>%
group_by({{groupby1}}, {{groupby2}}) %>%
mutate(new_value=
slider_helper(
slide(cur_data()[, c({{string1}}, {{string2}}], ~.x, .before = Inf, .after = -1),
cur_data()[[{{string2}}]],
{{string1}},
beta)))
}
I get this vague stack trace:
The error occurred in group 1: "groupby1" = "groupby1", "groupby2" = "groupby2".
Caused by error in `.subset()`:
! invalid subscript type 'closure'
What is the proper way to parameterize dplyr functions to work with column names passed as strings?
EDIT:
Here is a reproducible example
slider_helper <- function(left, right, string1, beta) {
cbind_helper <- function(left, right) {
todaysDate = rep(right, nrow(left))
return(cbind(left, todaysDate))
}
date_helper <- function(today, date) {
return(1/as.integer(today - date))
}
df = data.frame(t(mapply(cbind_helper, left, right)))
df$val1= mapply(date_helper, df[,'todaysDate'], df[, date])
df$val1_product= mapply('%*%', df$val1, df[[target]]) / sapply(df$val1, FUN=sum, na.rm=T)
df$val2= 1/seq(nrow(df), 1)
df$val2_product= sapply(mapply('*', df$val2, df[[target]]), FUN=sum, na.rm=T) / sum(df$val2, na.rm=T)
w_sum = beta * df$val2_product+ (1-beta) * df$val2_product
return(w_sum)
}
my_fun <- function(df, groupby1, groupby2, string1, string2, beta) {
return(df %>%
group_by({{groupby1}}, {{groupby2}}) %>%
mutate(new_value=
slider_helper(
slide(cur_data()[, c({{string1}}, {{string2}})], ~.x, .before = Inf, .after = -1),
cur_data()[[{{string2}}]],
{{string1}},
beta)))
}
df= data.frame(sample(1:2, 20, replace=T), sample(1:2, 20, replace=T), seq(from=-1, to=.9, by = .1), seq.Date(from=as.Date('2011-01-01'), to=as.Date('2011-01-20'), by = 1))
colnames(df) = c('groupby1', 'groupby2', 'string1', 'string2')
my_fun(testy, 'groupby1', 'groupby2', 'string1', 'string2', 0.5)
Without sample data and more context this is kind of a guess.
But, if you are using a simple function then you need to remove the {{}}. The function does not need those brackets.
my_fun <- function(df, groupby1, groupby2, string1, string2, beta) {
return(df %>%
group_by(groupby1, groupby2) %>%
mutate(new_value=
slider_helper(
slide(cur_data()[, c(string1, string2)], ~.x, .before = Inf, .after = -1),
cur_data()[[string2]],
string1,
beta)))
}

Call the same function by varying parameters with tidyverse and apply family functions

This is my first question here on Stack Overflow, so I apologize in advance if I won't be clear enough.
I searched for similar questions, but I didn't find anything (I probably didn't search enough!)
Given a data.frame (or a data.table or a tibble) consisting of four sets of points divided into two groups:
df_points <- tibble(
x = c(rnorm(10000, mean = 0), rnorm(10000, mean = 1),
rnorm(10000, mean = 0), rnorm(10000, mean = 4)),
dist = c(rep("d1", 10000), rep("d2", 10000),
rep("d1", 10000), rep("d2", 10000)),
overlap = c(rep("o1", 20000), rep("o2", 20000))
)
my goal is to apply the density function, using different values of bw, from and to for the "o1" and "o2" groups.
I would like to solve this problem in an elegant way with both a tidyverse and a R-base-data.table approach (apply family functions).
For now I have managed to do this via tidyverse:
I define a common_dens function which applies density and returns a tibble of the x and y of the distribution
common_dens <- function(df, Bw, lower, upper) {
d <- density(df, n = 2048, bw = Bw, from = lower, to = upper)
df_d <- tibble(x = d$x, y = d$y)
return(df_d)
}
assuming that the values of upper, lower and bws are the following:
lower <- c(-5.050, -4.705)
upper <- c(6.445, 9.070)
bws <- c(0.1427, 0.1417)
I get the desired dataframe through the following for loop:
df_dens <- NULL
for (i in 1:2) {
df_t <- df_points %>%
filter(overlap == unique(df_points$overlap)[[i]]) %>%
group_by(dist, overlap) %>%
summarise(common_dens(x, bws[i], lower[i], upper[i]))
df_dens <- rbind(df_dens, df_t)
}
Is there any way to remove the for loop?
Is there a way to do the same with apply family functions and data.table?
Thanks for your help!
The purrr::pmap function allows you to apply an arbitrary number of parameters to a function in succession. The pmap_dfr returns a data.frame bound by row:
Consider your parameters provided as a data.frame:
params <- data.frame(group = c("o1","o2"), bws, lower, upper)
group bws lower upper
1 o1 0.1427 -5.050 6.445
2 o2 0.1417 -4.705 9.070
The paramters are automatically assigned to the special symbols ..1, ..2, and so on:
library(purrr)
pmap_dfr(params, ~ df_points %>%
filter(overlap == ..1) %>%
group_by(dist, overlap) %>%
summarise(common_dens(x,Bw = ..2, lower = ..3, upper = ..4)))
It can get confusing which ..# is which, so a trick is to use with(list(...), ):
pmap_dfr(params, ~ with(list(...), df_points %>%
filter(overlap == group) %>%
group_by(dist, overlap) %>%
summarise(common_dens(x,Bw = bws, lower = lower, upper = upper))))
You could of course do the same with base R apply:
apply(params, 1, function(y){ df_points %>%
filter(overlap == y[1]) %>%
group_by(dist, overlap) %>%
summarise(common_dens(x, Bw = as.numeric(y[2]), lower = as.numeric(y[2]),
upper = as.numeric(y[4])))}) %>%
bind_rows()
However, because apply converts the types, you'll need to use as.numeric.

Function do ({}) in R does not allow me to name "ode" to be able to add the variables inside ggplot mapings

I am implementing the SIR model in R, and I need to vary beta and gamma for it.
library(deSolve)
par(mar = rep(2, 4))
N = 1000
vi <- c(S = N-1,I = 1,R = 0)
SIR <- function(t, vi, pm) {
with(as.list(c(vi, pm)), {
ds <- -beta* S* (I/N)
di <- beta* S* (I/N) - gamma * I
dr <- gamma * I
return(list(c(ds, di, dr)))
})
}
t <- seq(0, 50, by = 1)
betavals <- c(1,5,8)
ipvals <- c(2,20,50)
gammavals <- 1/ipvals
However, when wanting to apply the function for my different Beta and gamma values, the do ({}) function does not allow me to name my function "ode" and thus be able to print in ggplot (aes (x = t, y = value ) both I, S and R.
library(tidyverse)
expand.grid(beta=betavals,gamma=gammavals)%>%
group_by(beta,gamma) %>%
do(
{
ode(func=SIR,y=vi,times=t,
parms=c(beta=.$beta,gamma=.$gamma)) %>%
as.data.frame() -> out
}
) out %>%
gather(variable,value,-time)%>%
ggplot(aes(x=time,y=value,color=variable))+ #value is I,S,R
geom_line()+
facet_grid(beta~gamma,scales='free_y',labeller=label_both)+
theme_bw()
When doing so I get this error
<Error: unexpected symbol in:
" }
) out">
You do not have access to the out variable outside the do function. We can continue using the same chain operation to get data in the long format. gather has been retired, so I replace it with pivot_longer.
library(tidyverse)
library(deSolve)
expand.grid(beta=betavals,gamma=gammavals)%>%
group_by(beta,gamma) %>%
do(
{
ode(func=SIR,y=vi,times=t,
parms=c(beta=.$beta,gamma=.$gamma)) %>%
as.data.frame()
}
) %>%
ungroup %>%
pivot_longer(cols = S:R) %>%
mutate(name = factor(name, c('S', 'I', 'R'))) %>%
ggplot(aes(x=time,y=value,color=name))+
geom_line() +
facet_grid(beta~gamma,scales='free_y',labeller=label_both)+
theme_bw()

rgenoud - How to pass parameters to the function?

I have a function that currently plays nice with rgenoud. It has one parameter (xx) and rgenoud will optimize xx perfectly.
However, I would like to add a second parameter to my function that wouldnt be optimized by rgendoud . For example, I would like my function to either fit a model with a gaussian link or a poisson link and to specify that when I call rgenoud.
Any idea?
thanks
edit: here is a minimal working example of what I mean. How would you get the last line to work?
adstock reflect the fact that TV advertising should have an impact on the number of quotes of future weeks.
Adstock[t] = Ads[t] + rate* Ads[t-1] + rate^2*Ads[t-2] + .... + rate^max_memory * Ads[t-max_memory]
We want rgenoud to figure out what rate and max_memory will return the model with the best fit. Best fit is defined as the lowest RMSE.
set.seed(107)
library(fpp)
library(rgenoud)
adstock_k <- function(x, adstock_rate = 0, max_memory = 12){
learn_rates <- rep(adstock_rate, max_memory+1) ^ c(0:max_memory)
adstocked_advertising <- stats::filter(c(rep(0, max_memory), x), learn_rates, method="convolution")
adstocked_advertising <- adstocked_advertising[!is.na(adstocked_advertising)]
return(as.numeric(adstocked_advertising))
}
getRMSE <- function(x, y) {
mean((x-y)^2) %>% sqrt
}
df <- data.frame(insurance) %>%
mutate(Quotes = round (Quotes*1000, digits = 0 ))
df$idu <- as.numeric(rownames(df))
my_f <- function(xx){
adstock_rate <- xx[1]
adstock_memory <- xx[2]
df.temp <- df %>%
mutate(adstock = adstock_k(TV.advert, adstock_rate/100, adstock_memory ))
mod <- lm(data=df.temp, Quotes ~ adstock )
getRMSE( df.temp$Quotes, predict(mod))
}
domaine <- cbind(c(30,1), c(85, 8))
#this works
min_f <- genoud(my_f, nvars = 2, max = F, pop.size=1000, wait.generations=10, Domains = domaine, data.type.int = T)
#here I try to add a second parameter to the function.
my_f2 <- function(xx,first_n_weeks=20){
adstock_rate <- xx[1]
adstock_memory <- xx[2]
df.temp <- df %>%
filter(idu<= first_n_weeks) %>%
mutate(adstock = adstock_k(TV.advert, adstock_rate/100, adstock_memory ))
mod <- lm(data=df.temp, Quotes ~ adstock )
getRMSE( df.temp$Quotes, predict(mod))
}
#this doesnt work
min_f2 <- genoud(my_f2(first_n_week=10), nvars = 2, max = F, pop.size=1000, wait.generations=10, Domains = domaine, data.type.int = T)
Include the argument in the call to genoud, e.g.
genoud(my_f2, nvars = 2, max = F, pop.size=1000, wait.generations=10, Domains = domaine, data.type.int = T, first_n_weeks = 10)

Resources