I have several models fit to predict an outcome y = x1 + x2 + … + x22. That is a fair number of predictors and a fair number of models. My customers want to know the marginal impact of each X on the estimated y. The models may include splines and interaction terms. I can do this, but it is cumbersome and requires loops or a lot of copy-paste, which is slow and error-prone. Can I do this better by writing my function differently and/or using purrr or an *apply function? A reproducible example is below. Ideally, I could write one function and apply it to longdata.
## create my fake data.
library(tidyverse)
library(rms)
ltrans <- function(l1) {
  ## inverse logit (equivalent to stats::plogis)
  exp(l1) / (exp(l1) + 1)
}
set.seed(123)
mystates <- c("AL","AR","TN")
mydf <- data.frame(idno = seq_len(1500), state = rep(mystates, 500))
mydf$x1[mydf$state=='AL'] <- rnorm(500,50,7)
mydf$x1[mydf$state=='AR'] <- rnorm(500,55,8)
mydf$x1[mydf$state=='TN'] <- rnorm(500,48,10)
mydf$x2 <- sample(1:5, 500, replace = TRUE) ## length-500 vector recycles across the 1500 rows
mydf$x3 <- (abs(rnorm(1500,10,20)))^2
mydf$outcome <- as.numeric(cut2(sample(1:100,1500,replace = T),95))-1
dd<- datadist(mydf)
options(datadist = 'dd')
m1 <- lrm(outcome ~ x1 + x2 + rcs(x3, 3), data = mydf)
dothemath <- function(x1 = x1ref, x2 = x2ref, x3 = x3ref) {
  ltrans(-2.1802256 - 0.01114239 * x1 + 0.050319692 * x2 - 0.00079289232 * x3 +
           7.6508189e-10 * pmax(x3 - 7.4686271, 0)^3 -
           9.0897627e-10 * pmax(x3 - 217.97865, 0)^3 +
           1.4389439e-10 * pmax(x3 - 1337.2538, 0)^3)
}
x1ref <- 51.4
x2ref <- 3
x3ref <- 217.9
dothemath() ## 0.0591
mydf$referent <- dothemath()
mydf$thisobs <- dothemath(x1 = mydf$x1, x2 = mydf$x2, x3 = mydf$x3)
mydf$predicted <- predict(m1,mydf,type = "fitted.ind") ## yes, matches.
mydf$x1_marginaleffect <- dothemath(x1= mydf$x1)/mydf$referent
mydf$x2_marginaleffect <- dothemath(x2 = mydf$x2)/mydf$referent
mydf$x3_marginaleffect <- dothemath(x3 = mydf$x3)/mydf$referent
## can I do this with long data?
longdata <- mydf %>%
select(idno,state,referent,thisobs,x1,x2,x3) %>%
gather(varname,value,x1:x3)
##longdata$marginaleffect <- dothemath(longdata$varname = longdata$value) ## no, this does not work.
## I need to communicate to the function which variable it is evaluating.
longdata$marginaleffect[longdata$varname=="x1"] <- dothemath(x1 = longdata$value[longdata$varname=="x1"])/
longdata$referent[longdata$varname=="x1"]
longdata$marginaleffect[longdata$varname=="x2"] <- dothemath(x2 = longdata$value[longdata$varname=="x2"])/
longdata$referent[longdata$varname=="x2"]
longdata$marginaleffect[longdata$varname=="x3"] <- dothemath(x3 = longdata$value[longdata$varname=="x3"])/
longdata$referent[longdata$varname=="x3"]
testing<- inner_join(longdata[longdata$varname=="x1",c(1,7)],mydf[,c(1,10)])
head(testing) ## yes, both methods work.
Mostly you're just talking about a grouped mutate, with the caveat that dothemath is built such that you need to specify the variable name. That can be done with do.call or purrr::invoke, calling the function on a named list of parameters:
longdata <- longdata %>%
group_by(varname) %>%
mutate(marginaleffect = invoke(dothemath, setNames(list(value), varname[1])) / referent)
longdata
#> # A tibble: 4,500 x 7
#> # Groups: varname [3]
#> idno state referent thisobs varname value marginaleffect
#> <int> <fct> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 1 AL 0.0591 0.0688 x1 46.1 1.06
#> 2 2 AR 0.0591 0.0516 x1 50.2 1.01
#> 3 3 TN 0.0591 0.0727 x1 38.0 1.15
#> 4 4 AL 0.0591 0.0667 x1 48.4 1.03
#> 5 5 AR 0.0591 0.0515 x1 47.1 1.05
#> 6 6 TN 0.0591 0.0484 x1 37.6 1.15
#> 7 7 AL 0.0591 0.0519 x1 60.9 0.905
#> 8 8 AR 0.0591 0.0531 x1 63.2 0.883
#> 9 9 TN 0.0591 0.0780 x1 47.8 1.04
#> 10 10 AL 0.0591 0.0575 x1 50.5 1.01
#> # ... with 4,490 more rows
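For reference, the same grouped mutate written with base R's do.call instead of invoke():
longdata <- longdata %>%
  group_by(varname) %>%
  mutate(marginaleffect = do.call(dothemath, setNames(list(value), varname[1])) / referent)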
# the first values look similar
inner_join(longdata[longdata$varname == "x1", c(1,7)], mydf[,c(1,10)])
#> Joining, by = "idno"
#> # A tibble: 1,500 x 3
#> idno marginaleffect x1_marginaleffect
#> <int> <dbl> <dbl>
#> 1 1 1.06 1.06
#> 2 2 1.01 1.01
#> 3 3 1.15 1.15
#> 4 4 1.03 1.03
#> 5 5 1.05 1.05
#> 6 6 1.15 1.15
#> 7 7 0.905 0.905
#> 8 8 0.883 0.883
#> 9 9 1.04 1.04
#> 10 10 1.01 1.01
#> # ... with 1,490 more rows
# check everything is the same
mydf %>%
gather(varname, marginaleffect, x1_marginaleffect:x3_marginaleffect) %>%
select(idno, varname, marginaleffect) %>%
mutate(varname = substr(varname, 1, 2)) %>%
all_equal(select(longdata, idno, varname, marginaleffect))
#> [1] TRUE
It may be easier to reconfigure dothemath to take an additional parameter of the variable name so as to avoid the gymnastics.
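For example, a minimal sketch of that reconfiguration (the wrapper name dothemath2 is made up here): it takes the varying value plus the name of the variable being varied, holds the other predictors at their reference values, and dispatches with do.call:
dothemath2 <- function(value, varname) {
  args <- list(x1 = x1ref, x2 = x2ref, x3 = x3ref)  # start from the referent values
  args[[varname]] <- value                          # swap in the predictor being varied
  do.call(dothemath, args)
}
longdata %>%
  group_by(varname) %>%
  mutate(marginaleffect = dothemath2(value, varname[1]) / referent)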
I am trying to run a Cox regression for 1000 variables (exposures), as below:
varlist <- names(dataset)[275:1275]
sumtables <- lapply(varlist, function(i) {
iformula <- as.formula(sprintf("Surv(time_cox, events) ~ %s + age +age2 ", i))
x <- coxph(iformula, data=dataset, na.action=na.omit)
summary(x)[7][[1]] ## alternatively: summary(x)[8][[1]]
})
It works well, but I don't know how to extract the results (beta and SE for each variable) or how to run the Benjamini-Hochberg procedure on the p-values.
Any help is appreciated! Thanks
I am assuming here that all the variables in varlist are either binary or numeric.
sumtables <- lapply(varlist, function(i) {
iformula <- as.formula(sprintf("Surv(time_cox, events) ~ %s + age +age2 ", i))
x <- coxph(iformula, data=dataset, na.action=na.omit)
data.frame(pvalue = drop1(x, scope = i, test = "Chisq")[2,4],
coef = coef(x)[i])
})
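To also get the betas and SEs the question asks for, plus a Benjamini-Hochberg adjustment across all exposures, one option is to return a one-row data frame per exposure and bind the results. A minimal sketch under the same binary-or-numeric assumption (the column names are those of summary.coxph's coefficients matrix):
results <- do.call(rbind, lapply(varlist, function(i) {
  iformula <- as.formula(sprintf("Surv(time_cox, events) ~ %s + age + age2", i))
  x <- coxph(iformula, data = dataset, na.action = na.omit)
  s <- summary(x)$coefficients  # columns: coef, exp(coef), se(coef), z, Pr(>|z|)
  data.frame(variable = i,
             beta   = s[i, "coef"],
             se     = s[i, "se(coef)"],
             pvalue = s[i, "Pr(>|z|)"])
}))
results$p_BH <- p.adjust(results$pvalue, method = "BH")  # Benjamini-Hochberg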
You could use purrr::map to get a tidy data frame of all your coefficients, SEs, p-values, etc. from the vector of tested exposures. Modifying your code a little to work with the veteran dataset:
library(survival)
library(tidyverse)
exp_vars <- names(veteran[, c(1, 2, 5, 6, 8)])
tibble(exp_vars) %>%
group_by(exp_vars) %>%
mutate(cox_mod = map(exp_vars, function(exposure) {
iformula <-
as.formula(sprintf("Surv(time, status) ~ %s + age", exposure))
x <- coxph(iformula, data = veteran, na.action = na.omit)
x
}),
coefs = list(rownames_to_column(data.frame(
summary(cox_mod[[1]])$coefficients
)))) %>%
unnest(coefs)
#> # A tibble: 12 x 8
#> # Groups: exp_vars [5]
#> exp_vars cox_mod rowname coef exp.coef. se.coef. z Pr...z..
#> <chr> <list> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 trt <coxph> trt -0.00365 0.996 0.183 -0.0200 9.84e- 1
#> 2 trt <coxph> age 0.00753 1.01 0.00966 0.779 4.36e- 1
#> 3 celltype <coxph> celltypesmallc~ 0.992 2.70 0.254 3.91 9.40e- 5
#> 4 celltype <coxph> celltypeadeno 1.16 3.17 0.293 3.94 8.07e- 5
#> 5 celltype <coxph> celltypelarge 0.235 1.27 0.278 0.848 3.97e- 1
#> 6 celltype <coxph> age 0.00590 1.01 0.00935 0.631 5.28e- 1
#> 7 karno <coxph> karno -0.0337 0.967 0.00520 -6.48 8.94e-11
#> 8 karno <coxph> age -0.00239 0.998 0.00908 -0.263 7.92e- 1
#> 9 diagtime <coxph> diagtime 0.00943 1.01 0.00892 1.06 2.90e- 1
#> 10 diagtime <coxph> age 0.00797 1.01 0.00961 0.830 4.07e- 1
#> 11 prior <coxph> prior -0.0135 0.987 0.0201 -0.674 5.00e- 1
#> 12 prior <coxph> age 0.00715 1.01 0.00955 0.749 4.54e- 1
Created on 2022-03-16 by the reprex package (v2.0.1)
BLUF: I am struggling to figure out how to use batching in the R targets package to improve performance in a pipeline with static and dynamic branching, processed in parallel using tar_make_future(). I presume that I need to batch within each dynamic branch, but I am unsure how to go about doing that.
Here's a reprex that uses dynamic branching nested inside static branching, similar to what my actual pipeline does. It first branches statically over each value in all_types, then branches dynamically within each category. This code produces 1,000 branches and 1,010 targets in total. In the actual workflow I obviously don't use replicate, and the number of dynamic branches varies with the type value.
# _targets.r
library(targets)
library(tarchetypes)
library(future)
library(future.callr)
plan(callr)
all_types = data.frame(type = LETTERS[1:10])
tar_map(values = all_types, names = "type",
tar_target(
make_data,
replicate(100,
data.frame(x = seq(1000) + rnorm(1000, 0, 5),
y = seq(1000) + rnorm(1000, 20, 20)),
simplify = FALSE
),
iteration = "list"
),
tar_target(
fit_model,
lm(make_data),
pattern = map(make_data),
iteration = "list"
)
)
And here's a timing comparison of tar_make() vs tar_make_future() with eight workers:
# tar_destroy()
t1 <- system.time(tar_make())
# tar_destroy()
t2 <- system.time(tar_make_future(workers = 8))
rbind(serial = t1, parallel = t2)
## user.self sys.self elapsed user.child sys.child
## serial 2.12 0.11 25.59 NA NA
## parallel 2.07 0.24 184.68 NA NA
I don't think the user or system fields are useful here, since the work gets dispatched to separate R processes, but the elapsed time of the parallel run is about 7 times longer than the serial run.
I presume this slowdown is caused by the large number of targets. Will batching improve performance in this case, and if so how can I implement batching within the dynamic branch?
You are on the right track with batching. In your case, that is a matter of breaking up your list of 100 datasets into groups of, say, 10 or so. You could do this with a nested list of datasets, but that's a lot of work. Luckily, there is an easier way.
Your question is actually really well-timed. I just wrote some new target factories in tarchetypes that could help. To access them, you will need the development version of tarchetypes from GitHub:
remotes::install_github("ropensci/tarchetypes")
Then, with tar_map2_count(), it will be much easier to batch your list of 100 datasets for each scenario.
library(targets)
tar_script({
library(broom)
library(targets)
library(tarchetypes)
library(tibble)
make_data <- function(n) {
datasets_per_batch <- replicate(
100,
tibble(
x = seq(n) + rnorm(n, 0, 5),
y = seq(n) + rnorm(n, 20, 20)
),
simplify = FALSE
)
tibble(dataset = datasets_per_batch, rep = seq_along(datasets_per_batch))
}
tar_map2_count(
name = model,
command1 = make_data(n = rows),
command2 = tidy(lm(y ~ x, data = dataset)), # Need dataset[[1]] in tarchetypes 0.4.0
values = data_frame(
scenario = LETTERS[seq_len(10)],
rows = seq(10, 100, length.out = 10)
),
columns2 = NULL,
batches = 10
)
})
tar_make(reporter = "silent")
#> Warning message:
#> `data_frame()` was deprecated in tibble 1.1.0.
#> Please use `tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
tar_read(model)
#> # A tibble: 2,000 × 8
#> term estimate std.error statistic p.value scenario rows tar_group
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <int>
#> 1 (Intercept) 17.1 12.8 1.34 0.218 A 10 10
#> 2 x 1.39 1.35 1.03 0.333 A 10 10
#> 3 (Intercept) 6.42 14.0 0.459 0.658 A 10 10
#> 4 x 1.75 1.28 1.37 0.209 A 10 10
#> 5 (Intercept) 32.8 7.14 4.60 0.00176 A 10 10
#> 6 x -0.300 1.14 -0.263 0.799 A 10 10
#> 7 (Intercept) 29.7 3.24 9.18 0.0000160 A 10 10
#> 8 x 0.314 0.414 0.758 0.470 A 10 10
#> 9 (Intercept) 20.0 13.6 1.47 0.179 A 10 10
#> 10 x 1.23 1.77 0.698 0.505 A 10 10
#> # … with 1,990 more rows
Created on 2021-12-10 by the reprex package (v2.0.1)
There is also tar_map_rep(), which may be easier if all your datasets are randomly generated, but I am not sure if I am overfitting your use case.
library(targets)
tar_script({
library(broom)
library(targets)
library(tarchetypes)
library(tibble)
make_one_dataset <- function(n) {
tibble(
x = seq(n) + rnorm(n, 0, 5),
y = seq(n) + rnorm(n, 20, 20)
)
}
tar_map_rep(
name = model,
command = tidy(lm(y ~ x, data = make_one_dataset(n = rows))),
values = data_frame(
scenario = LETTERS[seq_len(10)],
rows = seq(10, 100, length.out = 10)
),
batches = 10,
reps = 10
)
})
tar_make(reporter = "silent")
#> Warning message:
#> `data_frame()` was deprecated in tibble 1.1.0.
#> Please use `tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
tar_read(model)
#> # A tibble: 2,000 × 10
#> term estimate std.error statistic p.value scenario rows tar_batch tar_rep
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <int> <int>
#> 1 (Inter… 37.5 7.50 5.00 0.00105 A 10 1 1
#> 2 x -0.701 1.17 -0.601 0.564 A 10 1 1
#> 3 (Inter… 21.5 9.64 2.23 0.0567 A 10 1 2
#> 4 x -0.213 1.55 -0.138 0.894 A 10 1 2
#> 5 (Inter… 20.6 9.51 2.17 0.0620 A 10 1 3
#> 6 x 1.40 1.79 0.783 0.456 A 10 1 3
#> 7 (Inter… 11.6 11.2 1.04 0.329 A 10 1 4
#> 8 x 2.34 1.39 1.68 0.131 A 10 1 4
#> 9 (Inter… 26.8 9.16 2.93 0.0191 A 10 1 5
#> 10 x 0.288 1.10 0.262 0.800 A 10 1 5
#> # … with 1,990 more rows, and 1 more variable: tar_group <int>
Created on 2021-12-10 by the reprex package (v2.0.1)
Unfortunately, futures do come with overhead. Maybe it will be faster in your case if you try tar_make_clustermq()?
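For reference, a minimal sketch of the clustermq route (the scheduler choice and worker count here are illustrative, and the clustermq package must be installed):
# near the top of _targets.r:
options(clustermq.scheduler = "multicore")  # persistent local workers
# then, instead of tar_make_future():
tar_make_clustermq(workers = 8)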
I have a df that looks something like this:
set.seed(42)
ID <- sample(1:30, 100, rep=T)
Trait <- sample(0:1, 100, rep=T)
Year <- sample(1992:1999, 100, rep=T)
df <- cbind(ID, Trait, Year)
df <- as.data.frame(df)
Here, ID is an individual organism, Trait is the presence/absence of a phenotype, and Year is the year the observation was made.
I would like to model if trait is random between individuals, something like this
library(MCMCglmm)
m <- MCMCglmm(Trait ~ ID, random = ~ Year, data = df, family = "categorical")
Now I would like to shuffle the Trait column and run x permutations, to check whether my observed mean and CI fall outside what's expected at random.
I could do this with a for loop, but I'd rather use a tidyverse solution.
I've read that lapply is a better(?) alternative, but I am struggling to find a specific enough walk-through that I can follow.
I'd appreciate any advice offered here.
Cheers!
Jamie
EDIT October 10th: cleaned up the code and, per the comment below, added code to give you back a nicely organized tibble/data frame.
### decide how many shuffles you want and name them
### in an orderly fashion for the output
shuffles <- 1:10
names(shuffles) <- paste0("shuffle_", shuffles)
library(MCMCglmm)
library(dplyr)
library(tibble)
library(purrr)
ddd <- purrr::map(shuffles,
~ df %>%
mutate(Trait = sample(Trait)) %>%
MCMCglmm(fixed = Trait ~ ID,
random = ~ Year,
data = .,
family = "categorical",
verbose = FALSE)) %>%
purrr::map( ~ tibble::as_tibble(summary(.x)$solutions, rownames = "model_term")) %>%
dplyr::bind_rows(., .id = 'shuffle')
ddd
#> # A tibble: 20 x 7
#> shuffle model_term post.mean `l-95% CI` `u-95% CI` eff.samp pMCMC
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 shuffle_1 (Intercept) 112. 6.39 233. 103. 0.016
#> 2 shuffle_1 ID -6.31 -13.5 -0.297 112. 0.014
#> 3 shuffle_2 (Intercept) 24.9 -72.5 133. 778. 0.526
#> 4 shuffle_2 ID -0.327 -6.33 5.33 849. 0.858
#> 5 shuffle_3 (Intercept) 4.39 -77.3 87.4 161. 0.876
#> 6 shuffle_3 ID 1.04 -3.84 5.99 121. 0.662
#> 7 shuffle_4 (Intercept) 7.71 -79.0 107. 418. 0.902
#> 8 shuffle_4 ID 0.899 -4.40 6.57 408. 0.694
#> 9 shuffle_5 (Intercept) 30.4 -62.4 144. 732. 0.51
#> 10 shuffle_5 ID -0.644 -6.61 4.94 970. 0.866
#> 11 shuffle_6 (Intercept) -45.5 -148. 42.7 208. 0.302
#> 12 shuffle_6 ID 4.73 -0.211 11.6 89.1 0.058
#> 13 shuffle_7 (Intercept) -16.2 -133. 85.9 108. 0.696
#> 14 shuffle_7 ID 2.47 -2.42 10.3 47.8 0.304
#> 15 shuffle_8 (Intercept) 0.568 0.549 0.581 6.60 0.001
#> 16 shuffle_8 ID -0.0185 -0.0197 -0.0168 2.96 0.001
#> 17 shuffle_9 (Intercept) -6.95 -112. 92.2 452. 0.886
#> 18 shuffle_9 ID 2.07 -3.30 8.95 370. 0.476
#> 19 shuffle_10 (Intercept) 43.8 -57.0 159. 775. 0.396
#> 20 shuffle_10 ID -1.36 -7.44 5.08 901. 0.62
Your original data
set.seed(42)
ID <- sample(1:30, 100, rep=T)
Trait <- sample(0:1, 100, rep=T)
Year <- sample(1992:1999, 100, rep=T)
df <- cbind(ID, Trait, Year)
df <- as.data.frame(df)
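Since the question mentions lapply: the same permutation loop in base R is essentially the sketch below (same model call as above, without the tidy summary step):
fits <- lapply(shuffles, function(i) {
  df2 <- df
  df2$Trait <- sample(df2$Trait)  # permute the trait labels
  MCMCglmm(fixed = Trait ~ ID, random = ~ Year, data = df2,
           family = "categorical", verbose = FALSE)
})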
This link shows how to answer my question in the case where we have the same independent variables but potentially many different dependent variables: Use broom and tidyverse to run regressions on different dependent variables.
But my question is: how can I apply the same approach (e.g., tidyverse and broom) to run many regressions in the reverse situation, with the same dependent variable but different independent variables? In line with the code in the previous link, something like:
mod = lm(health ~ cbind(sex,income,happiness) + faculty, ds) %>% tidy()
However, this kind of code does not do what I want; for example, lm(income ~ cbind(sex, health) + faculty, ds) produces:
Call:
lm(formula = income ~ cbind(sex, health) + faculty, data = ds)
Coefficients:
(Intercept) cbind(sex, health)sex
945.049 -47.911
cbind(sex, health)health faculty
2.342 1.869
which is equivalent to:
lm(formula = income ~ sex + health + faculty, data = ds)
Basically you'll need some way to create all the different formulas you want. Here's one way
qq <- expression(sex,income,happiness)
formulae <- lapply(qq, function(v) bquote(health~.(v)+faculty))
# [[1]]
# health ~ sex + faculty
# [[2]]
# health ~ income + faculty
# [[3]]
# health ~ happiness + faculty
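An equivalent way to build the same formulas, shown here as a sketch, is base R's reformulate():
ivs <- c("sex", "income", "happiness")
formulae <- lapply(ivs, function(v) reformulate(c(v, "faculty"), response = "health"))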
Once you have all your formulas, you can map them to lm() and then to tidy():
library(purrr)
library(broom)
formulae %>% map(~lm(.x, ds)) %>% map_dfr(tidy, .id="model")
# A tibble: 9 x 6
# model term estimate std.error statistic p.value
# <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 (Intercept) 19.5 0.504 38.6 1.13e-60
# 2 1 sex 0.755 0.651 1.16 2.49e- 1
# 3 1 faculty -0.00360 0.291 -0.0124 9.90e- 1
# 4 2 (Intercept) 19.8 1.70 11.7 3.18e-20
# 5 2 income -0.000244 0.00162 -0.150 8.81e- 1
# 6 2 faculty 0.143 0.264 0.542 5.89e- 1
# 7 3 (Intercept) 18.4 1.88 9.74 4.79e-16
# 8 3 happiness 0.205 0.299 0.684 4.96e- 1
# 9 3 faculty 0.141 0.262 0.539 5.91e- 1
Using sample data
set.seed(11)
ds <- data.frame(income = rnorm(100, mean=1000,sd=200),
happiness = rnorm(100, mean = 6, sd=1),
health = rnorm(100, mean=20, sd = 3),
sex = c(0,1),
faculty = c(0,1,2,3))
You could use the combn() function to get all combinations of n independent variables and then iterate over them. Let's say n = 3 here:
library(tidyverse)
ds <- data.frame(income = rnorm(100, mean=1000,sd=200),
happiness = rnorm(100, mean = 6, sd=1),
health = rnorm(100, mean=20, sd = 3),
sex = c(0,1),
faculty = c(0,1,2,3))
ivs = combn(names(ds)[names(ds)!="income"], 3, simplify=FALSE)
# Or, to get all models with 1 to 4 variables:
# ivs = map(1:4, ~combn(names(ds)[names(ds)!="income"], .x, simplify=FALSE)) %>%
# flatten()
names(ivs) = map(ivs, ~paste(.x, collapse="-"))
models = map(ivs,
~lm(as.formula(paste("income ~", paste(.x, collapse="+"))), data=ds))
map_df(models, broom::tidy, .id="model")
model term estimate std.error statistic p.value
* <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 happiness-health-sex (Intercept) 1086. 201. 5.39 5.00e- 7
2 happiness-health-sex happiness -25.4 21.4 -1.19 2.38e- 1
3 happiness-health-sex health 3.58 6.99 0.512 6.10e- 1
4 happiness-health-sex sex 11.5 41.5 0.277 7.82e- 1
5 happiness-health-faculty (Intercept) 1085. 197. 5.50 3.12e- 7
6 happiness-health-faculty happiness -25.8 20.9 -1.23 2.21e- 1
7 happiness-health-faculty health 3.45 6.98 0.494 6.23e- 1
8 happiness-health-faculty faculty 7.86 18.2 0.432 6.67e- 1
9 happiness-sex-faculty (Intercept) 1153. 141. 8.21 1.04e-12
10 happiness-sex-faculty happiness -25.9 21.4 -1.21 2.28e- 1
11 happiness-sex-faculty sex 3.44 46.2 0.0744 9.41e- 1
12 happiness-sex-faculty faculty 7.40 20.2 0.366 7.15e- 1
13 health-sex-faculty (Intercept) 911. 143. 6.35 7.06e- 9
14 health-sex-faculty health 3.90 7.03 0.554 5.81e- 1
15 health-sex-faculty sex 15.6 45.6 0.343 7.32e- 1
16 health-sex-faculty faculty 7.02 20.4 0.345 7.31e- 1
I need to use the qchisq function on a column of a sparklyr data frame.
The problem is that qchisq does not seem to be implemented in Spark. If I am reading the error message below correctly, sparklyr tried to execute a function called "QCHISQ", which exists neither in Hive SQL nor in Spark.
In general, is there a way to run arbitrary functions that are not implemented in Hive or Spark with sparklyr? I know about spark_apply, but I haven't figured out how to configure it yet.
> mydf = data.frame(beta=runif(100, -5, 5), pval = runif(100, 0.001, 0.1))
> mydf_tbl = copy_to(con, mydf)
> mydf_tbl
# Source: table<mydf> [?? x 2]
# Database: spark_connection
beta pval
<dbl> <dbl>
1 3.42 0.0913
2 -1.72 0.0629
3 0.515 0.0335
4 -3.12 0.0717
5 -2.12 0.0253
6 1.36 0.00640
7 -3.33 0.0896
8 1.36 0.0235
9 0.619 0.0414
10 4.73 0.0416
> mydf_tbl %>% mutate(se = sqrt(beta^2/qchisq(pval)))
Error: org.apache.spark.sql.AnalysisException: Undefined function: 'QCHISQ'.
This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 49
As you noted, you can use spark_apply:
mydf_tbl %>%
spark_apply(function(df)
dplyr::mutate(df, se = sqrt(beta^2/qchisq(pval, df = 12))))
# # Source: table<sparklyr_tmp_14bd5feacf5> [?? x 3]
# # Database: spark_connection
# beta pval X3
# <dbl> <dbl> <dbl>
# 1 1.66 0.0763 0.686
# 2 0.153 0.0872 0.0623
# 3 2.96 0.0485 1.30
# 4 4.86 0.0349 2.22
# 5 -1.82 0.0712 0.760
# 6 2.34 0.0295 1.10
# 7 3.54 0.0297 1.65
# 8 4.57 0.0784 1.88
# 9 4.94 0.0394 2.23
# 10 -0.610 0.0906 0.246
# # ... with more rows
but fair warning - it is embarrassingly slow. Unfortunately, you don't have an alternative here, short of writing your own Scala / Java extensions.
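One note on the lost column name (X3 above): spark_apply has a columns argument that lets you name the output columns explicitly, which should restore se here. A sketch, assuming the same call as above:
mydf_tbl %>%
  spark_apply(
    function(df) dplyr::mutate(df, se = sqrt(beta^2 / qchisq(pval, df = 12))),
    columns = c("beta", "pval", "se")  # name the output columns explicitly
  )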
In the end I used a horrible hack, which works fine for this case.
Another solution would have been to write a User Defined Function (UDF), but sparklyr doesn't support that yet: https://github.com/rstudio/sparklyr/issues/1052
This is the hack I used. In short, I precompute a qchisq table, upload it as a sparklyr object, then join. If I compare this with results calculated on a local data frame, I get a correlation of r = 0.99999990902236146617.
#' @param n number of significant digits to use
check_precomputed_strategy = function(n) {
chisq = data.frame(pval=seq(0, 1, 1/(10**(n)))) %>%
mutate(qval=qchisq(pval, df=1, lower.tail = FALSE)) %>%
mutate(pval_s = as.character(round(as.integer(pval*10**n),0)))
chisq %>% head %>% print
chisq_tbl = copy_to(con, chisq, overwrite=T)
mydf = data.frame(beta=runif(100, -5, 5), pval = runif(100, 0.001, 0.1)) %>%
mutate(se1 = sqrt(beta^2/qchisq(pval, df=1, lower.tail = FALSE)))
mydf_tbl = copy_to(con, mydf)
mydf_tbl.up = mydf_tbl %>%
mutate(pval_s=as.character(round(as.integer(pval*10**n),0))) %>%
left_join(chisq_tbl, by="pval_s") %>%
mutate(se=sqrt(beta^2 / qval)) %>%
collect %>%
filter(!duplicated(beta))
mydf_tbl.up %>% head %>% print
mydf_tbl.up %>% filter(complete.cases(.)) %>% nrow %>% print
mydf_tbl.up %>% filter(complete.cases(.)) %>% select(se, se1) %>% cor
}
> check_precomputed_strategy(4)
pval qval pval_s
1 0.00000000000000000000000 Inf 0
2 0.00010000000000000000479 15.136705226623396570 1
3 0.00020000000000000000958 13.831083619091122827 2
4 0.00030000000000000002793 13.070394140069462097 3
5 0.00040000000000000001917 12.532193305401813532 4
6 0.00050000000000000001041 12.115665146397173402 5
# A tibble: 6 x 8
beta pval.x se1 myvar pval_s pval.y qval se
<dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 3.42 0.0913 2.03 1. 912 0.0912 2.85 2.03
2 -1.72 0.0629 0.927 1. 628 0.0628 3.46 0.927
3 0.515 0.0335 0.242 1. 335 0.0335 4.52 0.242
4 -3.12 0.0717 1.73 1. 716 0.0716 3.25 1.73
5 -2.12 0.0253 0.947 1. 253 0.0253 5.00 0.946
6 1.36 0.00640 0.498 1. 63 0.00630 7.46 0.497
[1] 100
se se1
se 1.00000000000000000000 0.99999990902236146617
se1 0.99999990902236146617 1.00000000000000000000