Regression estimates by group with p.value stars - r

I run multiple lm regressions by group, using the following command
by_country <- group_by(df, country)
print(do(by_country,
tidy(lm(y ~ x*z, data=.))), n=500)
I added the print statement just to see the full table.
Now I would like to change the output to display stars to show the level of significance.
If I change the code to
df %>%
do(by_country,
tidy(lm(y ~ x*z, data=.))) %>%
mutate(signif = stars.pval(p.value))
I get the error:
Error: Can only supply one unnamed argument, not 2.
How can I combine the group by lm and the mutate? Or is there an alternative, more convenient, way?
I am looking for the following/similar output:
|Country|term|estimate|std.error|stars
|:---- |:------:| -----:|-----:|-----:|
| UK | x:z | ... | ... | ***|

do has been superseded. This code seems to work fine with default mtcars data.
library(dplyr)
df <- mtcars
df %>%
group_by(cyl) %>%
summarise(model = list(broom::tidy(lm(disp ~ mpg*am, data = cur_data())))) %>%
tidyr::unnest(model) %>%
mutate(signif = gtools::stars.pval(p.value))
# cyl term estimate std.error statistic p.value signif
# <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
# 1 4 (Intercept) -69.4 129. -0.537 0.608 " "
# 2 4 mpg 8.96 5.64 1.59 0.156 " "
# 3 4 am 270. 132. 2.04 0.0806 "."
# 4 4 mpg:am -12.8 5.73 -2.23 0.0609 "."
# 5 6 (Intercept) -147. 225. -0.653 0.561 " "
# 6 6 mpg 18.4 11.7 1.56 0.216 " "
# 7 6 am 64.7 682. 0.0949 0.930 " "
# 8 6 mpg:am -6.84 33.4 -0.205 0.851 " "
# 9 8 (Intercept) 566. 106. 5.36 0.000320 "***"
#10 8 mpg -13.9 6.91 -2.00 0.0729 "."
#11 8 am -1203. 1736. -0.693 0.504 " "
#12 8 mpg:am 76.4 113. 0.678 0.513 " "

Example data:
set.seed(111)
df = data.frame(country = sample(c("A","B"),100,replace=TRUE),
x=runif(100),y=runif(100),z=runif(100))
You don't need to pass the data.frame:
by_country <- group_by(df, country)
do(by_country,
tidy(lm(y ~ x*z, data=.))) %>%
mutate(signif = stars.pval(p.value))
Or:
df %>%
group_by(country) %>%
do(tidy(lm(y ~ x*z, data=.))) %>%
mutate(signif = stars.pval(p.value))
Both gives:
# A tibble: 8 x 7
# Groups: country [2]
country term estimate std.error statistic p.value signif
<fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 A (Intercept) 0.808 0.154 5.25 0.00000324 "***"
2 A x -0.552 0.307 -1.80 0.0780 "."
3 A z -0.491 0.326 -1.50 0.139 " "
4 A x:z 0.853 0.552 1.55 0.129 " "
5 B (Intercept) 0.0658 0.194 0.339 0.736 " "
6 B x 0.503 0.300 1.68 0.101 " "
7 B z 0.866 0.400 2.17 0.0360 "*"
8 B x:z -0.857 0.609 -1.41 0.167 " "

Related

Run linear regression model on nested dataset after grouping (multiply imputed dataset)

*I want to group nested (multiply imputed) dataset and then apply linear regression on each dataset. I have tried a number of approaches, including the map options (2) and the for loop (3). I have had no luck at all. I want the model results to look like results from summary(mod1). Does anyone know what I could be doing wrong?
# get dependencies
library(mice)
library(tidyverse)
# impute the boys dataset from mice package
boys_imp <- mice(boys)
# 1) I want to run a model like this on my multiply imputed dataset
mod <- boys %>%
group_by(reg) %>%
do(tidy(
lm(
data=.,
formula = wgt ~ bmi),
conf.int = T))
summary(mod1)
# A tibble: 12 × 8
# Groups: reg [6]
reg term estimate std.error statistic p.value conf.low conf.high
<fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 north (Intercept) -81.9 9.84 -8.32 2.48e-12 -101. -62.3
2 north bmi 6.84 0.500 13.7 2.53e-22 5.85 7.84
3 east (Intercept) -75.3 7.62 -9.89 3.21e-18 -90.4 -60.3
4 east bmi 6.29 0.420 15.0 4.53e-32 5.46 7.12
5 west (Intercept) -91.9 6.31 -14.6 2.48e-34 -104. -79.4
6 west bmi 7.17 0.347 20.7 3.49e-54 6.49 7.86
7 south (Intercept) -79.8 6.73 -11.9 1.83e-24 -93.1 -66.5
8 south bmi 6.47 0.373 17.3 1.63e-40 5.73 7.20
9 city (Intercept) -92.0 13.9 -6.61 6.75e- 9 -120. -64.2
10 city bmi 6.95 0.757 9.18 1.39e-13 5.44 8.46
11 NA (Intercept) -88.6 43.8 -2.02 2.92e- 1 -645. 468.
12 NA bmi 6.46 2.89 2.24 2.68e- 1 -30.2 43.1
# 2) the map way --------------------------------------------------------
mod_imp <- boys_imp %>%
mice::complete("all") %>%
map(group_by, reg) %>%
map(lm, formula = wgt ~ bmi) %>%
pool()
summary(mod_imp)
term estimate std.error statistic df p.value
1 (Intercept) -85.473428 3.5511961 -24.06891 715.1703 0
2 bmi 6.793622 0.1945322 34.92287 693.7835 0
# 3) for loop way-------------------------------------------------------
# nest the mids dataset
boys_imp2 <- boys_imp %>%
mice::complete("all")
dat1 <- replicate(length(boys_imp2), NULL) # preallocate same size
# run the for loop
for (i in seq_along(boys_imp2)) {
dat1[[i]] <- boys_imp2[[i]] %>%
group_by(reg) %>%
do(lm(wgt ~ bmi, data = boys_imp2[[i]]))
}
|==================================================================|100% ~0 s remaining Error in `do()`:
! Results 1, 2, 3, 4, 5, ... must be data frames, not lm.
Run `rlang::last_error()` to see where the error occurred.*
I have found a solution to the problem. This involve grouping the data by ID and variable of interest, subsequently I map lm on to the datasets. I then finish off with unnesting the data
boys_imp %>%
mice::complete("long", include = FALSE) %>%
group_by(.imp, reg) %>%
nest() %>%
mutate(lm_model = map(data, ~lm(bmi ~ phb, data = .))) %>%
group_by(reg) %>%
summarise(model = list(tidy(pool(lm_model),conf.int = T))) %>%
unnest_wider(model) %>%
unnest(cols = c(term, estimate, std.error,
statistic, p.value, conf.low, conf.high))
# A tibble: 30 × 16
reg term estimate std.error statistic p.value conf.low conf.high b df dfcom fmi lambda m riv ubar
<fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 north (Intercept) 19.3 0.332 57.9 0 18.6 19.9
2 north phb.L 5.10 0.678 7.53 1.81e-10 3.75 6.46
3 north phb.Q 1.25 0.800 1.56 1.24e- 1 -0.357 2.86
4 north phb.C -0.430 0.882 -0.487 6.30e- 1 -2.25 1.39
5 north phb^4 -1.10 0.948 -1.16 2.57e- 1 -3.07 0.862
6 north phb^5 -0.156 1.08 -0.144 8.87e- 1 -2.41 2.10
7 east (Intercept) 18.7 0.244 76.8 0 18.3 19.2
8 east phb.L 4.83 0.509 9.48 4.44e-15 3.82 5.84
9 east phb.Q 1.10 0.692 1.60 1.27e- 1 -0.343 2.55
10 east phb.C -0.518 0.671 -0.772 4.49e- 1 -1.91 0.878
# … with 20 more rows
# ℹ Use `print(n = ...)` to see more rows

Loop on several variables with the same suffix in R

I have a database which looks like this but with much more rows and columns.
Several variables (x,y,z) measured at different time (1,2,3).
df <-
tibble(
x1 = rnorm(10),
x2 = rnorm(10),
x3 = rnorm(10),
y1 = rnorm(10),
y2 = rnorm(10),
y3 = rnorm(10),
z1 = rnorm(10),
z2 = rnorm(10),
z3 = rnorm(10),
)
I am trying to create dummies variables from the variables with the same suffix (measured at the same time) like this:
df <- df %>%
mutate(var1= ifelse(x1>0 & (y1<0.5 |z1<0.5),0,1)) %>%
mutate(var2= ifelse(x2>0 & (y2<0.5 |z2<0.5),0,1)) %>%
mutate(var3= ifelse(x3>0 & (y1<0.5 |z3<0.5),0,1))
I am used to coding in SAS or Stata, so I would like to use a function or a loop because I have many more variables in my database.
But I think I don't have the right approach in R to deal with this.
Thank you very much for your help !
{dplyover} makes this kind of operation easy (disclaimer: I'm the maintainer), given that your desired output contains a typo:
I think you want to use all variables with the same digit (1, 2, 3 and so on) in each calculation:
df <- df %>%
mutate(var1= ifelse(x1>0 & (y1<0.5 |z1<0.5),0,1)) %>%
mutate(var2= ifelse(x2>0 & (y2<0.5 |z2<0.5),0,1)) %>%
mutate(var3= ifelse(x3>0 & (y3<0.5 |z3<0.5),0,1))
If that is the case we can use dplyover::over to apply the same function over a vector. Here we construct the vector with extract_names("[0-9]{1}$") which gets us all ending numbers of our variable names here: c(1,2,3). We can then construct the variable names using a special syntax: .("x{.x}"). Here .x evaluates to the first number in our vector so it would return the object name x1 (not a string!) which we can use inside the function argument of over.
library(dplyr)
library(dplyover) # Only on GitHub: https://github.com/TimTeaFan/dplyover
df %>%
mutate(over(cut_names("^[a-z]{1}"),
~ ifelse(.("x{.x}") > 0 & (.("y{.x}") < 0.5 | .("z{.x}") < 0.5), 0, 1),
.names = "var{x}"
))
#> # A tibble: 10 x 12
#> x1 x2 x3 y1 y2 y3 z1 z2 z3 var1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.690 0.550 0.911 0.203 -0.111 0.530 -2.09 0.189 0.147 0
#> 2 -0.238 1.32 -0.145 0.744 1.05 -0.448 2.05 -1.04 1.50 1
#> 3 0.888 0.898 -1.46 -1.87 -1.14 1.59 1.91 -0.155 1.46 0
#> 4 -2.78 -1.34 -0.486 -0.0674 0.246 0.141 0.154 1.08 -0.319 1
#> 5 -1.20 0.835 1.28 -1.32 -0.674 0.115 0.362 1.06 0.515 1
#> 6 0.622 -0.713 0.0525 1.79 -0.427 0.819 -1.53 -0.885 0.00237 0
#> 7 -2.54 0.0197 0.942 0.230 -1.37 -1.02 -1.55 -0.721 -1.06 1
#> 8 -0.434 1.97 -0.274 0.848 -0.482 -0.422 0.197 0.497 -0.600 1
#> 9 -0.316 -0.219 0.467 -1.97 -0.718 -0.442 -1.39 -0.877 1.52 1
#> 10 -1.03 0.226 2.04 0.432 -1.02 -0.535 0.954 -1.11 0.804 1
#> # ... with 2 more variables: var2 <dbl>, var3 <dbl>
Alternatively we can use dplyr::across and use cur_column(), get() and gsub() to alter the name of the column on the fly. To name the new variables correctly we use gsub() in the .names argument of across and wrap it in curly braces {} to evaluate the expression.
library(dplyr)
df %>%
mutate(across(starts_with("x"),
~ {
cur_c <- dplyr::cur_column()
ifelse(.x > 0 & (get(gsub("x","y", cur_c)) < 0.5 | get(gsub("x","z", cur_c)) < 0.5), 0, 1)
},
.names = '{gsub("x", "var", .col)}'
))
#> # A tibble: 10 x 12
#> x1 x2 x3 y1 y2 y3 z1 z2 z3 var1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.423 -1.42 -1.15 -1.54 1.92 -0.511 -0.739 0.501 0.451 1
#> 2 -0.358 0.164 0.971 -1.61 1.96 -0.675 -0.0188 -1.88 1.63 1
#> 3 -0.453 -0.758 -0.258 -0.449 -0.795 -0.362 -1.81 -0.780 -1.90 1
#> 4 0.855 0.335 -1.36 0.796 -0.674 -1.37 -1.42 -1.03 -0.560 0
#> 5 0.436 -0.0487 -0.639 0.352 -0.325 -0.893 -0.746 0.0548 -0.394 0
#> 6 -0.228 -0.240 -0.854 -0.197 0.884 0.118 -0.0713 1.09 -0.0289 1
#> 7 -0.949 -0.231 0.428 0.290 -0.803 2.15 -1.11 -0.202 -1.21 1
#> 8 1.88 -0.0980 -2.60 -1.86 -0.0258 -0.965 -1.52 -0.539 0.108 0
#> 9 0.221 1.58 -1.46 -0.806 0.749 0.506 1.09 0.523 1.86 0
#> 10 0.0238 -0.389 -0.474 0.512 -0.448 0.178 0.529 1.56 -1.12 1
#> # ... with 2 more variables: var2 <dbl>, var3 <dbl>
Created on 2022-06-08 by the reprex package (v2.0.1)
You could restructure your data along the principles of tidy data (see e.g. https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html).
Here to a long format and using tidyverse:
library(tidyverse)
df <-
df |>
pivot_longer(everything()) |>
separate(name, c("var", "time"), sep = "(?=[0-9])") |>
pivot_wider(id_col = "time",
names_from = "var",
names_prefix = "var_",
values_from = "value",
values_fn = list) |>
unnest(-time) |>
mutate(new_var = ifelse(var_x > 0 & (var_y < 0.5 | var_z < 0.5), 0, 1))
df
You would probably want to keep the data in a long format, but if you want, you can pivot_wider and get back to the format you started with. E.g.
df |>
pivot_wider(values_from = c(starts_with("var_"), "new_var"),
names_from = "time",
values_fn = list) |>
unnest(everything())
As you suggested, a solution using a loop is definitely possible.
# times as unique non-alphabetical parts of column names
times <- unique(gsub('[[:alpha:]]', '', names(df)))
for (time in times) {
# column names for current time
xyz <- paste0(c('x', 'y', 'z'), time)
df[[paste0('var', time)]] <-
ifelse(df[[xyz[1]]]>0 & (df[[xyz[2]]]<.5 | df[[xyz[3]]]<.5), 0, 1)
}
Another way I can think of is transforming the data into a 3D array (observartion × variable × time) so that you can actually do the computation for all variables at once.
times <- unique(gsub('[[:alpha:]]', '', names(df)))
df.arr <- sapply(c('x', 'y', 'z'),
function(var) as.matrix(df[, paste0(var, times)]),
simplify='array')
new.vars <- ifelse(df.arr[, , 1]>0 & (df.arr[, , 2]<0.5 | df.arr[, , 3]<0.5), 0, 1)
colnames(new.vars) <- paste0('var', times)
cbind(df, new.vars)
Here, sapply creates a matrix from columns of measurings for each variable at different times and stacks them into a 3D array.
If you trust (or ensure) correct ordering of columns in the data frame, instead of using sapply you can create the array just by modifying the object's dimensions. I didn't do any benchmarking but i guess this could be the most computationally efficient solution (if it should matter).
df.arr <- as.matrix(df)
dim(df.arr) <- c(dim(df.arr) / c(1, 3), 3)

Loop in Cox regression

I am trying to run a cox regression for 1000 variables (exposure) as below
varlist <- names(dataset)[275:1275]
sumtables <- lapply(varlist, function(i) {
iformula <- as.formula(sprintf("Surv(time_cox, events) ~ %s + age +age2 ", i))
x <- coxph(iformula, data=dataset, na.action=na.omit)
summary(x)[7][[1]] ##### summary(x)[8][[1]]
})
it works well, but I don't know how to extract the data (for each variable (beta and se)) and run the benjamini-hochberg on p-values.
any help is appreciated! Thanks
I am assuming here that all the variables in varlist are either binary or numeric.
sumtables <- lapply(varlist, function(i) {
iformula <- as.formula(sprintf("Surv(time_cox, events) ~ %s + age +age2 ", i))
x <- coxph(iformula, data=dataset, na.action=na.omit)
data.frame(pvalue = drop1(x, scope = i, test = "Chisq")[2,4],
coef = coef(x)[i])
})
You could use purrr::map to get a tidy dataframe of all your coefficients, se's and p values etc. from the vector of tested exposures. Modifying a little from your code above to work with veteran dataset:
library(survival)
library(tidyverse)
exp_vars <- names(veteran[, c(1, 2, 5, 6, 8)])
tibble(exp_vars) %>%
group_by(exp_vars) %>%
mutate(cox_mod = map(exp_vars, function(exposure) {
iformula <-
as.formula(sprintf("Surv(time, status) ~ %s + age", exposure))
x <- coxph(iformula, data = veteran, na.action = na.omit)
x
}),
coefs = list(rownames_to_column(data.frame(
summary(cox_mod[[1]])$coefficients
)))) %>%
unnest(coefs)
#> # A tibble: 12 x 8
#> # Groups: exp_vars [5]
#> exp_vars cox_mod rowname coef exp.coef. se.coef. z Pr...z..
#> <chr> <list> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 trt <coxph> trt -0.00365 0.996 0.183 -0.0200 9.84e- 1
#> 2 trt <coxph> age 0.00753 1.01 0.00966 0.779 4.36e- 1
#> 3 celltype <coxph> celltypesmallc~ 0.992 2.70 0.254 3.91 9.40e- 5
#> 4 celltype <coxph> celltypeadeno 1.16 3.17 0.293 3.94 8.07e- 5
#> 5 celltype <coxph> celltypelarge 0.235 1.27 0.278 0.848 3.97e- 1
#> 6 celltype <coxph> age 0.00590 1.01 0.00935 0.631 5.28e- 1
#> 7 karno <coxph> karno -0.0337 0.967 0.00520 -6.48 8.94e-11
#> 8 karno <coxph> age -0.00239 0.998 0.00908 -0.263 7.92e- 1
#> 9 diagtime <coxph> diagtime 0.00943 1.01 0.00892 1.06 2.90e- 1
#> 10 diagtime <coxph> age 0.00797 1.01 0.00961 0.830 4.07e- 1
#> 11 prior <coxph> prior -0.0135 0.987 0.0201 -0.674 5.00e- 1
#> 12 prior <coxph> age 0.00715 1.01 0.00955 0.749 4.54e- 1
Created on 2022-03-16 by the reprex package (v2.0.1)

Many regressions using tidyverse and broom: Same dependent variable, different independent variables

This link shows how to answer my question in the case where we have the same independent variables, but potentially many different dependent variables: Use broom and tidyverse to run regressions on different dependent variables.
But my question is, how can I apply the same approach (e.g., tidyverse and broom) to run many regressions where we have the reverse situation: same dependent variables but different independent variable. In line with the code in the previous link, something like:
mod = lm(health ~ cbind(sex,income,happiness) + faculty, ds) %>% tidy()
However, this code does not do exactly what I want, and instead, produces:
Call:
lm(formula = income ~ cbind(sex, health) + faculty, data = ds)
Coefficients:
(Intercept) cbind(sex, health)sex
945.049 -47.911
cbind(sex, health)health faculty
2.342 1.869
which is equivalent to:
lm(formula = income ~ sex + health + faculty, data = ds)
Basically you'll need some way to create all the different formulas you want. Here's one way
qq <- expression(sex,income,happiness)
formulae <- lapply(qq, function(v) bquote(health~.(v)+faculty))
# [[1]]
# health ~ sex + faculty
# [[2]]
# health ~ income + faculty
# [[3]]
# health ~ happiness + faculty
Once you have all your formula, you can map them to lm and then to tidy()
library(purrr)
library(broom)
formulae %>% map(~lm(.x, ds)) %>% map_dfr(tidy, .id="model")
# A tibble: 9 x 6
# model term estimate std.error statistic p.value
# <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 (Intercept) 19.5 0.504 38.6 1.13e-60
# 2 1 sex 0.755 0.651 1.16 2.49e- 1
# 3 1 faculty -0.00360 0.291 -0.0124 9.90e- 1
# 4 2 (Intercept) 19.8 1.70 11.7 3.18e-20
# 5 2 income -0.000244 0.00162 -0.150 8.81e- 1
# 6 2 faculty 0.143 0.264 0.542 5.89e- 1
# 7 3 (Intercept) 18.4 1.88 9.74 4.79e-16
# 8 3 happiness 0.205 0.299 0.684 4.96e- 1
# 9 3 faculty 0.141 0.262 0.539 5.91e- 1
Using sample data
set.seed(11)
ds <- data.frame(income = rnorm(100, mean=1000,sd=200),
happiness = rnorm(100, mean = 6, sd=1),
health = rnorm(100, mean=20, sd = 3),
sex = c(0,1),
faculty = c(0,1,2,3))
You could use the combn function to get all combinations of n independent variables and then iterate over them. Let's say n=3 here:
library(tidyverse)
ds <- data.frame(income = rnorm(100, mean=1000,sd=200),
happiness = rnorm(100, mean = 6, sd=1),
health = rnorm(100, mean=20, sd = 3),
sex = c(0,1),
faculty = c(0,1,2,3))
ivs = combn(names(ds)[names(ds)!="income"], 3, simplify=FALSE)
# Or, to get all models with 1 to 4 variables:
# ivs = map(1:4, ~combn(names(ds)[names(ds)!="income"], .x, simplify=FALSE)) %>%
# flatten()
names(ivs) = map(ivs, ~paste(.x, collapse="-"))
models = map(ivs,
~lm(as.formula(paste("income ~", paste(.x, collapse="+"))), data=ds))
map_df(models, broom::tidy, .id="model")
model term estimate std.error statistic p.value
* <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 happiness-health-sex (Intercept) 1086. 201. 5.39 5.00e- 7
2 happiness-health-sex happiness -25.4 21.4 -1.19 2.38e- 1
3 happiness-health-sex health 3.58 6.99 0.512 6.10e- 1
4 happiness-health-sex sex 11.5 41.5 0.277 7.82e- 1
5 happiness-health-faculty (Intercept) 1085. 197. 5.50 3.12e- 7
6 happiness-health-faculty happiness -25.8 20.9 -1.23 2.21e- 1
7 happiness-health-faculty health 3.45 6.98 0.494 6.23e- 1
8 happiness-health-faculty faculty 7.86 18.2 0.432 6.67e- 1
9 happiness-sex-faculty (Intercept) 1153. 141. 8.21 1.04e-12
10 happiness-sex-faculty happiness -25.9 21.4 -1.21 2.28e- 1
11 happiness-sex-faculty sex 3.44 46.2 0.0744 9.41e- 1
12 happiness-sex-faculty faculty 7.40 20.2 0.366 7.15e- 1
13 health-sex-faculty (Intercept) 911. 143. 6.35 7.06e- 9
14 health-sex-faculty health 3.90 7.03 0.554 5.81e- 1
15 health-sex-faculty sex 15.6 45.6 0.343 7.32e- 1
16 health-sex-faculty faculty 7.02 20.4 0.345 7.31e- 1

calculating qchisq in on a sparklyr tbl

I need to use the qchisq function on a column of a sparklyr data frame.
The problem is that it seems that qchisq function is not implemented in Spark. If I am reading the error message below correctly, sparklyr tried execute a function called "QCHISQ", however this doesn't exist neither in Hive SQL, nor in Spark.
In general, is there a way to run arbitrary functions that are not implemented in Hive or Spark, with sparklyr? I know about spark_apply, but haven't figured out how to configure it yet.
> mydf = data.frame(beta=runif(100, -5, 5), pval = runif(100, 0.001, 0.1))
> mydf_tbl = copy_to(con, mydf)
> mydf_tbl
# Source: table<mydf> [?? x 2]
# Database: spark_connection
beta pval
<dbl> <dbl>
1 3.42 0.0913
2 -1.72 0.0629
3 0.515 0.0335
4 -3.12 0.0717
5 -2.12 0.0253
6 1.36 0.00640
7 -3.33 0.0896
8 1.36 0.0235
9 0.619 0.0414
10 4.73 0.0416
> mydf_tbl %>% mutate(se = sqrt(beta^2/qchisq(pval)))
Error: org.apache.spark.sql.AnalysisException: Undefined function: 'QCHISQ'.
This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 49
As you noted you can use spark_apply:
mydf_tbl %>%
spark_apply(function(df)
dplyr::mutate(df, se = sqrt(beta^2/qchisq(pval, df = 12))))
# # Source: table<sparklyr_tmp_14bd5feacf5> [?? x 3]
# # Database: spark_connection
# beta pval X3
# <dbl> <dbl> <dbl>
# 1 1.66 0.0763 0.686
# 2 0.153 0.0872 0.0623
# 3 2.96 0.0485 1.30
# 4 4.86 0.0349 2.22
# 5 -1.82 0.0712 0.760
# 6 2.34 0.0295 1.10
# 7 3.54 0.0297 1.65
# 8 4.57 0.0784 1.88
# 9 4.94 0.0394 2.23
# 10 -0.610 0.0906 0.246
# # ... with more rows
but fair warning - it is embarrassingly slow. Unfortunately you don't have alternative here, short of writing your own Scala / Java extensions.
In the end I've used an horrible hack, which for this case works fine.
Another solution would have been to write a User Defined Function (UDF), but sparklyr doesn't support it yet: https://github.com/rstudio/sparklyr/issues/1052
This is the hack I've used. In short, I precompute a qchisq table, upload it as a sparklyr object, then join. If I compare this with results calculated on a local data frame, I get a correlation of r=0.99999990902236146617.
#' #param n: number of significant digits to use
> check_precomputed_strategy = function(n) {
chisq = data.frame(pval=seq(0, 1, 1/(10**(n)))) %>%
mutate(qval=qchisq(pval, df=1, lower.tail = FALSE)) %>%
mutate(pval_s = as.character(round(as.integer(pval*10**n),0)))
chisq %>% head %>% print
chisq_tbl = copy_to(con, chisq, overwrite=T)
mydf = data.frame(beta=runif(100, -5, 5), pval = runif(100, 0.001, 0.1)) %>%
mutate(se1 = sqrt(beta^2/qchisq(pval, df=1, lower.tail = FALSE)))
mydf_tbl = copy_to(con, mydf)
mydf_tbl.up = mydf_tbl %>%
mutate(pval_s=as.character(round(as.integer(pval*10**n),0))) %>%
left_join(chisq_tbl, by="pval_s") %>%
mutate(se=sqrt(beta^2 / qval)) %>%
collect %>%
filter(!duplicated(beta))
mydf_tbl.up %>% head %>% print
mydf_tbl.up %>% filter(complete.cases(.)) %>% nrow %>% print
mydf_tbl.up %>% filter(complete.cases(.)) %>% select(se, se1) %>% cor
}
> check_precomputed_strategy(4)
pval qval pval_s
1 0.00000000000000000000000 Inf 0
2 0.00010000000000000000479 15.136705226623396570 1
3 0.00020000000000000000958 13.831083619091122827 2
4 0.00030000000000000002793 13.070394140069462097 3
5 0.00040000000000000001917 12.532193305401813532 4
6 0.00050000000000000001041 12.115665146397173402 5
# A tibble: 6 x 8
beta pval.x se1 myvar pval_s pval.y qval se
<dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 3.42 0.0913 2.03 1. 912 0.0912 2.85 2.03
2 -1.72 0.0629 0.927 1. 628 0.0628 3.46 0.927
3 0.515 0.0335 0.242 1. 335 0.0335 4.52 0.242
4 -3.12 0.0717 1.73 1. 716 0.0716 3.25 1.73
5 -2.12 0.0253 0.947 1. 253 0.0253 5.00 0.946
6 1.36 0.00640 0.498 1. 63 0.00630 7.46 0.497
[1] 100
se se1
se 1.00000000000000000000 0.99999990902236146617
se1 0.99999990902236146617 1.00000000000000000000

Resources