cutting interval based on limits in a list - r

I have the following data frame with 4 numeric columns:
df <- structure(list(a = c(0.494129340746821, 1.0182303327812, 0.412227511922328,
0.204436644926016, 0.707038309818134, -0.0547300783473556, 1.02124944293185,
0.381284586356091, 0.375197843213519, -1.18172401075089), b =
c(-1.34374367808722,
-0.724644569211516, -0.618107980582741, -1.79274868750102,
-3.03559838445132,
-0.205726144151615, -0.441511286334811, 0.126660637747845,
0.353737902975931,
-0.26601393471207), c = c(1.36922677098999, -1.81698348029464,
-0.846111260721092, 0.121256015837603, -1.16499681749603, 1.14145675696301,
-0.782988942359773, 3.25142254765012, -0.132099541183856, -0.242831877642412
), d = c(-0.30002630673509, -0.507496812070994, -2.59870853299723,
-1.30109828239028, 1.05029458887117, -0.606381379180569, -0.928822706709913,
-0.68324741261771, -1.17980245487707, 2.20174180936794)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
I would like to create two new factor columns, in which I group columns 2 and 3 according to the values given in the list L:
ColsToChoose = c(2,3)
L = list()
L[[1]] = c(-0.3, 0.7)
L[[2]] = c(-1, 0.5, 1)
df %>% mutate_at(ColsToChoose, funs(intervals = cut(., c(-Inf, L[[.]], Inf))))
That is, I am expecting to get two new columns, the first called intervals_b indicating if the values of column b (column 2) are between -Inf and -0.3, -0.3 and 0.7 or 0.7 and Inf, and similarly for column c: -Inf to -1, -1 to 0.5, 0.5 to 1 and 1 to Inf.
I am getting an error:
Error in mutate_impl(.data, dots) :
Evaluation error: recursive indexing failed at level 2
I would like to do this for the general case, that's why I am using implicit names.
Any ideas?

You could do this base R mapply passing ColsToChoose of df and L parallely to get the range.
df[paste0("interval", names(df)[ColsToChoose])] <-
mapply(function(x, y) cut(x, c(-Inf, y, Inf)), df[ColsToChoose], L)
df
# a b c d intervalb intervalc
# <dbl> <dbl> <dbl> <dbl> <chr> <chr>
# 1 0.494 -1.34 1.37 -0.300 (-Inf,-0.3] (1, Inf]
# 2 1.02 -0.725 -1.82 -0.507 (-Inf,-0.3] (-Inf,-1]
# 3 0.412 -0.618 -0.846 -2.60 (-Inf,-0.3] (-1,0.5]
# 4 0.204 -1.79 0.121 -1.30 (-Inf,-0.3] (-1,0.5]
# 5 0.707 -3.04 -1.16 1.05 (-Inf,-0.3] (-Inf,-1]
# 6 -0.0547 -0.206 1.14 -0.606 (-0.3,0.7] (1, Inf]
# 7 1.02 -0.442 -0.783 -0.929 (-Inf,-0.3] (-1,0.5]
# 8 0.381 0.127 3.25 -0.683 (-0.3,0.7] (1, Inf]
# 9 0.375 0.354 -0.132 -1.18 (-0.3,0.7] (-1,0.5]
#10 -1.18 -0.266 -0.243 2.20 (-0.3,0.7] (-1,0.5]
A tidyverse approach using same approach
library(tidyverse)
bind_cols(df,
map2(df[ColsToChoose], L, ~ cut(.x, c(-Inf, .y, Inf))) %>%
data.frame() %>%
rename_all(paste0, "_interval"))
This gives same output as above.

Related

Loop on several variables with the same suffix in R

I have a database which looks like this but with much more rows and columns.
Several variables (x,y,z) measured at different time (1,2,3).
df <-
tibble(
x1 = rnorm(10),
x2 = rnorm(10),
x3 = rnorm(10),
y1 = rnorm(10),
y2 = rnorm(10),
y3 = rnorm(10),
z1 = rnorm(10),
z2 = rnorm(10),
z3 = rnorm(10),
)
I am trying to create dummies variables from the variables with the same suffix (measured at the same time) like this:
df <- df %>%
mutate(var1= ifelse(x1>0 & (y1<0.5 |z1<0.5),0,1)) %>%
mutate(var2= ifelse(x2>0 & (y2<0.5 |z2<0.5),0,1)) %>%
mutate(var3= ifelse(x3>0 & (y1<0.5 |z3<0.5),0,1))
I am used to coding in SAS or Stata, so I would like to use a function or a loop because I have many more variables in my database.
But I think I don't have the right approach in R to deal with this.
Thank you very much for your help !
{dplyover} makes this kind of operation easy (disclaimer: I'm the maintainer), given that your desired output contains a typo:
I think you want to use all variables with the same digit (1, 2, 3 and so on) in each calculation:
df <- df %>%
mutate(var1= ifelse(x1>0 & (y1<0.5 |z1<0.5),0,1)) %>%
mutate(var2= ifelse(x2>0 & (y2<0.5 |z2<0.5),0,1)) %>%
mutate(var3= ifelse(x3>0 & (y3<0.5 |z3<0.5),0,1))
If that is the case we can use dplyover::over to apply the same function over a vector. Here we construct the vector with extract_names("[0-9]{1}$") which gets us all ending numbers of our variable names here: c(1,2,3). We can then construct the variable names using a special syntax: .("x{.x}"). Here .x evaluates to the first number in our vector so it would return the object name x1 (not a string!) which we can use inside the function argument of over.
library(dplyr)
library(dplyover) # Only on GitHub: https://github.com/TimTeaFan/dplyover
df %>%
mutate(over(cut_names("^[a-z]{1}"),
~ ifelse(.("x{.x}") > 0 & (.("y{.x}") < 0.5 | .("z{.x}") < 0.5), 0, 1),
.names = "var{x}"
))
#> # A tibble: 10 x 12
#> x1 x2 x3 y1 y2 y3 z1 z2 z3 var1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.690 0.550 0.911 0.203 -0.111 0.530 -2.09 0.189 0.147 0
#> 2 -0.238 1.32 -0.145 0.744 1.05 -0.448 2.05 -1.04 1.50 1
#> 3 0.888 0.898 -1.46 -1.87 -1.14 1.59 1.91 -0.155 1.46 0
#> 4 -2.78 -1.34 -0.486 -0.0674 0.246 0.141 0.154 1.08 -0.319 1
#> 5 -1.20 0.835 1.28 -1.32 -0.674 0.115 0.362 1.06 0.515 1
#> 6 0.622 -0.713 0.0525 1.79 -0.427 0.819 -1.53 -0.885 0.00237 0
#> 7 -2.54 0.0197 0.942 0.230 -1.37 -1.02 -1.55 -0.721 -1.06 1
#> 8 -0.434 1.97 -0.274 0.848 -0.482 -0.422 0.197 0.497 -0.600 1
#> 9 -0.316 -0.219 0.467 -1.97 -0.718 -0.442 -1.39 -0.877 1.52 1
#> 10 -1.03 0.226 2.04 0.432 -1.02 -0.535 0.954 -1.11 0.804 1
#> # ... with 2 more variables: var2 <dbl>, var3 <dbl>
Alternatively we can use dplyr::across and use cur_column(), get() and gsub() to alter the name of the column on the fly. To name the new variables correctly we use gsub() in the .names argument of across and wrap it in curly braces {} to evaluate the expression.
library(dplyr)
df %>%
mutate(across(starts_with("x"),
~ {
cur_c <- dplyr::cur_column()
ifelse(.x > 0 & (get(gsub("x","y", cur_c)) < 0.5 | get(gsub("x","z", cur_c)) < 0.5), 0, 1)
},
.names = '{gsub("x", "var", .col)}'
))
#> # A tibble: 10 x 12
#> x1 x2 x3 y1 y2 y3 z1 z2 z3 var1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.423 -1.42 -1.15 -1.54 1.92 -0.511 -0.739 0.501 0.451 1
#> 2 -0.358 0.164 0.971 -1.61 1.96 -0.675 -0.0188 -1.88 1.63 1
#> 3 -0.453 -0.758 -0.258 -0.449 -0.795 -0.362 -1.81 -0.780 -1.90 1
#> 4 0.855 0.335 -1.36 0.796 -0.674 -1.37 -1.42 -1.03 -0.560 0
#> 5 0.436 -0.0487 -0.639 0.352 -0.325 -0.893 -0.746 0.0548 -0.394 0
#> 6 -0.228 -0.240 -0.854 -0.197 0.884 0.118 -0.0713 1.09 -0.0289 1
#> 7 -0.949 -0.231 0.428 0.290 -0.803 2.15 -1.11 -0.202 -1.21 1
#> 8 1.88 -0.0980 -2.60 -1.86 -0.0258 -0.965 -1.52 -0.539 0.108 0
#> 9 0.221 1.58 -1.46 -0.806 0.749 0.506 1.09 0.523 1.86 0
#> 10 0.0238 -0.389 -0.474 0.512 -0.448 0.178 0.529 1.56 -1.12 1
#> # ... with 2 more variables: var2 <dbl>, var3 <dbl>
Created on 2022-06-08 by the reprex package (v2.0.1)
You could restructure your data along the principles of tidy data (see e.g. https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html).
Here to a long format and using tidyverse:
library(tidyverse)
df <-
df |>
pivot_longer(everything()) |>
separate(name, c("var", "time"), sep = "(?=[0-9])") |>
pivot_wider(id_col = "time",
names_from = "var",
names_prefix = "var_",
values_from = "value",
values_fn = list) |>
unnest(-time) |>
mutate(new_var = ifelse(var_x > 0 & (var_y < 0.5 | var_z < 0.5), 0, 1))
df
You would probably want to keep the data in a long format, but if you want, you can pivot_wider and get back to the format you started with. E.g.
df |>
pivot_wider(values_from = c(starts_with("var_"), "new_var"),
names_from = "time",
values_fn = list) |>
unnest(everything())
As you suggested, a solution using a loop is definitely possible.
# times as unique non-alphabetical parts of column names
times <- unique(gsub('[[:alpha:]]', '', names(df)))
for (time in times) {
# column names for current time
xyz <- paste0(c('x', 'y', 'z'), time)
df[[paste0('var', time)]] <-
ifelse(df[[xyz[1]]]>0 & (df[[xyz[2]]]<.5 | df[[xyz[3]]]<.5), 0, 1)
}
Another way I can think of is transforming the data into a 3D array (observartion × variable × time) so that you can actually do the computation for all variables at once.
times <- unique(gsub('[[:alpha:]]', '', names(df)))
df.arr <- sapply(c('x', 'y', 'z'),
function(var) as.matrix(df[, paste0(var, times)]),
simplify='array')
new.vars <- ifelse(df.arr[, , 1]>0 & (df.arr[, , 2]<0.5 | df.arr[, , 3]<0.5), 0, 1)
colnames(new.vars) <- paste0('var', times)
cbind(df, new.vars)
Here, sapply creates a matrix from columns of measurings for each variable at different times and stacks them into a 3D array.
If you trust (or ensure) correct ordering of columns in the data frame, instead of using sapply you can create the array just by modifying the object's dimensions. I didn't do any benchmarking but i guess this could be the most computationally efficient solution (if it should matter).
df.arr <- as.matrix(df)
dim(df.arr) <- c(dim(df.arr) / c(1, 3), 3)

Using for loop to multiply variables and create product variables

I need to write a for loop to calculate the product of year variables (e.g. var1874) * price variables (e.g. num1874), creating a new variable for each year and its corresponding price value (e.g. newvar1874).
A tibble: 4 x 7
cty var1874 var1875 var1876 num1874 num1875 num1876
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.78 0.83 0.99 2.64 2.8 3.1
2 2 0.69 0.69 0.89 2.3 2.3 2.58
3 3 0.42 0.48 0.59 2.28 2.44 2.64
4 4 0.82 0.94 1.09 2.28 2.36 3
# Here's the code I have so far...
var.num_ <- dim(mydata)[2]
for(i in 1:length(vars)) {
mydata[, var.num + i] <- recode(mydata[, vars[i]], "newvar")
names(mydata)[var.num + i] <- paste0("newvar", vars[i])
}
# This code generates new variables, but they all have NA values.
# I also get this error message:
Unreplaced values treated as NA as .x is not compatible.
Please specify replacements exhaustively or supply .default
# Does anyone have any tips or general suggestions on how to use
# a for loop to multiply variables and create new product variables?
You could simply do the subset then multiply:
df[2:4]*df[5:7]
var1874 var1875 var1876
1 2.0592 2.3240 3.0690
2 1.5870 1.5870 2.2962
3 0.9576 1.1712 1.5576
4 1.8696 2.2184 3.2700
If you do not know the number of columns, but the data is arranged as given, then you could simply do:
df %>%
transmute(across(starts_with('var')) * across(starts_with('num')))
var1874 var1875 var1876
1 2.0592 2.3240 3.0690
2 1.5870 1.5870 2.2962
3 0.9576 1.1712 1.5576
4 1.8696 2.2184 3.2700
What if the data is disorganized? ie you are not sure that the way the year is aranged is the way num is arranged. Then do:
df %>%
pivot_longer(-cty, names_to = c('.value', 'grp'),
names_pattern = '(\\D+)(\\d{4})') %>%
mutate(newvar = var * num) %>%
pivot_wider(cty, grp, values_from = newvar, names_prefix = 'newvar')
cty newvar1874 newvar1875 newvar1876
<int> <dbl> <dbl> <dbl>
1 1 2.06 2.32 3.07
2 2 1.59 1.59 2.30
3 3 0.958 1.17 1.56
4 4 1.87 2.22 3.27
In base R, the same can be done as:
Using base R we can take advantage of vector multiplication.
# Data --------------------------------------------------------------------
df <- data.frame(cty = 1:4,
var1874 = c(.78, .69, .42, .82),
var1875 = c(.83, .69, .48, .94),
var1876 = c(.99, .89, .59, 1.09),
num1874 = c(2.64, 2.3, 2.28, 2.28),
num1875 = c(2.8, 2.3, 2.44, 2.36),
num1876 = c(3.1, 2.58, 2.64, 3))
# code --------------------------------------------------------------------
nms_var <- paste0(c('var187'), 4:6)
nms_num <- gsub('var', 'num', nms_var)
nms_result <- gsub('var', 'new_var', nms_var)
for (i in 1:length(nms_var)) {
df[, nms_result[[i]]] <- df[, nms_var[i]] * df[, nms_num[i]]
}
df
#> cty var1874 var1875 var1876 num1874 num1875 num1876 new_var1874 new_var1875
#> 1 1 0.78 0.83 0.99 2.64 2.80 3.10 2.0592 2.3240
#> 2 2 0.69 0.69 0.89 2.30 2.30 2.58 1.5870 1.5870
#> 3 3 0.42 0.48 0.59 2.28 2.44 2.64 0.9576 1.1712
#> 4 4 0.82 0.94 1.09 2.28 2.36 3.00 1.8696 2.2184
#> new_var1876
#> 1 3.0690
#> 2 2.2962
#> 3 1.5576
#> 4 3.2700
Created on 2021-11-27 by the reprex package (v2.0.1)

applying a rolling error function using rollapply

I am trying to calculate a rolling error function in R where I take the last 30 days and compute the rmse, move forward 1 day and take the last 30 days from this point and compute the new rmse.
My data looks like the following where I have a date and two values:
dates val1 val2
1 2010-01-01 -0.479526441 -0.294149127
2 2010-01-02 -0.860588950 0.426375720
3 2010-01-03 -0.660643894 -1.483020861
4 2010-01-04 -0.938748812 -1.631823690
Where am I going wrong in the code?
Data & attempt:
d <- data.frame(
dates = seq(from = as.Date("2010-01-01"), to = as.Date("2012-12-31"), by = 1),
val1 = rnorm(1096),
val2 = rnorm(1096)
)
d %>%
mutate(rollRMSE = rollapply(., width = 30, by = 1, FUN = Metrics::rmse(val1, val2)))
EDIT : setting window size as a variable
I have sliced the steps, rollapply will result in 29 less data with a 30 window so may want to collect this in another tibble.
suppressPackageStartupMessages(library(dplyr))
d <- data.frame(
dates = seq(from = as.Date("2010-01-01"), to = as.Date("2012-12-31"), by = 1),
val1 = rnorm(1096),
val2 = rnorm(1096)
)
rse <- function(x, y){sqrt((x-y)**2)}
# assign window size for moving average
window <- 30
d %>% tibble::as_tibble() %>%
mutate(err = rse(val1, val2),
roll = c(zoo::rollapply(err, width = window, by = 1, FUN = mean), rep(NA, window -1) )
)
#> # A tibble: 1,096 x 5
#> dates val1 val2 err roll
#> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 2010-01-01 -0.0248 1.18 1.20 1.40
#> 2 2010-01-02 -0.684 0.603 1.29 1.38
#> 3 2010-01-03 -0.344 -1.92 1.58 1.42
#> 4 2010-01-04 0.447 0.319 0.128 1.38
#> 5 2010-01-05 0.123 -0.810 0.933 1.42
#> 6 2010-01-06 0.00384 2.29 2.29 1.43
#> 7 2010-01-07 -1.51 -1.03 0.487 1.39
#> 8 2010-01-08 0.394 -1.25 1.64 1.41
#> 9 2010-01-09 -1.30 1.61 2.92 1.42
#> 10 2010-01-10 0.394 0.117 0.278 1.33
#> # ... with 1,086 more rows
You can do it manually with base R, i.e.
sapply(seq(0, (nrow(d) - 30)), function(i) Metrics::rmse(d$val1[(seq(30) + i)], d$val2[(seq(30) + i)]))

Calculate predicted model results by iterating through variables

I have several models fit to predict an outcome y = x1 + x2 + .....+x22. That's a fair number of predictors and a fair number of models. My customers want to know what's the marginal impact of each X on the estimated y. The models may include splines and interaction terms. I can do this, but it's cumbersome and requires loops or a lot of copy paste, which is slow or error prone. Can I do this better by writing my function differently and/or using purrr or an *apply function? Reproducible example is below. Ideally, I could write one function and apply it to longdata.
## create my fake data.
library(tidyverse)
library (rms)
ltrans<- function(l1){
newvar <- exp(l1)/(exp(l1)+1)
return(newvar)
}
set.seed(123)
mystates <- c("AL","AR","TN")
mydf <- data.frame(idno = seq(1:1500),state = rep(mystates,500))
mydf$x1[mydf$state=='AL'] <- rnorm(500,50,7)
mydf$x1[mydf$state=='AR'] <- rnorm(500,55,8)
mydf$x1[mydf$state=='TN'] <- rnorm(500,48,10)
mydf$x2 <- sample(1:5,500, replace = T)
mydf$x3 <- (abs(rnorm(1500,10,20)))^2
mydf$outcome <- as.numeric(cut2(sample(1:100,1500,replace = T),95))-1
dd<- datadist(mydf)
options(datadist = 'dd')
m1 <- lrm(outcome ~ x1 + x2+ rcs(x3,3), data = mydf)
dothemath <- function(x1 = x1ref,x2 = x2ref,x3 = x3ref) {
ltrans(-2.1802256-0.01114239*x1+0.050319692*x2-0.00079289232* x3+
7.6508189e-10*pmax(x3-7.4686271,0)^3-9.0897627e-10*pmax(x3- 217.97865,0)^3+
1.4389439e-10*pmax(x3-1337.2538,0)^3)}
x1ref <- 51.4
x2ref <- 3
x3ref <- 217.9
dothemath() ## 0.0591
mydf$referent <- dothemath()
mydf$thisobs <- dothemath(x1 = mydf$x1, x2 = mydf$x2, x3 = mydf$x3)
mydf$predicted <- predict(m1,mydf,type = "fitted.ind") ## yes, matches.
mydf$x1_marginaleffect <- dothemath(x1= mydf$x1)/mydf$referent
mydf$x2_marginaleffect <- dothemath(x2 = mydf$x2)/mydf$referent
mydf$x3_marginaleffect <- dothemath(x3 = mydf$x3)/mydf$referent
## can I do this with long data?
longdata <- mydf %>%
select(idno,state,referent,thisobs,x1,x2,x3) %>%
gather(varname,value,x1:x3)
##longdata$marginaleffect <- dothemath(longdata$varname = longdata$value) ## no, this does not work.
## I need to communicate to the function which variable it is evaluating.
longdata$marginaleffect[longdata$varname=="x1"] <- dothemath(x1 = longdata$value[longdata$varname=="x1"])/
longdata$referent[longdata$varname=="x1"]
longdata$marginaleffect[longdata$varname=="x2"] <- dothemath(x2 = longdata$value[longdata$varname=="x2"])/
longdata$referent[longdata$varname=="x2"]
longdata$marginaleffect[longdata$varname=="x3"] <- dothemath(x3 = longdata$value[longdata$varname=="x3"])/
longdata$referent[longdata$varname=="x3"]
testing<- inner_join(longdata[longdata$varname=="x1",c(1,7)],mydf[,c(1,10)])
head(testing) ## yes, both methods work.
Mostly you're just talking about a grouped mutate, with the caveat that dothemath is built such that you need to specify the variable name, which can be done by using do.call or purrr::invoke to call it on a named list of parameters:
longdata <- longdata %>%
group_by(varname) %>%
mutate(marginaleffect = invoke(dothemath, setNames(list(value), varname[1])) / referent)
longdata
#> # A tibble: 4,500 x 7
#> # Groups: varname [3]
#> idno state referent thisobs varname value marginaleffect
#> <int> <fct> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 1 AL 0.0591 0.0688 x1 46.1 1.06
#> 2 2 AR 0.0591 0.0516 x1 50.2 1.01
#> 3 3 TN 0.0591 0.0727 x1 38.0 1.15
#> 4 4 AL 0.0591 0.0667 x1 48.4 1.03
#> 5 5 AR 0.0591 0.0515 x1 47.1 1.05
#> 6 6 TN 0.0591 0.0484 x1 37.6 1.15
#> 7 7 AL 0.0591 0.0519 x1 60.9 0.905
#> 8 8 AR 0.0591 0.0531 x1 63.2 0.883
#> 9 9 TN 0.0591 0.0780 x1 47.8 1.04
#> 10 10 AL 0.0591 0.0575 x1 50.5 1.01
#> # ... with 4,490 more rows
# the first values look similar
inner_join(longdata[longdata$varname == "x1", c(1,7)], mydf[,c(1,10)])
#> Joining, by = "idno"
#> # A tibble: 1,500 x 3
#> idno marginaleffect x1_marginaleffect
#> <int> <dbl> <dbl>
#> 1 1 1.06 1.06
#> 2 2 1.01 1.01
#> 3 3 1.15 1.15
#> 4 4 1.03 1.03
#> 5 5 1.05 1.05
#> 6 6 1.15 1.15
#> 7 7 0.905 0.905
#> 8 8 0.883 0.883
#> 9 9 1.04 1.04
#> 10 10 1.01 1.01
#> # ... with 1,490 more rows
# check everything is the same
mydf %>%
gather(varname, marginaleffect, x1_marginaleffect:x3_marginaleffect) %>%
select(idno, varname, marginaleffect) %>%
mutate(varname = substr(varname, 1, 2)) %>%
all_equal(select(longdata, idno, varname, marginaleffect))
#> [1] TRUE
It may be easier to reconfigure dothemath to take an additional parameter of the variable name so as to avoid the gymnastics.

calculating qchisq in on a sparklyr tbl

I need to use the qchisq function on a column of a sparklyr data frame.
The problem is that it seems that qchisq function is not implemented in Spark. If I am reading the error message below correctly, sparklyr tried execute a function called "QCHISQ", however this doesn't exist neither in Hive SQL, nor in Spark.
In general, is there a way to run arbitrary functions that are not implemented in Hive or Spark, with sparklyr? I know about spark_apply, but haven't figured out how to configure it yet.
> mydf = data.frame(beta=runif(100, -5, 5), pval = runif(100, 0.001, 0.1))
> mydf_tbl = copy_to(con, mydf)
> mydf_tbl
# Source: table<mydf> [?? x 2]
# Database: spark_connection
beta pval
<dbl> <dbl>
1 3.42 0.0913
2 -1.72 0.0629
3 0.515 0.0335
4 -3.12 0.0717
5 -2.12 0.0253
6 1.36 0.00640
7 -3.33 0.0896
8 1.36 0.0235
9 0.619 0.0414
10 4.73 0.0416
> mydf_tbl %>% mutate(se = sqrt(beta^2/qchisq(pval)))
Error: org.apache.spark.sql.AnalysisException: Undefined function: 'QCHISQ'.
This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 49
As you noted you can use spark_apply:
mydf_tbl %>%
spark_apply(function(df)
dplyr::mutate(df, se = sqrt(beta^2/qchisq(pval, df = 12))))
# # Source: table<sparklyr_tmp_14bd5feacf5> [?? x 3]
# # Database: spark_connection
# beta pval X3
# <dbl> <dbl> <dbl>
# 1 1.66 0.0763 0.686
# 2 0.153 0.0872 0.0623
# 3 2.96 0.0485 1.30
# 4 4.86 0.0349 2.22
# 5 -1.82 0.0712 0.760
# 6 2.34 0.0295 1.10
# 7 3.54 0.0297 1.65
# 8 4.57 0.0784 1.88
# 9 4.94 0.0394 2.23
# 10 -0.610 0.0906 0.246
# # ... with more rows
but fair warning - it is embarrassingly slow. Unfortunately you don't have alternative here, short of writing your own Scala / Java extensions.
In the end I've used an horrible hack, which for this case works fine.
Another solution would have been to write a User Defined Function (UDF), but sparklyr doesn't support it yet: https://github.com/rstudio/sparklyr/issues/1052
This is the hack I've used. In short, I precompute a qchisq table, upload it as a sparklyr object, then join. If I compare this with results calculated on a local data frame, I get a correlation of r=0.99999990902236146617.
#' #param n: number of significant digits to use
> check_precomputed_strategy = function(n) {
chisq = data.frame(pval=seq(0, 1, 1/(10**(n)))) %>%
mutate(qval=qchisq(pval, df=1, lower.tail = FALSE)) %>%
mutate(pval_s = as.character(round(as.integer(pval*10**n),0)))
chisq %>% head %>% print
chisq_tbl = copy_to(con, chisq, overwrite=T)
mydf = data.frame(beta=runif(100, -5, 5), pval = runif(100, 0.001, 0.1)) %>%
mutate(se1 = sqrt(beta^2/qchisq(pval, df=1, lower.tail = FALSE)))
mydf_tbl = copy_to(con, mydf)
mydf_tbl.up = mydf_tbl %>%
mutate(pval_s=as.character(round(as.integer(pval*10**n),0))) %>%
left_join(chisq_tbl, by="pval_s") %>%
mutate(se=sqrt(beta^2 / qval)) %>%
collect %>%
filter(!duplicated(beta))
mydf_tbl.up %>% head %>% print
mydf_tbl.up %>% filter(complete.cases(.)) %>% nrow %>% print
mydf_tbl.up %>% filter(complete.cases(.)) %>% select(se, se1) %>% cor
}
> check_precomputed_strategy(4)
pval qval pval_s
1 0.00000000000000000000000 Inf 0
2 0.00010000000000000000479 15.136705226623396570 1
3 0.00020000000000000000958 13.831083619091122827 2
4 0.00030000000000000002793 13.070394140069462097 3
5 0.00040000000000000001917 12.532193305401813532 4
6 0.00050000000000000001041 12.115665146397173402 5
# A tibble: 6 x 8
beta pval.x se1 myvar pval_s pval.y qval se
<dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 3.42 0.0913 2.03 1. 912 0.0912 2.85 2.03
2 -1.72 0.0629 0.927 1. 628 0.0628 3.46 0.927
3 0.515 0.0335 0.242 1. 335 0.0335 4.52 0.242
4 -3.12 0.0717 1.73 1. 716 0.0716 3.25 1.73
5 -2.12 0.0253 0.947 1. 253 0.0253 5.00 0.946
6 1.36 0.00640 0.498 1. 63 0.00630 7.46 0.497
[1] 100
se se1
se 1.00000000000000000000 0.99999990902236146617
se1 0.99999990902236146617 1.00000000000000000000

Resources