Loop on several variables with the same suffix in R - r

I have a database which looks like this but with much more rows and columns.
Several variables (x,y,z) measured at different time (1,2,3).
df <-
tibble(
x1 = rnorm(10),
x2 = rnorm(10),
x3 = rnorm(10),
y1 = rnorm(10),
y2 = rnorm(10),
y3 = rnorm(10),
z1 = rnorm(10),
z2 = rnorm(10),
z3 = rnorm(10),
)
I am trying to create dummies variables from the variables with the same suffix (measured at the same time) like this:
df <- df %>%
mutate(var1= ifelse(x1>0 & (y1<0.5 |z1<0.5),0,1)) %>%
mutate(var2= ifelse(x2>0 & (y2<0.5 |z2<0.5),0,1)) %>%
mutate(var3= ifelse(x3>0 & (y1<0.5 |z3<0.5),0,1))
I am used to coding in SAS or Stata, so I would like to use a function or a loop because I have many more variables in my database.
But I think I don't have the right approach in R to deal with this.
Thank you very much for your help !

{dplyover} makes this kind of operation easy (disclaimer: I'm the maintainer), given that your desired output contains a typo:
I think you want to use all variables with the same digit (1, 2, 3 and so on) in each calculation:
df <- df %>%
mutate(var1= ifelse(x1>0 & (y1<0.5 |z1<0.5),0,1)) %>%
mutate(var2= ifelse(x2>0 & (y2<0.5 |z2<0.5),0,1)) %>%
mutate(var3= ifelse(x3>0 & (y3<0.5 |z3<0.5),0,1))
If that is the case we can use dplyover::over to apply the same function over a vector. Here we construct the vector with extract_names("[0-9]{1}$") which gets us all ending numbers of our variable names here: c(1,2,3). We can then construct the variable names using a special syntax: .("x{.x}"). Here .x evaluates to the first number in our vector so it would return the object name x1 (not a string!) which we can use inside the function argument of over.
library(dplyr)
library(dplyover) # Only on GitHub: https://github.com/TimTeaFan/dplyover
df %>%
mutate(over(cut_names("^[a-z]{1}"),
~ ifelse(.("x{.x}") > 0 & (.("y{.x}") < 0.5 | .("z{.x}") < 0.5), 0, 1),
.names = "var{x}"
))
#> # A tibble: 10 x 12
#> x1 x2 x3 y1 y2 y3 z1 z2 z3 var1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.690 0.550 0.911 0.203 -0.111 0.530 -2.09 0.189 0.147 0
#> 2 -0.238 1.32 -0.145 0.744 1.05 -0.448 2.05 -1.04 1.50 1
#> 3 0.888 0.898 -1.46 -1.87 -1.14 1.59 1.91 -0.155 1.46 0
#> 4 -2.78 -1.34 -0.486 -0.0674 0.246 0.141 0.154 1.08 -0.319 1
#> 5 -1.20 0.835 1.28 -1.32 -0.674 0.115 0.362 1.06 0.515 1
#> 6 0.622 -0.713 0.0525 1.79 -0.427 0.819 -1.53 -0.885 0.00237 0
#> 7 -2.54 0.0197 0.942 0.230 -1.37 -1.02 -1.55 -0.721 -1.06 1
#> 8 -0.434 1.97 -0.274 0.848 -0.482 -0.422 0.197 0.497 -0.600 1
#> 9 -0.316 -0.219 0.467 -1.97 -0.718 -0.442 -1.39 -0.877 1.52 1
#> 10 -1.03 0.226 2.04 0.432 -1.02 -0.535 0.954 -1.11 0.804 1
#> # ... with 2 more variables: var2 <dbl>, var3 <dbl>
Alternatively we can use dplyr::across and use cur_column(), get() and gsub() to alter the name of the column on the fly. To name the new variables correctly we use gsub() in the .names argument of across and wrap it in curly braces {} to evaluate the expression.
library(dplyr)
df %>%
mutate(across(starts_with("x"),
~ {
cur_c <- dplyr::cur_column()
ifelse(.x > 0 & (get(gsub("x","y", cur_c)) < 0.5 | get(gsub("x","z", cur_c)) < 0.5), 0, 1)
},
.names = '{gsub("x", "var", .col)}'
))
#> # A tibble: 10 x 12
#> x1 x2 x3 y1 y2 y3 z1 z2 z3 var1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.423 -1.42 -1.15 -1.54 1.92 -0.511 -0.739 0.501 0.451 1
#> 2 -0.358 0.164 0.971 -1.61 1.96 -0.675 -0.0188 -1.88 1.63 1
#> 3 -0.453 -0.758 -0.258 -0.449 -0.795 -0.362 -1.81 -0.780 -1.90 1
#> 4 0.855 0.335 -1.36 0.796 -0.674 -1.37 -1.42 -1.03 -0.560 0
#> 5 0.436 -0.0487 -0.639 0.352 -0.325 -0.893 -0.746 0.0548 -0.394 0
#> 6 -0.228 -0.240 -0.854 -0.197 0.884 0.118 -0.0713 1.09 -0.0289 1
#> 7 -0.949 -0.231 0.428 0.290 -0.803 2.15 -1.11 -0.202 -1.21 1
#> 8 1.88 -0.0980 -2.60 -1.86 -0.0258 -0.965 -1.52 -0.539 0.108 0
#> 9 0.221 1.58 -1.46 -0.806 0.749 0.506 1.09 0.523 1.86 0
#> 10 0.0238 -0.389 -0.474 0.512 -0.448 0.178 0.529 1.56 -1.12 1
#> # ... with 2 more variables: var2 <dbl>, var3 <dbl>
Created on 2022-06-08 by the reprex package (v2.0.1)

You could restructure your data along the principles of tidy data (see e.g. https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html).
Here to a long format and using tidyverse:
library(tidyverse)
df <-
df |>
pivot_longer(everything()) |>
separate(name, c("var", "time"), sep = "(?=[0-9])") |>
pivot_wider(id_col = "time",
names_from = "var",
names_prefix = "var_",
values_from = "value",
values_fn = list) |>
unnest(-time) |>
mutate(new_var = ifelse(var_x > 0 & (var_y < 0.5 | var_z < 0.5), 0, 1))
df
You would probably want to keep the data in a long format, but if you want, you can pivot_wider and get back to the format you started with. E.g.
df |>
pivot_wider(values_from = c(starts_with("var_"), "new_var"),
names_from = "time",
values_fn = list) |>
unnest(everything())

As you suggested, a solution using a loop is definitely possible.
# times as unique non-alphabetical parts of column names
times <- unique(gsub('[[:alpha:]]', '', names(df)))
for (time in times) {
# column names for current time
xyz <- paste0(c('x', 'y', 'z'), time)
df[[paste0('var', time)]] <-
ifelse(df[[xyz[1]]]>0 & (df[[xyz[2]]]<.5 | df[[xyz[3]]]<.5), 0, 1)
}
Another way I can think of is transforming the data into a 3D array (observartion × variable × time) so that you can actually do the computation for all variables at once.
times <- unique(gsub('[[:alpha:]]', '', names(df)))
df.arr <- sapply(c('x', 'y', 'z'),
function(var) as.matrix(df[, paste0(var, times)]),
simplify='array')
new.vars <- ifelse(df.arr[, , 1]>0 & (df.arr[, , 2]<0.5 | df.arr[, , 3]<0.5), 0, 1)
colnames(new.vars) <- paste0('var', times)
cbind(df, new.vars)
Here, sapply creates a matrix from columns of measurings for each variable at different times and stacks them into a 3D array.
If you trust (or ensure) correct ordering of columns in the data frame, instead of using sapply you can create the array just by modifying the object's dimensions. I didn't do any benchmarking but i guess this could be the most computationally efficient solution (if it should matter).
df.arr <- as.matrix(df)
dim(df.arr) <- c(dim(df.arr) / c(1, 3), 3)

Related

Implementing mutate within a function called with variables

I'd like to call a function several times with different variables, each time setting a value to a new variable in the data frame. Here is my failed attempt. I appreciate any help!
dat <- tibble(score1 = runif(10), score2 = score1 * 2)
call_mutate_with_vars <- function(df, var1, var2, var3) {
df <-
df %>%
mutate({{var3}} := ifelse({{var1}} >= {{var2}},0,{{var2}} - {{var1}}))
df
}
call_mutate_with_vars(dat,"score1","score2","newscore")
I receive this error:
Error: Problem with `mutate()` column `newscore`.
i `newscore = ifelse("score1" >= "score2", 0, "score2" - "score1")`.
x non-numeric argument to binary operator
Run `rlang::last_error()` to see where the error occurred.
The embracing operator {{ is meant for variables passed as symbols (i.e., fx(var), not fx("var")). If you need to pass your variables as characters, you can instead use the .data pronoun.
So you can either pass symbols to your current function:
library(dplyr)
set.seed(1)
call_mutate_with_vars <- function(df, var1, var2, var3) {
df %>%
mutate(
{{var3}} := ifelse(
{{var1}} >= {{var2}},
0,
{{var2}} - {{var1}}
)
)
}
call_mutate_with_vars(dat, score1, score2, newscore)
#> # A tibble: 10 x 3
#> score1 score2 newscore
#> <dbl> <dbl> <dbl>
#> 1 0.266 0.531 0.266
#> 2 0.372 0.744 0.372
#> 3 0.573 1.15 0.573
#> 4 0.908 1.82 0.908
#> 5 0.202 0.403 0.202
#> 6 0.898 1.80 0.898
#> 7 0.945 1.89 0.945
#> 8 0.661 1.32 0.661
#> 9 0.629 1.26 0.629
#> 10 0.0618 0.124 0.0618
Or rewrite the function to handle characters:
call_mutate_with_chr_vars <- function(df, var1, var2, var3) {
df %>%
mutate(
!!var3 := ifelse( # note use of !! unquote operator
.data[[var1]] >= .data[[var2]], # to use character as name
0,
.data[[var2]] - .data[[var1]]
)
)
}
call_mutate_with_chr_vars(dat, "score1", "score2", "newscore")
#> # A tibble: 10 x 3
#> score1 score2 newscore
#> <dbl> <dbl> <dbl>
#> 1 0.266 0.531 0.266
#> 2 0.372 0.744 0.372
#> 3 0.573 1.15 0.573
#> 4 0.908 1.82 0.908
#> 5 0.202 0.403 0.202
#> 6 0.898 1.80 0.898
#> 7 0.945 1.89 0.945
#> 8 0.661 1.32 0.661
#> 9 0.629 1.26 0.629
#> 10 0.0618 0.124 0.0618
Created on 2022-03-07 by the reprex package (v2.0.1)
The "Programming with dplyr" vignette is a nice reference for these sorts of problems.

How to flip column variable names to row labels? [duplicate]

This question already has answers here:
Transposing a dataframe maintaining the first column as heading
(5 answers)
Transposition of a Tibble Using Pivot_Longer() and Pivot_Wider (Tidyverse) [duplicate]
(1 answer)
Closed 1 year ago.
I have the below tibble.
A tibble: 2 x 6
Trial_Type CT_tib_all CT_lum_all CT_tho_all CT_gps_all CT_vest_all
* <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Pre 0.244 0.209 0.309 0.315 0.310
2 Post 0.254 0.211 0.302 0.313 0.316
I would like to flip the rows and columns so I end up with a 6 x 2 tibble, but I'm not sure of the easiest way to do this. How do I get the column variable names to become row labels and the row labels as column variables (Pre and Post)?
You can use pivot_longer and pivot_wider -
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -Trial_Type) %>%
pivot_wider(names_from = Trial_Type, values_from = value)
# name Pre Post
# <chr> <dbl> <dbl>
#1 CT_tib_all 0.244 0.254
#2 CT_lum_all 0.209 0.211
#3 CT_tho_all 0.309 0.302
#4 CT_gps_all 0.315 0.313
#5 CT_vest_all 0.31 0.316
In data.table -
library(data.table)
dcast(melt(setDT(df), id.vars = 'Trial_Type'),
variable~Trial_Type, vvalue.var = 'value')
t i.e. transpose function in baseR may also be used, in combination with tibble::rownames_to_column and tibble::column_to_rownames
library(tibble)
library(dplyr)
df <- read.table(text = 'Trial_Type CT_tib_all CT_lum_all CT_tho_all CT_gps_all CT_vest_all
Pre 0.244 0.209 0.309 0.315 0.310
Post 0.254 0.211 0.302 0.313 0.316', header = T)
df %>% tibble::column_to_rownames('Trial_Type') %>%
t() %>% as.data.frame() %>%
rownames_to_column('Trial_Type')
#> Trial_Type Pre Post
#> 1 CT_tib_all 0.244 0.254
#> 2 CT_lum_all 0.209 0.211
#> 3 CT_tho_all 0.309 0.302
#> 4 CT_gps_all 0.315 0.313
#> 5 CT_vest_all 0.310 0.316
Created on 2021-05-28 by the reprex package (v2.0.0)
We can use transpose from data.table
data.table::transpose(df, make.names = 'Trial_Type', keep.names = 'name')
# name Pre Post
#1 CT_tib_all 0.244 0.254
#2 CT_lum_all 0.209 0.211
#3 CT_tho_all 0.309 0.302
#4 CT_gps_all 0.315 0.313#
5 CT_vest_all 0.310 0.316
A base R option using reshape
reshape(
cbind(name = df$Trial_Type, stack(df[-1])),
direction = "wide",
idvar = "ind",
timevar = "name"
)
gives
ind values.Pre values.Post
1 CT_tib_all 0.244 0.254
3 CT_lum_all 0.209 0.211
5 CT_tho_all 0.309 0.302
7 CT_gps_all 0.315 0.313
9 CT_vest_all 0.310 0.316

Stack dataframe columns with two distinct suffix into two columns, preferably using tidyverse [duplicate]

This question already has answers here:
tidyverse pivot_longer several sets of columns, but avoid intermediate mutate_wider steps [duplicate]
(3 answers)
Closed 1 year ago.
Suppose I have a list of dataframes, mylist and I want to do the same operation to each dataframes.
Say my dataframes look like this:
set.seed(1)
test.tbl <- tibble(
case1_diff = rnorm(10,0),
case1_avg = rnorm(10,0),
case2_diff = rnorm(10,0),
case2_avg = rnorm(10,0),
case3_diff = rnorm(10,0),
case3_avg = rnorm(10,0),
case4_diff = rnorm(10,0),
case4_avg = rnorm(10,0),
)
> head(test.tbl)
# A tibble: 6 x 8
case1_diff case1_avg case2_diff case2_avg case3_diff case3_avg case4_diff case4_avg
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.626 1.51 0.919 1.36 -0.165 0.398 2.40 0.476
2 0.184 0.390 0.782 -0.103 -0.253 -0.612 -0.0392 -0.710
3 -0.836 -0.621 0.0746 0.388 0.697 0.341 0.690 0.611
4 1.60 -2.21 -1.99 -0.0538 0.557 -1.13 0.0280 -0.934
5 0.330 1.12 0.620 -1.38 -0.689 1.43 -0.743 -1.25
6 -0.820 -0.0449 -0.0561 -0.415 -0.707 1.98 0.189 0.291
and I wish to stack them into two columns of diff and avg as 40 x 2 dataframe.
Normally, I would just separate it into two objects through select(ends_with("diff")) and select(ends_with("avg")), pivot them, then bind_rows.
However, since my original object is list, I want to do it using map like:
mylist %>%
map(*insertfunction1*) %>%
map(*insertfunction2*)
meaning I would need to do this without separating. I would also need to make sure that diff and avg is correctly paired.
What I have tried so far is
test.tbl %>%
pivot_longer(cols=everything(),
names_to = "metric") %>%
mutate(metric = str_remove(metric,"[0-9]+")) %>%
pivot_wider(id_cols=metric,
values_from=value)
We don't need both pivot_longer and pivot_wider. it can be done within pivot_longer itself by specifying the names_to and the names_sep argument
library(dplyr)
library(tidyr)
test.tbl %>%
pivot_longer(cols = everything(), names_to = c('grp', '.value'),
names_sep = "_") %>%
select(-grp)
-output
# A tibble: 40 x 2
# diff avg
# <dbl> <dbl>
# 1 -0.626 1.51
# 2 0.919 1.36
# 3 -0.165 0.398
# 4 2.40 0.476
# 5 0.184 0.390
# 6 0.782 -0.103
# 7 -0.253 -0.612
# 8 -0.0392 -0.710
# 9 -0.836 -0.621
#10 0.0746 0.388
# … with 30 more rows

Produce a matrix from every row (tidyverse)

I am trying to implement analyses across a posterior of matrices. What I start with is a tibble of k^2 columns, where k is the dimensions of the matrix. The ith row forms the matrix of the ith iteration.
So, for example for a 3x3 matrix, this is:
set.seed(12)
n <- 1000
z1z1 <- rnorm(n, 5, 1)
z2z2 <- rnorm(n, 5, 1)
z3z3 <- rnorm(n, 5, 1)
z1z2 <- rnorm(n, 0, 1)
z1z3 <- rnorm(n, 0, 1)
z2z3 <- rnorm(n, 0, 1)
post3 <- as_tibble(matrix(c(z1z1, z1z2, z1z3,
z1z2, z2z2, z2z3,
z1z3, z2z3, z3z3),
ncol = 9))
post3
Giving:
# A tibble: 1,000 x 9
V1 V2 V3 V4 V5 V6 V7 V8 V9
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 3.52 -0.618 2.96 -0.618 2.48 -0.634 2.96 -0.634 5.98
2 6.58 -0.827 0.0909 -0.827 5.52 -1.84 0.0909 -1.84 6.20
3 4.04 1.48 -1.66 1.48 6.58 0.166 -1.66 0.166 5.58
4 4.08 -1.01 0.809 -1.01 5.49 0.607 0.809 0.607 6.55
5 3.00 0.582 -0.485 0.582 6.20 0.0765 -0.485 0.0765 6.38
6 4.73 0.718 1.97 0.718 4.00 -0.147 1.97 -0.147 4.35
7 4.68 -0.372 0.572 -0.372 4.65 -1.68 0.572 -1.68 3.83
8 4.37 -0.809 0.883 -0.809 3.96 0.985 0.883 0.985 4.97
9 4.89 0.405 0.686 0.405 6.02 0.252 0.686 0.252 6.29
10 5.43 0.124 0.199 0.124 5.75 0.354 0.199 0.354 4.20
# ... with 990 more rows
Where this is the matrix in the first iteration:
k <- sqrt(length(post3))
matrix(post3[1,], nrow = k)
[,1] [,2] [,3]
[1,] 3.519432 -0.618137 2.962622
[2,] -0.618137 2.479522 -0.6338298
[3,] 2.962622 -0.6338298 5.977552
I am then working along this posterior to calculate the dominance of the first eigenvector:
post3 %>%
rowwise %>%
mutate(
pre_eig = list(eigen(matrix(c(V1, V2, V3, V4, V5, V6, V7, V8, V9), nrow = k))),
dom = pre_eig[[1]][1] / sum(pre_eig[[1]][1:k])) %>%
select('dom')
Giving:
# A tibble: 1,000 x 1
dom
<dbl>
1 0.676
2 0.437
3 0.462
4 0.427
5 0.414
6 0.504
7 0.474
8 0.429
9 0.394
10 0.383
# ... with 990 more rows
What I would like to do is make this script versatile so that it can take posteriors for any value of k. The issue I am having is in how to define the matrix without having to hand write all the column names - when applying this to 2000x2000 matrices I don't want to write out V1, V2, V3... V4000000!
I tried a few things (including ...eigen(matrix(c(paste0('V', 1:(k^2))), nrow = k)))..., which I think is not working because it wants V1, V2... rather than "V1", "V2"...) and I all out of ideas. How do I get it to automatically take the column names from the posterior tibble?
I would then be able to use the exact same piece of script for example on post3 <- as_tibble(matrix(c(z1z1, z1z2, z1z2, z2z2), ncol = 4))...
You can avoid naming all the columns explicitly if you gather each row's values into key-value pairs:
library(tidyr)
post3 %>%
# add row ID (so that results can be sorted back into original order)
mutate(row.id = seq(1, n())) %>%
# convert each row to long format, with values sorted from 1st to k^2th column
gather(position, value, -row.id) %>%
mutate(position = as.numeric(gsub("^V", "", position))) %>%
arrange(row.id, position) %>%
select(-position) %>%
# group by row ID & calculate
group_by(row.id) %>%
summarise(pre_eig = list(eigen(matrix(value, nrow = k))[["values"]]),
dom = pre_eig[[1]][1] / sum(pre_eig[[1]][1:k])) %>%
ungroup() %>%
# sort results in original order
arrange(row.id) %>%
select(dom)
The results should be the same as before:
# A tibble: 1,000 x 1
dom
<dbl>
1 0.676
2 0.437
3 0.462
4 0.427
5 0.414
6 0.504
7 0.474
8 0.429
9 0.394
10 0.383
# ... with 990 more rows

calculating qchisq in on a sparklyr tbl

I need to use the qchisq function on a column of a sparklyr data frame.
The problem is that it seems that qchisq function is not implemented in Spark. If I am reading the error message below correctly, sparklyr tried execute a function called "QCHISQ", however this doesn't exist neither in Hive SQL, nor in Spark.
In general, is there a way to run arbitrary functions that are not implemented in Hive or Spark, with sparklyr? I know about spark_apply, but haven't figured out how to configure it yet.
> mydf = data.frame(beta=runif(100, -5, 5), pval = runif(100, 0.001, 0.1))
> mydf_tbl = copy_to(con, mydf)
> mydf_tbl
# Source: table<mydf> [?? x 2]
# Database: spark_connection
beta pval
<dbl> <dbl>
1 3.42 0.0913
2 -1.72 0.0629
3 0.515 0.0335
4 -3.12 0.0717
5 -2.12 0.0253
6 1.36 0.00640
7 -3.33 0.0896
8 1.36 0.0235
9 0.619 0.0414
10 4.73 0.0416
> mydf_tbl %>% mutate(se = sqrt(beta^2/qchisq(pval)))
Error: org.apache.spark.sql.AnalysisException: Undefined function: 'QCHISQ'.
This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 49
As you noted you can use spark_apply:
mydf_tbl %>%
spark_apply(function(df)
dplyr::mutate(df, se = sqrt(beta^2/qchisq(pval, df = 12))))
# # Source: table<sparklyr_tmp_14bd5feacf5> [?? x 3]
# # Database: spark_connection
# beta pval X3
# <dbl> <dbl> <dbl>
# 1 1.66 0.0763 0.686
# 2 0.153 0.0872 0.0623
# 3 2.96 0.0485 1.30
# 4 4.86 0.0349 2.22
# 5 -1.82 0.0712 0.760
# 6 2.34 0.0295 1.10
# 7 3.54 0.0297 1.65
# 8 4.57 0.0784 1.88
# 9 4.94 0.0394 2.23
# 10 -0.610 0.0906 0.246
# # ... with more rows
but fair warning - it is embarrassingly slow. Unfortunately you don't have alternative here, short of writing your own Scala / Java extensions.
In the end I've used an horrible hack, which for this case works fine.
Another solution would have been to write a User Defined Function (UDF), but sparklyr doesn't support it yet: https://github.com/rstudio/sparklyr/issues/1052
This is the hack I've used. In short, I precompute a qchisq table, upload it as a sparklyr object, then join. If I compare this with results calculated on a local data frame, I get a correlation of r=0.99999990902236146617.
#' #param n: number of significant digits to use
> check_precomputed_strategy = function(n) {
chisq = data.frame(pval=seq(0, 1, 1/(10**(n)))) %>%
mutate(qval=qchisq(pval, df=1, lower.tail = FALSE)) %>%
mutate(pval_s = as.character(round(as.integer(pval*10**n),0)))
chisq %>% head %>% print
chisq_tbl = copy_to(con, chisq, overwrite=T)
mydf = data.frame(beta=runif(100, -5, 5), pval = runif(100, 0.001, 0.1)) %>%
mutate(se1 = sqrt(beta^2/qchisq(pval, df=1, lower.tail = FALSE)))
mydf_tbl = copy_to(con, mydf)
mydf_tbl.up = mydf_tbl %>%
mutate(pval_s=as.character(round(as.integer(pval*10**n),0))) %>%
left_join(chisq_tbl, by="pval_s") %>%
mutate(se=sqrt(beta^2 / qval)) %>%
collect %>%
filter(!duplicated(beta))
mydf_tbl.up %>% head %>% print
mydf_tbl.up %>% filter(complete.cases(.)) %>% nrow %>% print
mydf_tbl.up %>% filter(complete.cases(.)) %>% select(se, se1) %>% cor
}
> check_precomputed_strategy(4)
pval qval pval_s
1 0.00000000000000000000000 Inf 0
2 0.00010000000000000000479 15.136705226623396570 1
3 0.00020000000000000000958 13.831083619091122827 2
4 0.00030000000000000002793 13.070394140069462097 3
5 0.00040000000000000001917 12.532193305401813532 4
6 0.00050000000000000001041 12.115665146397173402 5
# A tibble: 6 x 8
beta pval.x se1 myvar pval_s pval.y qval se
<dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 3.42 0.0913 2.03 1. 912 0.0912 2.85 2.03
2 -1.72 0.0629 0.927 1. 628 0.0628 3.46 0.927
3 0.515 0.0335 0.242 1. 335 0.0335 4.52 0.242
4 -3.12 0.0717 1.73 1. 716 0.0716 3.25 1.73
5 -2.12 0.0253 0.947 1. 253 0.0253 5.00 0.946
6 1.36 0.00640 0.498 1. 63 0.00630 7.46 0.497
[1] 100
se se1
se 1.00000000000000000000 0.99999990902236146617
se1 0.99999990902236146617 1.00000000000000000000

Resources