I have the following data
# Data-generating model in lavaan syntax: two factors (x, y) with four
# indicators each and fixed loadings, plus a regression of y on x.
sim_model <- "
x =~ 1.3*x1 + 1.2*x2 + 1.1*x3 + 1.2*x4
y =~ 1*y1 + 1.2*y2 + 1.3*y3 + 0.9*y4
y ~ 0.6*x
"
sim_data <- lavaan::simulateData(sim_model)

# Same structure without fixed values (not used in the code shown here).
model <- "
x =~ x1 + x2 + x3 + x4
y =~ y1 + y2 + y3 + y4
y ~ x
"

# One row per simulated column (row names = column names) holding its SD.
sd_d <- data.frame(sd_d = vapply(sim_data, sd, numeric(1)))
I need to multiply each column of sim_data by its corresponding standard deviation, which is stored in sd_d.
Any help?
Perhaps this works? It adds columns containing your requested values to the end of the data frame.
# Append one "SDx<NAME>" column per existing column, holding that column
# multiplied by its own standard deviation (e.g. SDxX1 = sd(x1) * x1).
# across() replaces the hand-written expression per column, so the code keeps
# working when the set of columns changes. Functions are namespace-qualified
# because this snippet does not attach dplyr itself.
sim_data |>
  dplyr::mutate(
    dplyr::across(
      dplyr::everything(),
      \(col) sd(col) * col,
      .names = "SDx{toupper(.col)}"
    )
  )
How about this:
library(dplyr)
# Data-generating model in lavaan syntax: two factors (x, y) with four
# indicators each and fixed loadings, plus a regression of y on x.
sim_model <- "
x =~ 1.3*x1 + 1.2*x2 + 1.1*x3 + 1.2*x4
y =~ 1*y1 + 1.2*y2 + 1.3*y3 + 0.9*y4
y ~ 0.6*x
"
sim_data <- lavaan::simulateData(sim_model)

# Same structure without fixed values (not used in the code shown here).
model <- "
x =~ x1 + x2 + x3 + x4
y =~ y1 + y2 + y3 + y4
y ~ x
"

# Per-column standard deviations; vapply() works column by column instead of
# coercing the data frame to a matrix as apply() does.
sd_d <- data.frame(sd_d = vapply(sim_data, sd, numeric(1)))

# Multiply column i by the i-th standard deviation. The result is a numeric
# matrix (rows = observations, one unnamed column per input column).
# seq_along() is safe for zero columns and vapply() fixes the shape of each
# piece, unlike sapply(1:ncol(...)).
new_sim_data <- vapply(
  seq_along(sim_data),
  function(i) sim_data[[i]] * sd_d$sd_d[i],
  numeric(nrow(sim_data))
)
head(new_sim_data)
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,] 3.9817268 4.4529682 1.7882380 2.621278 -0.3147092 0.7048940
#> [2,] -0.8972617 -0.4562149 -0.1165654 1.318948 0.4359371 0.4220787
#> [3,] 1.9188604 0.9183960 5.3265835 4.025215 1.6147254 1.8146241
#> [4,] -4.2811180 -0.4473838 -1.4982330 -1.325111 -2.5972828 -0.7700888
#> [5,] -2.8633480 2.4930664 1.9927546 -1.186898 3.8177569 4.4855348
#> [6,] 1.0197316 0.7887374 2.2055450 2.039363 2.8806220 9.2947559
#> [,7] [,8]
#> [1,] 2.2355215 -1.3586282
#> [2,] -1.0632624 -0.6658058
#> [3,] 0.1758628 0.1879555
#> [4,] -3.0958775 2.8376086
#> [5,] 4.5647521 3.5110156
#> [6,] 7.0123519 1.3295521
Created on 2022-11-26 by the reprex package (v2.0.1)
Here are a couple of tidyverse solutions. It would be easiest to just calculate everything you need from within the dataframe. If you really need to use sd_d then it is possible, but a little more convoluted:
library(tidyverse)
#easiest solution
# For every column from x1 to y4, add a "<name>_sd" column holding the column
# multiplied by its own standard deviation.
sim_data |>
  mutate(
    across(
      x1:y4,
      list(sd = function(v) sd(v) * v)
    )
  )
#> # A tibble: 500 x 16
#> x1 x2 x3 x4 y1 y2 y3 y4 x1_sd x2_sd x3_sd
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.926 0.468 0.476 -0.252 -2.53 -0.374 0.690 0.354 -1.47 0.728 0.697
#> 2 -0.813 -0.782 -0.820 0.300 0.978 1.48 -0.566 1.45 -1.29 -1.22 -1.20
#> 3 -3.28 -2.18 -2.48 -1.12 -0.235 -0.562 0.919 1.78 -5.20 -3.39 -3.63
#> 4 1.18 2.49 2.52 2.46 2.97 3.31 0.881 -1.10 1.87 3.87 3.69
#> 5 1.33 0.328 1.83 -2.04 0.204 1.40 1.94 1.77 2.10 0.511 2.68
#> 6 -0.388 -3.12 -1.03 0.137 -0.333 -0.167 0.482 -0.417 -0.615 -4.85 -1.50
#> 7 -1.12 -1.83 0.143 -1.75 0.169 -1.51 -2.40 -0.793 -1.77 -2.84 0.209
#> 8 -3.04 -0.899 -2.77 -0.586 -1.98 -1.13 -2.97 -1.64 -4.81 -1.40 -4.06
#> 9 -0.856 1.46 1.37 -0.617 1.45 -0.149 -0.169 0.842 -1.36 2.26 2.00
#> 10 0.492 0.506 0.616 -1.23 -0.841 0.132 -0.528 -1.51 0.779 0.786 0.902
#> # ... with 490 more rows, and 5 more variables: x4_sd <dbl>, y1_sd <dbl>,
#> # y2_sd <dbl>, y3_sd <dbl>, y4_sd <dbl>
#less easy solution
# Same result as above, but taking the standard deviations from sd_d.
# sd_d has one row per column of sim_data, with the column name as row name,
# so the value for the column being processed is looked up by that name.
# A single across() call replaces building one tibble per column with
# map_dfc() and re-ordering afterwards.
sim_data |>
  as_tibble() |>
  mutate(
    across(
      everything(),
      \(x) x * sd_d[cur_column(), "sd_d"],
      .names = "{.col}_sd"
    )
  )
#> # A tibble: 500 x 16
#> x1 x2 x3 x4 y1 y2 y3 y4 x1_sd x2_sd x3_sd
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.926 0.468 0.476 -0.252 -2.53 -0.374 0.690 0.354 -1.47 0.728 0.697
#> 2 -0.813 -0.782 -0.820 0.300 0.978 1.48 -0.566 1.45 -1.29 -1.22 -1.20
#> 3 -3.28 -2.18 -2.48 -1.12 -0.235 -0.562 0.919 1.78 -5.20 -3.39 -3.63
#> 4 1.18 2.49 2.52 2.46 2.97 3.31 0.881 -1.10 1.87 3.87 3.69
#> 5 1.33 0.328 1.83 -2.04 0.204 1.40 1.94 1.77 2.10 0.511 2.68
#> 6 -0.388 -3.12 -1.03 0.137 -0.333 -0.167 0.482 -0.417 -0.615 -4.85 -1.50
#> 7 -1.12 -1.83 0.143 -1.75 0.169 -1.51 -2.40 -0.793 -1.77 -2.84 0.209
#> 8 -3.04 -0.899 -2.77 -0.586 -1.98 -1.13 -2.97 -1.64 -4.81 -1.40 -4.06
#> 9 -0.856 1.46 1.37 -0.617 1.45 -0.149 -0.169 0.842 -1.36 2.26 2.00
#> 10 0.492 0.506 0.616 -1.23 -0.841 0.132 -0.528 -1.51 0.779 0.786 0.902
#> # ... with 490 more rows, and 5 more variables: x4_sd <dbl>, y1_sd <dbl>,
#> # y2_sd <dbl>, y3_sd <dbl>, y4_sd <dbl>
Related
I am looking for a nice tidy/dplyr approach to compute the difference between all possible pairs of columns (including both orders, e.g. A-B and B-A) in a dataframe.
I start with df and would like to end with end_df:
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.1
#> Warning: package 'tibble' was built under R version 4.2.1
# Example data: three columns of 10 standard-normal draws each.
df <- tibble(
  A = rnorm(10),
  B = rnorm(10),
  C = rnorm(10)
)
print(df)
#> # A tibble: 10 × 3
#> A B C
#> <dbl> <dbl> <dbl>
#> 1 -0.292 1.27 0.783
#> 2 -1.11 0.254 -0.410
#> 3 2.05 1.67 1.35
#> 4 1.31 0.0329 -1.29
#> 5 -1.67 -0.379 -0.696
#> 6 -1.02 -0.686 1.43
#> 7 -0.291 -0.0728 0.336
#> 8 -0.507 0.350 1.70
#> 9 -0.707 0.961 -0.493
#> 10 0.0459 -0.299 -0.0113
# Desired result, written out by hand: the difference for every ordered pair
# of distinct columns.
end_df <- mutate(
  df,
  `A-B` = A - B,
  `A-C` = A - C,
  `B-A` = B - A,
  `B-C` = B - C,
  `C-A` = C - A,
  `C-B` = C - B
)
print(end_df)
#> # A tibble: 10 × 9
#> A B C `A-B` `A-C` `B-A` `B-C` `C-A` `C-B`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.292 1.27 0.783 -1.56 -1.08 1.56 0.482 1.08 -0.482
#> 2 -1.11 0.254 -0.410 -1.37 -0.703 1.37 0.664 0.703 -0.664
#> 3 2.05 1.67 1.35 0.380 0.702 -0.380 0.321 -0.702 -0.321
#> 4 1.31 0.0329 -1.29 1.28 2.60 -1.28 1.33 -2.60 -1.33
#> 5 -1.67 -0.379 -0.696 -1.29 -0.975 1.29 0.317 0.975 -0.317
#> 6 -1.02 -0.686 1.43 -0.334 -2.44 0.334 -2.11 2.44 2.11
#> 7 -0.291 -0.0728 0.336 -0.218 -0.627 0.218 -0.409 0.627 0.409
#> 8 -0.507 0.350 1.70 -0.857 -2.20 0.857 -1.35 2.20 1.35
#> 9 -0.707 0.961 -0.493 -1.67 -0.215 1.67 1.45 0.215 -1.45
#> 10 0.0459 -0.299 -0.0113 0.345 0.0572 -0.345 -0.288 -0.0572 0.288
Created on 2022-09-05 by the reprex package (v2.0.1)
You can get a list of all of the pairs of names, create a list of new columns computed from the original dataframe, and then bind them:
# All ordered pairs of distinct column names. stringsAsFactors = FALSE keeps
# the names as character, so df[[x]] below looks columns up by name instead of
# being indexed with a factor.
pairs <- expand.grid(names(df), names(df), stringsAsFactors = FALSE) %>%
  filter(Var1 != Var2)

# Build one single-column tibble per pair, named "<first>-<second>", then bind
# them all to the right of the original columns.
map2(pairs$Var1, pairs$Var2, function(x, y) {
  as_tibble_col(df[[x]] - df[[y]], str_c(x, "-", y))
}) %>%
  bind_cols(df, .)
# # A tibble: 10 × 9
# A B C `B-A` `C-A` `A-B` `C-B` `A-C` `B-C`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0.199 0.110 0.0148 -0.0895 -0.184 0.0895 -0.0948 0.184 0.0948
# 2 -0.851 -0.413 0.338 0.438 1.19 -0.438 0.751 -1.19 -0.751
# 3 -1.13 0.112 -1.97 1.24 -0.835 -1.24 -2.08 0.835 2.08
# 4 0.597 -2.89 -2.32 -3.49 -2.92 3.49 0.572 2.92 -0.572
# 5 -1.10 0.0953 0.996 1.19 2.09 -1.19 0.900 -2.09 -0.900
# 6 0.0191 0.500 1.17 0.481 1.15 -0.481 0.667 -1.15 -0.667
# 7 0.416 0.949 -0.865 0.533 -1.28 -0.533 -1.81 1.28 1.81
# 8 1.84 -1.66 -1.39 -3.50 -3.23 3.50 0.267 3.23 -0.267
# 9 0.406 -1.48 -1.33 -1.89 -1.74 1.89 0.149 1.74 -0.149
# 10 0.393 -0.491 -0.139 -0.884 -0.532 0.884 0.352 0.532 -0.352
I want to tidy up a dataframe and automate the process. Given the following data.frame:
library(survival)
library(rms)
library(broom)
library(tidyverse)
# Fit a coxph() model on the `lung` data; age and meal.cal enter through rcs()
# spline terms (3 and 4 knots), sex and ph.ecog enter directly.
res.cox <- coxph(Surv(time, status) ~ rcs(age, 3) + sex + ph.ecog +
rcs(meal.cal, 4), data = lung)
# Turn the fitted model into a table with one row per coefficient.
output <- tidy(res.cox)
output
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 rcs(age, 3)age -0.00306 0.0219 -0.140 0.889
# 2 rcs(age, 3)age' 0.0154 0.0261 0.592 0.554
# 3 sex -0.525 0.192 -2.74 0.00620
# 4 ph.ecog 0.421 0.131 3.22 0.00128
# 5 rcs(meal.cal, 4)meal.cal -0.000416 0.00104 -0.400 0.689
# 6 rcs(meal.cal, 4)meal.cal' 0.00118 0.00232 0.509 0.611
# 7 rcs(meal.cal, 4)meal.cal'' -0.00659 0.0114 -0.577 0.564
I want to remove the rcs-spline information from term variable and be left with:
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 age -0.00306 0.0219 -0.140 0.889
# 2 s2 age 0.0154 0.0261 0.592 0.554
# 3 sex -0.525 0.192 -2.74 0.00620
# 4 ph.ecog 0.421 0.131 3.22 0.00128
# 5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
# 6 s2 meal.cal 0.00118 0.00232 0.509 0.611
# 7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
I want the solution to easily work for other cases too so when you increase the number of knots:
# Same kind of model with more knots (4 for age, 6 for meal.cal), which
# produces more spline rows per variable in the tidied table.
res.cox2 <- coxph(Surv(time, status) ~ rcs(age, 4) + rcs(meal.cal, 6) +
sex + ph.ecog, data = lung)
# Turn the fitted model into a table with one row per coefficient.
output2 <- tidy(res.cox2)
output2
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 rcs(age, 4)age 0.0419 0.0403 1.04 0.298
# 2 rcs(age, 4)age' -0.101 0.0806 -1.26 0.208
# 3 rcs(age, 4)age'' 0.569 0.388 1.47 0.142
# 4 rcs(meal.cal, 6)meal.cal -0.000974 0.00155 -0.631 0.528
# 5 rcs(meal.cal, 6)meal.cal' 0.00751 0.0115 0.655 0.512
# 6 rcs(meal.cal, 6)meal.cal'' -0.0217 0.0358 -0.607 0.544
# 7 rcs(meal.cal, 6)meal.cal''' 0.0614 0.123 0.501 0.616
# 8 rcs(meal.cal, 6)meal.cal'''' -0.0775 0.163 -0.475 0.634
# 9 sex -0.552 0.195 -2.83 0.00465
# 10 ph.ecog 0.440 0.132 3.34 0.000835
you would be left with:
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 age 0.0419 0.0403 1.04 0.298
# 2 s2 age -0.101 0.0806 -1.26 0.208
# 3 s3 age 0.569 0.388 1.47 0.142
# 4 s1 meal.cal -0.000974 0.00155 -0.631 0.528
# 5 s2 meal.cal 0.00751 0.0115 0.655 0.512
# 6 s3 meal.cal -0.0217 0.0358 -0.607 0.544
# 7 s4 meal.cal 0.0614 0.123 0.501 0.616
# 8 s5 meal.cal -0.0775 0.163 -0.475 0.634
# 9 sex -0.552 0.195 -2.83 0.00465
# 10 ph.ecog 0.440 0.132 3.34 0.000835
etc...
My attempt so far gets me some of the way but I am not sure of the best way to deal with the ', '' (note the first term does not contain a ') etc.:
# Flag the spline rows, then strip the leading "rcs(...)" part from the term.
# The trailing quote marks are left in place.
output %>%
  mutate(
    rcs_indicator = str_detect(term, fixed("rcs(")),
    term = str_remove_all(term, "rcs\\(.+?\\)")
  )
# term estimate std.error statistic p.value rcs_indicator
# <chr> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 age -0.00306 0.0219 -0.140 0.889 TRUE
# 2 age' 0.0154 0.0261 0.592 0.554 TRUE
# 3 sex -0.525 0.192 -2.74 0.00620 FALSE
# 4 ph.ecog 0.421 0.131 3.22 0.00128 FALSE
# 5 meal.cal -0.000416 0.00104 -0.400 0.689 TRUE
# 6 meal.cal' 0.00118 0.00232 0.509 0.611 TRUE
# 7 meal.cal'' -0.00659 0.0114 -0.577 0.564 TRUE
It might be useful to just work with the terms I need to change directly:
# Distinct spline terms once the quote marks have been removed.
output$term %>%
  str_subset(fixed("rcs(")) %>%
  str_remove_all("'") %>%
  unique()
# [1] "rcs(age, 3)age" "rcs(meal.cal, 4)meal.cal"
I feel there is a way to do this in a simpler way than the steps I am doing.
Any suggestions?
Thanks
This one is clunky but it should work:
library(dplyr)
library(stringr)
# Rename spline rows to "s<k> <variable>", where k counts the rows of the same
# variable in their original order; all other rows are left unchanged.
output %>%
  # Spline terms look like "rcs(<variable>, <knots>)<variable>'...". Capture the
  # whole variable name (NA for non-spline rows) so that two variables sharing
  # a first letter are not merged into one group.
  mutate(group = str_match(term, "^rcs\\(([^,)]+)")[, 2]) %>%
  group_by(group) %>%
  # row_number() restarts within each variable, giving s1, s2, ... and it is
  # not limited to single-digit counts.
  mutate(
    term = if_else(is.na(group), term, paste0("s", row_number(), " ", group))
  ) %>%
  ungroup() %>%
  select(-group)
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 s1 age -0.00306 0.0219 -0.140 0.889
2 s2 age 0.0154 0.0261 0.592 0.554
3 sex -0.525 0.192 -2.74 0.00620
4 ph.ecog 0.421 0.131 3.22 0.00128
5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
6 s2 meal.cal 0.00118 0.00232 0.509 0.611
7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
This is also less elegant than desired, but should work for multiple knots
# Rename spline rows to "s<k> <variable>", where k is the number of quote marks
# in the term plus one; all other rows are left unchanged.
output %>%
  mutate(
    # \\d+ (rather than \\d) also recognises knot counts of 10 or more.
    is_spline = grepl("^rcs\\(.*?, \\d+\\)", term),
    n_term = str_count(term, "'") + 1,
    pre = ifelse(is_spline, paste0("s", n_term, " "), ""),
    # Drop the leading "rcs(<variable>, <knots>)" and any trailing quote marks.
    term = paste0(pre, gsub("(^rcs\\(.*?, \\d+\\))|('+$)", "", term))
  ) %>%
  select(-is_spline, -n_term, -pre)
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 s1 age -0.00306 0.0219 -0.140 0.889
#> 2 s2 age 0.0154 0.0261 0.592 0.554
#> 3 sex -0.525 0.192 -2.74 0.00620
#> 4 ph.ecog 0.421 0.131 3.22 0.00128
#> 5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
#> 6 s2 meal.cal 0.00118 0.00232 0.509 0.611
#> 7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
I am trying to read formulas from a text file and execute them. The following works:
# Write four column formulas, one per line, to a scratch file.
writeLines(
  text = "new_cols_e = b + c
new_cols_f = (a*pi +b)/c - d
new_cols_g = log(b)
new_cols_h = b * a",
  con = "/tmp/test.txt"
)

# Example data: columns a, b, c and d with 10 standard-normal draws each.
set.seed(1)
df <- map_df(set_names(letters[1:4]), function(nm) rnorm(10))
Read the formulas from the text file, then mutate:
# Read one formula per line from the file and evaluate each one against df.
readLines(con = '/tmp/test.txt') %>%
# Name each formula by the text before its (last) "=", with whitespace trimmed.
set_names(.,str_trim(sub("(.*)=.*","\\1",.),"both")) %>%
# Each formula is parsed and evaluated separately with df as the data, so it
# never sees the results of the other formulas in the file.
map(~eval(parse(text=.x),df)) %>%
# Append the computed columns to the right of df.
bind_cols(df,.)
# A tibble: 10 x 8
a b c d new_cols_e new_cols_f new_cols_g new_cols_h
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.951 -0.259 0.398 -0.390 0.139 7.24 NaN -0.246
2 -0.389 0.394 -0.408 0.376 -0.0131 1.66 -0.930 -0.154
3 -0.284 -0.852 1.32 0.244 0.472 -1.56 NaN 0.242
4 0.857 2.65 -0.701 -1.43 1.95 -6.19 0.974 2.27
5 1.72 0.156 -0.581 1.78 -0.425 -11.4 -1.86 0.268
6 0.270 1.13 -1.00 0.134 0.129 -2.11 0.122 0.305
7 -0.422 -2.29 -0.668 0.766 -2.96 4.65 NaN 0.966
8 -1.19 0.741 0.945 0.955 1.69 -4.12 -0.300 -0.881
9 -0.331 -1.32 0.434 -0.0506 -0.883 -5.38 NaN 0.436
10 -0.940 0.920 1.01 -0.306 1.92 -1.72 -0.0836 -0.864
but this will not work, because new_cols_g is not recognized
# Same file as before plus a fifth formula that refers to new_cols_g, a column
# defined by an earlier line of the same file.
writeLines(
  text = "new_cols_e = b + c
new_cols_f = (a*pi +b)/c - d
new_cols_g = log(b)
new_cols_h = b * a
new_cols_i = new_cols_g - b",
  con = "/tmp/test.txt"
)
What I want to do is...
a b c d new_cols_e new_cols_f new_cols_g new_cols_h new_cols_i
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.951 -0.259 0.398 -0.390 0.139 7.24 NaN -0.246 NaN
2 -0.389 0.394 -0.408 0.376 -0.0131 1.66 -0.930 -0.154 -1.32
3 -0.284 -0.852 1.32 0.244 0.472 -1.56 NaN 0.242 NaN
4 0.857 2.65 -0.701 -1.43 1.95 -6.19 0.974 2.27 -1.67
5 1.72 0.156 -0.581 1.78 -0.425 -11.4 -1.86 0.268 -2.01
6 0.270 1.13 -1.00 0.134 0.129 -2.11 0.122 0.305 -1.01
7 -0.422 -2.29 -0.668 0.766 -2.96 4.65 NaN 0.966 NaN
8 -1.19 0.741 0.945 0.955 1.69 -4.12 -0.300 -0.881 -1.04
9 -0.331 -1.32 0.434 -0.0506 -0.883 -5.38 NaN 0.436 NaN
10 -0.940 0.920 1.01 -0.306 1.92 -1.72 -0.0836 -0.864 -1.00
I hope my question is clear and feasible. Thank you a lot for your help!
It is usually not advised to evaluate code as string. For your case here is a way you could do it.
library(dplyr)
# Each non-blank line of the file has the form "<new column> = <expression>".
formula_lines <- readLines(con = "/tmp/test.txt")
formula_lines <- formula_lines[nzchar(trimws(formula_lines))]

# Split every line at its first "=": the left side names the new column, the
# right side is parsed into an unevaluated R expression. Only the right-hand
# sides are parsed, instead of pasting a whole pipeline together as a string.
new_names <- trimws(sub("=.*$", "", formula_lines))
new_exprs <- lapply(sub("^[^=]*=", "", formula_lines), str2lang)

# NOTE: this still executes code taken from the file, so only use trusted input.
# mutate() creates the columns in order, so a later formula can use a column
# made by an earlier one (new_cols_i uses new_cols_g).
df %>%
  mutate(!!!setNames(new_exprs, new_names))
# a b c d new_cols_e new_cols_f new_cols_g new_cols_h new_cols_i
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -0.626 1.51 0.919 1.36 2.43 -1.86 0.413 -0.947 -1.10
# 2 0.184 0.390 0.782 -0.103 1.17 1.34 -0.942 0.0716 -1.33
# 3 -0.836 -0.621 0.0746 0.388 -0.547 -43.9 NaN 0.519 NaN
# 4 1.60 -2.21 -1.99 -0.0538 -4.20 -1.35 NaN -3.53 NaN
# 5 0.330 1.12 0.620 -1.38 1.74 4.86 0.118 0.371 -1.01
# 6 -0.820 -0.0449 -0.0561 -0.415 -0.101 47.1 NaN 0.0369 NaN
# 7 0.487 -0.0162 -0.156 -0.394 -0.172 -9.33 NaN -0.00789 NaN
# 8 0.738 0.944 -1.47 -0.0593 -0.527 -2.16 -0.0578 0.697 -1.00
# 9 0.576 0.821 -0.478 1.10 0.343 -6.60 -0.197 0.473 -1.02
#10 -0.305 0.594 0.418 0.763 1.01 -1.64 -0.521 -0.181 -1.11
data
# Input file used by the answer above: five formulas, one per line.
writeLines(
  text = "new_cols_e = b + c
new_cols_f = (a*pi +b)/c - d
new_cols_g = log(b)
new_cols_h = b * a
new_cols_i = new_cols_g - b",
  con = "/tmp/test.txt"
)
A data frame named have contains three variables:
from - character - the name of a measure
to - character - the name of another measure
covariance - numeric - the covariance between the two measures
Here's a link to the data. Below is the result of head(have):
from to covariance
a_airportscreener a_airportscreener 4.419285714
a_airportscreener e_airportscreener -1.328928571
a_airportscreener g_airportscreener -3.038928571
a_airportscreener p_airportscreener 0.3292857143
a_airportscreener pres_airportscreener 0.6452857143
a_automechanic a_automechanic 2.635535714
a_automechanic e_automechanic -0.3439285714
I want to create a data frame called need that records the covariances between prefixed versions of the same job title in separate columns. For example, the first row would look like:
job a_a a_e a_g a_p a_pres e_a e_e e_g e_p e_pres g_a g_e g_g g_p g_pres p_a p_e p_g p_p p_pres pres_a pres_e pres_g pres_p pres_pres
airportscreener 4.419 -1.329 -3.039 0.329 0.645 -1.329 2.333 2.441 -1.015 0.659 -3.039 2.441 14.253 3.070 0.977 0.329 -1.015 3.070 6.505 0.366 0.645 0.659 0.977 0.366 0.697
(I rounded the values in have to keep the example of need on the page, but this is not part of the question.)
Try this approach on your complete data
library(tidyverse)
# NOTE(review): cov_mat is not defined in this snippet; presumably it is a
# covariance matrix / data frame whose row and column names are measure names.
cov_mat %>%
rownames_to_column() %>%
# Long format: one row per (rowname, name) cell, with the cell in `value`.
pivot_longer(cols =-rowname) %>%
# key = "<column prefix>_<row prefix>", a prefix being the text before the
# first "_". The "\\1" replacement has no capture group to refer to; judging by
# the keys in the output below it acts as an empty replacement.
mutate(key = paste0(sub("_.*", "\\1", name), "_", sub("_.*", "\\1", rowname)),
# NOTE(review): these two patterns only match names containing at least two
# "_" characters; confirm against the actual names in cov_mat.
rowname = sub(".*_(.*)_.*", "\\1", rowname),
name = sub(".*_(.*)_.*", "\\1", name)) %>%
# Keep only the cells whose row and column reduce to the same value.
filter(rowname == name) %>%
select(-rowname) %>%
# One row per remaining `name`, one column per key.
pivot_wider(names_from = key, values_from = value)
# A tibble: 58 x 26
# name a_a e_a g_a p_a pres_a a_e e_e g_e .....
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 airp… 4.42 -1.33 -3.04 0.329 0.645 -1.33 2.33 2.44
# 2 auto… 2.64 -0.344 6.26 -0.712 -0.595 -0.344 0.499 0.113
# 3 auto… 2.67 -0.466 2.36 -0.106 -0.878 -0.466 0.72 -5.95
# 4 blkj… 2.50 0.529 -6.79 0.0129 -0.0666 0.529 1.56 -8.58
# 5 blkt… 1.04 -0.00143 4.86 0.993 -0.194 -0.00143 0.229 -1.69
# 6 brid… 4.15 2.05 -11.5 -1.21 0.453 2.05 2.05 -9.09
# 7 cart… 1.79 0.458 -4.22 0.451 -0.410 0.458 1.23 3.54
# 8 chem… 2.29 0.479 12.4 -0.0384 -0.164 0.479 0.811 2.15
# 9 clth… 4.10 1.15 -18.9 1.77 0.728 1.15 1.7 -4.00
#10 coag… 2.23 -0.382 -7.79 -0.0190 0.460 -0.382 0.342 4.11
This is not as elegant as @Ronak Shah's answer, but I had been working on something similar and thought it might be worth sharing for someone out there. It also uses pivot_longer and pivot_wider from the latest tidyr.
library(readxl)
library(tidyr)
library(dplyr)
# Long covariance table with columns from, to and covariance.
df <- read_excel("cov_data.xlsx")

# Reshape to one row per job and one column per ordered pair of prefixes.
need <- df %>%
  # Split `from` at "_" into its prefix (from1) and the job name.
  separate(from, into = c("from1", "job"), sep = "_") %>%
  # Keep only the prefix of `to`; the rest is dropped.
  separate(to, into = "to1", extra = "drop", sep = "_") %>%
  # Build both orderings of the prefix pair so each covariance appears under
  # "<from>_<to>" and "<to>_<from>". TRUE/FALSE are spelled out because T and
  # F are ordinary variables that can be reassigned.
  unite(comb1, from1, to1, remove = FALSE) %>%
  unite(comb2, to1, from1, remove = TRUE) %>%
  pivot_longer(c(comb1, comb2)) %>%
  dplyr::select(-name) %>%
  # Rows where both orderings are the same (e.g. "a_a") would be duplicated.
  distinct() %>%
  pivot_wider(names_from = value, values_from = covariance) %>%
  # job first, then the columns in alphabetical order of their names.
  dplyr::select(job, order(colnames(.)))
# A tibble: 58 x 26
job a_a a_e a_g a_p a_pres e_a e_e e_g e_p e_pres g_a g_e g_g g_p g_pres p_a p_e p_g
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 airp… 4.42 -1.33 -3.04 0.329 0.645 -1.33 2.33 2.44 -1.02 0.659 -3.04 2.44 14.3 3.07 0.977 0.329 -1.02 3.07
2 auto… 2.64 -0.344 6.26 -0.712 -0.595 -0.344 0.499 0.113 0.891 0.321 6.26 0.113 203. 5.16 0.645 -0.712 0.891 5.16
3 auto… 2.67 -0.466 2.36 -0.106 -0.878 -0.466 0.72 -5.95 0.431 0.194 2.36 -5.95 252. 4.65 -4.64 -0.106 0.431 4.65
4 blkj… 2.50 0.529 -6.79 0.0129 -0.0666 0.529 1.56 -8.58 -0.703 0.384 -6.79 -8.58 247. 2.11 1.68 0.0129 -0.703 2.11
5 blkt… 1.04 -0.00143 4.86 0.993 -0.194 -0.00143 0.229 -1.69 0.276 -0.0351 4.86 -1.69 260. 14.3 2.44 0.993 0.276 14.3
6 brid… 4.15 2.05 -11.5 -1.21 0.453 2.05 2.05 -9.09 -0.342 0.576 -11.5 -9.09 326. -2.07 0.992 -1.21 -0.342 -2.07
7 cart… 1.79 0.458 -4.22 0.451 -0.410 0.458 1.23 3.54 0.43 -0.0674 -4.22 3.54 478. 10.5 -1.21 0.451 0.43 10.5
8 chem… 2.29 0.479 12.4 -0.0384 -0.164 0.479 0.811 2.15 0.784 0.0469 12.4 2.15 238. 2.58 -2.05 -0.0384 0.784 2.58
9 clth… 4.10 1.15 -18.9 1.77 0.728 1.15 1.7 -4.00 1.65 0.133 -18.9 -4.00 193. -17.1 -6.81 1.77 1.65 -17.1
10 coag… 2.23 -0.382 -7.79 -0.0190 0.460 -0.382 0.342 4.11 0.161 0.0398 -7.79 4.11 444. 1.96 -7.55 -0.0190 0.161 1.96
I'm simulating data with a varying number of variables. As part of this, I need to calculate a model matrix with all possible interactions. See the following reprex for an example. I am able to get all two-way interactions by specifying the formula as ~ .*.. However, this particular dataset has 3 variables (ndim <- 3). I can get all two- and three-way interactions by specifying the formula as ~ .^3. The issue is that there may be 4+ variables, so I would like to be able to generalize this. I have tried specifying the formula as ~ .^ndim, but this throws an error.
Is there a way to define the power in the formula with a variable?
library(tidyverse)
library(mvtnorm)
library(modelr)
# Number of simulated variables.
ndim <- 3

# 100 multivariate-normal draws with zero mean; columns named dim_1 ... dim_<ndim>.
data <- as_tibble(
  rmvnorm(100, mean = rep(0, ndim)),
  .name_repair = function(nms) paste0("dim_", seq_len(ndim))
)

# Main effects plus all two-way interactions.
model_matrix(data, ~ .*.)
#> # A tibble: 100 x 7
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 1 more variable: `dim_2:dim_3` <dbl>
# A literal power of 3 also adds the three-way interaction column.
model_matrix(data, ~ .^3)
#> # A tibble: 100 x 8
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 2 more variables: `dim_2:dim_3` <dbl>,
#> # `dim_1:dim_2:dim_3` <dbl>
# Using the variable `ndim` as the power raises the error shown below.
model_matrix(data, ~.^ndim)
#> Error in terms.formula(object, data = data): invalid power in formula
Created on 2019-02-15 by the reprex package (v0.2.1)
You can use as.formula with paste0 in model_matrix:
# Build the formula text "~ .^<ndim>" from the variable, then convert it to a
# formula so the power is a literal number.
model_matrix(
  data,
  as.formula(paste("~ .", ndim, sep = "^"))
)