I have a dataset with multiple columns that follow a name pattern, and I need to calculate new columns, each derived from two other columns (in the example below, the ratio of the matching _mean and _sd columns). I am looking for a tidyverse option, but I would like to avoid pivot_longer because the dataset has more than a million rows.
Example dataset
library(dplyr)
# Example data: three monthly mean columns followed by three monthly sd
# columns, each holding ten uniform random draws (no seed is set).
df <- tibble(
  jan_mean = runif(n = 10),
  feb_mean = runif(n = 10),
  mar_mean = runif(n = 10),
  jan_sd = runif(n = 10),
  feb_sd = runif(n = 10),
  mar_sd = runif(n = 10)
)
I can do it manually like this:
# Hand-written version: one ratio column per month.
df2 <- mutate(
  df,
  jan_cv = jan_mean / jan_sd,
  feb_cv = feb_mean / feb_sd,
  mar_cv = mar_mean / mar_sd
)
This is a simple example, but I have similar operations for monthly values.
EDIT 1
I need to do this for large datasets and I was worried that pivot_longer would be quite time-consuming, so I did a quick comparison of the three methods.
Method 1 is the manual way, Method 2 is the short version suggested by @Tarjae, and Method 3 uses pivot_longer followed by pivot_wider:
# Timing comparison of the three approaches on `df`.
# tic()/toc() come from tictoc, str_replace()/str_remove() from stringr and
# pivot_longer()/pivot_wider() from tidyr, so attach them explicitly; the
# timed code itself is unchanged.
library(dplyr)
library(stringr)
library(tidyr)
library(tictoc)

# Method 1: every output column is spelled out by hand.
tic("Method 1: manual option")
df2 <- df %>%
  mutate(jan_cv= jan_mean/jan_sd,
         feb_cv= feb_mean/feb_sd,
         mar_cv= mar_mean/mar_sd
  )
toc()

# Method 2: across() over the *_mean columns; the matching *_sd column is
# looked up by name for each column being processed.
tic("Method 2: Short option")
df2 <- df %>%
  mutate(across(ends_with('_mean'), ~ . /
                  get(str_replace(cur_column(), "mean$", "sd")), .names = "{.col}_cv")) %>%
  rename_at(vars(ends_with('cv')), ~ str_remove(., "\\_mean"))
toc()

# Method 3: reshape to long, compute per month, reshape back to wide.
tic("Method 3: pivot wider option")
df2 <- df %>%
  mutate(id = row_number()) %>%
  pivot_longer(-id, names_to = c("month", ".value"), names_sep = "_") %>%
  mutate(cv = mean / sd) %>%
  pivot_wider(names_from = "month", values_from = c(mean, sd, cv), names_glue = "{month}_{.value}") %>%
  select(-id)
toc()
The results are:
Method 1: manual option: 0.05 sec elapsed
Method 2: Short option: 0.01 sec elapsed
Method 3: pivot wider option: 0.19 sec elapsed
So method 2 is even faster than manually doing each column
We could use across in this situation with some string manipulation with stringr:
library(dplyr)
library(stringr)
# For every *_mean column, divide it by the *_sd column of the same month.
# across() first names the results "<month>_mean_cv"; rename_with() (the
# current replacement for the superseded rename_at(vars(...))) then drops the
# "_mean" part so the new columns end up as "<month>_cv".
df %>%
  mutate(across(
    ends_with('_mean'),
    # cur_column() is the name of the column currently being processed;
    # swapping its "mean" suffix for "sd" gives the partner column's name.
    ~ . / get(str_replace(cur_column(), "mean$", "sd")),
    .names = "{.col}_cv"
  )) %>%
  rename_with(~ str_remove(.x, "\\_mean"), ends_with('cv'))
jan_mean feb_mean mar_mean jan_sd feb_sd mar_sd jan_cv feb_cv mar_cv
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.838 0.401 0.131 0.329 0.0292 0.911 2.55 13.7 0.144
2 0.595 0.173 0.0935 0.313 0.105 0.247 1.90 1.64 0.378
3 0.0546 0.934 0.983 0.536 0.618 0.292 0.102 1.51 3.36
4 0.543 0.802 0.569 0.585 0.901 0.742 0.928 0.891 0.766
5 0.899 0.761 0.245 0.932 0.506 0.526 0.965 1.50 0.466
6 0.832 0.875 0.947 0.390 0.613 0.607 2.13 1.43 1.56
7 0.268 0.421 0.930 0.869 0.873 0.612 0.308 0.483 1.52
8 0.475 0.217 0.330 0.0473 0.826 0.903 10.0 0.262 0.366
9 0.379 0.425 0.479 0.931 0.381 0.223 0.407 1.12 2.15
10 0.616 0.922 0.707 0.976 0.241 0.619 0.631 3.82 1.14
One option to achieve your desired result would be to convert your data to long format which makes it easy to do the computations per month and if desired convert back to wide format afterwards. To this end I first added an identifier column to your data:
library(dplyr)
library(tidyr)
set.seed(42)
# Long-format route: tag each row, split the column names into month and
# statistic, compute cv per (row, month), then spread back to wide columns.
mutate(df, id = row_number()) %>%
  pivot_longer(
    -id,
    names_to = c("month", ".value"),
    names_sep = "_"
  ) %>%
  mutate(cv = mean / sd) %>%
  pivot_wider(
    names_from = "month",
    values_from = c(mean, sd, cv),
    names_glue = "{month}_{.value}"
  ) %>%
  select(-id)
#> # A tibble: 10 × 9
#> jan_mean feb_mean mar_mean jan_sd feb_sd mar_sd jan_cv feb_cv mar_cv
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.915 0.458 0.904 0.738 0.380 0.333 1.24 1.21 2.71
#> 2 0.937 0.719 0.139 0.811 0.436 0.347 1.16 1.65 0.400
#> 3 0.286 0.935 0.989 0.388 0.0374 0.398 0.737 25.0 2.48
#> 4 0.830 0.255 0.947 0.685 0.974 0.785 1.21 0.262 1.21
#> 5 0.642 0.462 0.0824 0.00395 0.432 0.0389 163. 1.07 2.12
#> 6 0.519 0.940 0.514 0.833 0.958 0.749 0.623 0.982 0.687
#> 7 0.737 0.978 0.390 0.00733 0.888 0.677 100. 1.10 0.576
#> 8 0.135 0.117 0.906 0.208 0.640 0.171 0.648 0.184 5.29
#> 9 0.657 0.475 0.447 0.907 0.971 0.261 0.725 0.489 1.71
#> 10 0.705 0.560 0.836 0.612 0.619 0.514 1.15 0.905 1.63
Related
I have a simplified tibble where I select two columns (manually) and pass them to a custom function; in this example the function is just sum. Any ideas on how I could expand this to accommodate any number of ko columns? In this case there are only 2, but let's say there were 5.
library(dplyr)
library(purrr)
# Example data: paired l2fc_/ctrl_ columns for two knock-outs, ten normal
# draws each. rnorm(10) states the sample size directly; rnorm(1:10) only
# worked because rnorm() uses the length of a vector argument as n.
df <- tibble(
  l2fc_ko1 = rnorm(10),
  l2fc_ko2 = rnorm(10),
  ctrl_ko1 = rnorm(10),
  ctrl_ko2 = rnorm(10)
)
# Manual version: one output column per knock-out, combining its ctrl and
# l2fc values element by element (sum stands in for a custom function).
df %>% mutate(ko1_sum = map2_dbl(ctrl_ko1, l2fc_ko1, sum),
              ko2_sum = map2_dbl(ctrl_ko2, l2fc_ko2, sum))
We can use pivot_longer to reshape the data, creating a column for each level of ko. Compute the sum, then pivot_wider to get back to your original format:
library(tidyverse)
# Reshape so each (row, ko) pair has its l2fc and ctrl values side by side,
# add them, then spread back to one column per measure and ko.
# Using ".value" in names_to puts the part before "_" (l2fc / ctrl) straight
# into columns, which replaces the former pivot_longer() + pivot_wider() pair
# with a single reshape.
df %>%
  mutate(idx = row_number()) %>%
  pivot_longer(-idx, names_sep = '_', names_to = c('.value', 'ko')) %>%
  mutate(sum = l2fc + ctrl) %>%
  pivot_wider(names_from = ko, values_from = c(l2fc, ctrl, sum))
idx l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 sum_ko1 sum_ko2
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -1.04 -0.710 -0.288 -1.65 -1.33 -2.36
2 2 0.0338 0.400 -0.850 0.319 -0.816 0.719
3 3 2.08 0.723 0.325 0.314 2.40 1.04
4 4 0.740 -0.411 -0.307 1.77 0.433 1.36
5 5 0.347 -1.57 -0.153 0.657 0.195 -0.915
6 6 -0.998 -0.145 0.265 -1.95 -0.733 -2.09
7 7 2.05 -0.0876 -0.909 -0.190 1.14 -0.278
8 8 0.0735 -0.134 -2.04 -0.832 -1.96 -0.966
9 9 1.52 2.37 1.53 -0.596 3.05 1.78
10 10 1.42 -0.753 -1.61 1.84 -0.194 1.09
If you have a dynamic number of paired ctrl_/l2fc_ columns, then try this:
Ensure we have all ctrl_ that have a corresponding l2fc_ (and vice versa):
# Candidate ctrl_ko* columns and the l2fc_ names they would pair with.
ctrls <- grep("^ctrl_ko", names(df), value = TRUE)
l2fcs <- gsub("^ctrl", "l2fc", ctrls)
# Keep only the pairs whose l2fc_ partner actually exists in df.
paired <- l2fcs %in% names(df)
ctrls <- ctrls[paired]
l2fcs <- l2fcs[paired]
Combine these into one vector (we'll split on it later) and convert this to the new _sum names we'll need.
# All paired column names, l2fc_ first and ctrl_ second.
nms <- c(l2fcs, ctrls)
nms
# [1] "l2fc_ko1" "l2fc_ko2" "ctrl_ko1" "ctrl_ko2"
# The output names are later attached positionally to the groups produced by
# split.default(), which orders groups by the sorted ko suffix. Build the
# names in that same sorted order so they stay aligned even when the columns
# are not in sorted order (e.g. ko10 sorts before ko2 as text).
ko_ids <- sort(unique(gsub(".*_ko", "", nms)))
newnms <- paste0("ko", ko_ids, "_sum")
newnms
# [1] "ko1_sum" "ko2_sum"
Using split.default (which will split the df into groups of columns) and rowSums, we can devise two _sum columns:
# Group the paired columns by their ko suffix, sum each group across columns
# row by row, and label the resulting columns with newnms.
ko_groups <- split.default(df[nms], gsub(".*_ko", "", nms))
ko_totals <- lapply(ko_groups, rowSums)
setNames(as.data.frame(ko_totals), newnms)
# ko1_sum ko2_sum
# 1 1.0643199 1.7603198
# 2 -2.3460066 2.9914827
# 3 0.1912111 -0.3537572
# 4 1.8475373 -0.8877151
# 5 2.2994618 0.3716338
# 6 -0.5365936 -1.0810583
# 7 1.2542526 -1.0687119
# 8 -1.8578221 -3.5073630
# 9 2.4785211 -4.8546746
# 10 -0.7027090 1.3562360
We can cbind/bind_cols those in, or we can mutate them just as well. For the latter, we replace df with cur_data() inside the mutate call, and we need to wrap the result in as.data.frame().
Choose one of the following, all producing effectively the same results:
# Three ways to attach the per-ko row sums to df. Each one splits the paired
# columns by ko suffix, applies rowSums to every group and names the results
# with newnms.
# Option 1: base cbind().
cbind(df, setNames(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums), newnms))
# Option 2: dplyr::bind_cols().
bind_cols(df, setNames(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums), newnms))
# Option 3: inside mutate(); an unnamed data frame argument adds one column
# per data frame column. cur_data() stands in for df within the mutate call.
# NOTE(review): cur_data() is deprecated in dplyr >= 1.1.0 in favour of
# pick() - confirm the dplyr version before reusing this.
df %>%
mutate(
setNames(
as.data.frame(
lapply(split.default(cur_data()[nms], gsub(".*_ko", "", nms)), rowSums)),
newnms)
)
# # A tibble: 10 x 6
# l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 ko1_sum ko2_sum
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1.37 1.30 -0.307 0.455 1.06 1.76
# 2 -0.565 2.29 -1.78 0.705 -2.35 2.99
# 3 0.363 -1.39 -0.172 1.04 0.191 -0.354
# 4 0.633 -0.279 1.21 -0.609 1.85 -0.888
# 5 0.404 -0.133 1.90 0.505 2.30 0.372
# 6 -0.106 0.636 -0.430 -1.72 -0.537 -1.08
# 7 1.51 -0.284 -0.257 -0.784 1.25 -1.07
# 8 -0.0947 -2.66 -1.76 -0.851 -1.86 -3.51
# 9 2.02 -2.44 0.460 -2.41 2.48 -4.85
# 10 -0.0627 1.32 -0.640 0.0361 -0.703 1.36
How about rowwise? You can specify the columns you want with c or c_across.
# Row-by-row total over every column whose name ends in "ko1".
# The result is still a rowwise data frame.
mutate(
  rowwise(df),
  total = sum(c_across(ends_with("ko1")))
)
# A tibble: 10 x 5
# Rowwise:
l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 total
<dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.179 0.496 -1.10 -0.375 -1.27
2 -0.0887 -0.873 0.613 -0.348 0.525
3 -2.33 -0.322 -0.515 3.03 -2.84
4 -0.602 -0.0387 0.704 -0.118 0.102
5 -0.389 -0.00801 0.276 0.500 -0.113
6 -2.18 0.648 -0.485 -0.243 -2.66
7 0.0529 0.237 -0.371 -0.0382 -0.318
8 0.818 -0.181 1.11 -1.25 1.93
9 -0.271 -0.883 0.480 -0.296 0.209
10 -0.208 -1.11 1.09 -0.528 0.882
This is a shortened version of my real df. I have a function (called: calc) which creates a new variable called 'total', for simplicity this adds up three variables: a, b, c. When I add a dataframe, to that function, that does not feature one variable (say c) so only has a & b, the function falls over. Is there a 'function' / simple way that counts the variables regardless if they are missing?
# Adds a `total` column computed as a + b + c. All three columns must be
# present in x; if one is missing, mutate() cannot evaluate the expression.
calc <- function(x) {x %>% mutate(total = a + b + c)}
data.2 has two columns a & b with many rows of values, but when running that in the function it cannot find c so does not calculate.
# data.2 only has columns a and b, so this call fails because c is not found.
new.df <- calc(data.2)
Many thanks.
If you want a row-wise sum or mean, rowSums() and rowMeans() have an na.rm argument that you can use to ignore NA values.
library(dplyr)
# Adds a `total` column holding the row-wise sum of whichever of the columns
# a, b and c are present in x, ignoring NA values.
# any_of() silently skips names that do not exist, so a data frame without
# column c no longer makes the selection fail (the range a:c did).
calc <- function(x) {
  x %>%
    mutate(total = rowSums(select(x, any_of(c("a", "b", "c"))), na.rm = TRUE))
}
In the general case, if you cannot find a function that gives you an out-of-the-box solution, you can replace the NA values with 0 and then perform the operation you want.
# Replaces NA with 0 in columns a through c, then adds `total` = a + b + c.
# The replacement is written as a lambda because passing extra arguments
# through across(...) is deprecated in current dplyr; the formula form works
# in older releases as well.
calc <- function(x) {
  x %>%
    mutate(across(a:c, ~ tidyr::replace_na(.x, 0)),
           total = a + b + c)
}
You can use rowwise() and c_across() with any_of() (or any other tidyselect function) from dplyr (>= 1.0.0).
library(dplyr)
# Two test inputs: one without column c and one with it.
df <- data.frame(a = rnorm(10), b = rnorm(10))
dfc <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10))

# Adds `total`, the per-row sum of whichever of a, b, c exist in x, and
# returns the result ungrouped.
calc <- function(x) {
  by_row <- rowwise(x)
  summed <- mutate(by_row, total = sum(c_across(any_of(c("a", "b", "c")))))
  ungroup(summed)
}
calc(df)
#> # A tibble: 10 x 3
#> a b total
#> <dbl> <dbl> <dbl>
#> 1 -0.884 0.851 -0.0339
#> 2 -1.56 -0.464 -2.02
#> 3 -0.884 0.815 -0.0689
#> 4 -1.46 -0.259 -1.71
#> 5 0.211 -0.528 -0.317
#> 6 1.85 0.190 2.04
#> 7 -1.31 -0.921 -2.23
#> 8 0.450 0.394 0.845
#> 9 -1.14 0.428 -0.714
#> 10 -1.11 0.417 -0.698
calc(dfc)
#> # A tibble: 10 x 4
#> a b c total
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.0868 0.632 1.81 2.36
#> 2 0.568 -0.523 0.240 0.286
#> 3 -0.0325 0.377 -0.437 -0.0921
#> 4 0.660 0.456 1.28 2.39
#> 5 -0.123 1.75 -1.03 0.599
#> 6 0.641 1.39 0.902 2.93
#> 7 0.266 0.520 0.904 1.69
#> 8 -1.53 0.319 0.439 -0.776
#> 9 0.942 0.468 -1.69 -0.277
#> 10 0.254 -0.600 -0.196 -0.542
If you want to be able to generalize beyond those 3 variables you can use any tidyselect methodology.
# Two test inputs: one without column c and one with it.
df <- data.frame(a = rnorm(10), b = rnorm(10))
dfc <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10))

# Adds `total`, the per-row sum over every column of x, and returns the
# result ungrouped.
calc <- function(x) {
  by_row <- rowwise(x)
  summed <- mutate(by_row, total = sum(c_across(everything())))
  ungroup(summed)
}
calc(df)
#> # A tibble: 10 x 3
#> a b total
#> <dbl> <dbl> <dbl>
#> 1 0.775 1.17 1.95
#> 2 -1.05 1.21 0.155
#> 3 2.07 -0.264 1.81
#> 4 1.11 0.793 1.90
#> 5 -0.700 -0.216 -0.916
#> 6 -1.04 -1.03 -2.07
#> 7 -0.525 1.60 1.07
#> 8 0.354 0.828 1.18
#> 9 0.126 0.110 0.236
#> 10 -0.0954 -0.603 -0.698
calc(dfc)
#> # A tibble: 10 x 4
#> a b c total
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.616 0.767 0.0462 0.196
#> 2 -0.370 -0.538 -0.186 -1.09
#> 3 0.337 1.11 -0.700 0.751
#> 4 -0.993 -0.531 -0.984 -2.51
#> 5 0.0538 1.50 -0.0808 1.47
#> 6 -0.907 -1.54 -0.734 -3.18
#> 7 -1.65 -0.242 1.43 -0.455
#> 8 -0.166 0.447 -0.281 -0.000524
#> 9 0.0637 -0.0185 0.754 0.800
#> 10 1.81 -1.09 -2.15 -1.42
Created on 2020-09-10 by the reprex package (v0.3.0)
I need to sum columns in a dataframe where the columns that need to be summed are defined in a separate data frame. Reproducible example below.
# Four columns of 100 uniform draws on [0, 1] (runif's default range).
dataset <- tibble(
  L1 = runif(100),
  L2 = runif(100),
  L3 = runif(100),
  L4 = runif(100)
)
# Each row of cols_to_sum names one pair of columns to add together:
# (L1, L3) and (L2, L4).
cols_to_sum <- tibble(
  col1 = c("L1", "L2"),
  col2 = c("L3", "L4")
)
In the example above I need to create two additional columns in dataset, one called "L1L3" which is the sum of L1 and L3 and similar for L2 and L4. The desired output should look like the dataframe below. The cols_to_sum dataframe could have any number of rows and the dataset could have any number of columns.
# Desired result, written out by hand: the four input columns plus the two
# pairwise sums L1L3 and L2L4.
dataset <- mutate(
  tibble(
    L1 = runif(100),
    L2 = runif(100),
    L3 = runif(100),
    L4 = runif(100)
  ),
  L1L3 = L1 + L3,
  L2L4 = L2 + L4
)
One option involving dplyr and purrr could be:
# For each row of cols_to_sum, add a column named "<colA>_<colB>_sum" holding
# the row sums of the columns listed in that row, keep only that new column,
# bind the per-row results side by side and append the original dataset.
# The column names are wrapped in all_of(): passing a bare external character
# vector to select() is deprecated. as.character() drops the names and dim
# attributes that asplit() leaves on each row slice.
map_dfc(
  .x = asplit(cols_to_sum, 1),
  ~ dataset %>%
    mutate(!!paste(paste(.x, collapse = "_"), "sum", sep = "_") := rowSums(select(., all_of(as.character(.x))))) %>%
    select(ends_with("sum"))
) %>%
  bind_cols(dataset)
L1_L3_sum L2_L4_sum L1 L2 L3 L4
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1.42 1.79 0.621 0.878 0.802 0.908
2 0.944 1.39 0.135 0.527 0.809 0.864
3 1.16 0.859 0.607 0.361 0.555 0.498
4 1.71 1.10 0.982 0.853 0.729 0.252
5 0.856 0.950 0.287 0.0234 0.568 0.927
6 0.235 1.16 0.00368 0.363 0.232 0.801
7 1.27 1.24 0.516 0.601 0.755 0.637
8 1.37 1.38 0.486 0.914 0.882 0.465
9 0.368 1.12 0.168 0.642 0.200 0.482
10 0.341 1.33 0.317 0.477 0.0240 0.857
More sequentially, you can create a function that takes the expression you want to evaluate as a character string, as shown here. The code would be as follows:
library(tidyverse)
library(rlang)
library(dplyr)
library(tidyr)
# You create the function
#' Add a column computed from an expression supplied as text
#'
#' @param df A data frame.
#' @param new_var Name of the column to create, as a string.
#' @param expression R expression as a string (e.g. "L1+L3"); it is parsed
#'   with rlang::parse_expr() and evaluated by mutate() against the columns
#'   of `df`. Because the text is executed as code, only pass strings you
#'   built yourself.
#' @return `df` with the additional column.
example_fun <- function(df, new_var, expression) {
  df %>%
    mutate(!!new_var := !!parse_expr(expression))
}
# Usage: example_fun(df, new_var, expression)
# (The bare call that stood here omitted `df` and referred to objects that
# are not defined at this point, so it could not run.)
# Four columns of 100 uniform draws on [0, 1] (runif's default range).
dataset <- tibble(
  L1 = runif(100),
  L2 = runif(100),
  L3 = runif(100),
  L4 = runif(100)
)
# Pairs of columns to add, converted to a plain data frame.
cols_to_sum <- as.data.frame(
  tibble(
    col1 = c("L1", "L2"),
    col2 = c("L3", "L4")
  )
)
# apply by column the rule of summing
# Each ROW of cols_to_sum names the columns to add together (row 1 is L1 and
# L3, row 2 is L2 and L4), so iterate over rows. The earlier version looped
# over the columns of cols_to_sum and produced L1L2 / L3L4 instead of the
# requested L1L3 / L2L4. seq_len() also copes with an empty cols_to_sum.
for (i in seq_len(nrow(cols_to_sum))) {
  pair <- unlist(cols_to_sum[i, ], use.names = FALSE)
  expressionsum <- paste(pair, collapse = "+")  # e.g. "L1+L3"
  Newvar <- paste(pair, collapse = "")          # e.g. "L1L3"
  dataset <- example_fun(dataset, Newvar, expressionsum)
}
dataset
# Result: a 100 x 6 tibble with the original columns L1, L2, L3, L4 followed
# by L1L3 (= L1 + L3) and L2L4 (= L2 + L4). Values differ from run to run
# because the data are random.
Here is one base R solution which combines the columns you want to sum for the column names, and uses subsetting and rowSums() within lapply() to add up your columns:
# Each ROW of cols_to_sum names the columns to add together (row 1 is L1 and
# L3, row 2 is L2 and L4). The earlier one-liner walked over the COLUMNS of
# cols_to_sum and therefore produced L1L2 / L3L4 instead of L1L3 / L2L4.
col_pairs <- lapply(
  seq_len(nrow(cols_to_sum)),
  function(i) unlist(cols_to_sum[i, ], use.names = FALSE)
)
# New column name = the pair's names pasted together; value = their row sums.
dataset[vapply(col_pairs, paste0, character(1), collapse = "")] <-
  lapply(col_pairs, function(x) rowSums(dataset[x]))
dataset
# Result: a 100 x 6 tibble with the original columns L1, L2, L3, L4 followed
# by L1L3 (= L1 + L3) and L2L4 (= L2 + L4).
Data:
# Reproducible example data: fixed seed, four columns of 100 uniform draws
# on [0, 1] (runif's default range).
set.seed(42)
dataset <- tibble(
  L1 = runif(100),
  L2 = runif(100),
  L3 = runif(100),
  L4 = runif(100)
)
# Each row names one pair of columns to add together.
cols_to_sum <- tibble(
  col1 = c("L1", "L2"),
  col2 = c("L3", "L4")
)
Dear all,
I have a very large file (14,566,680 records) with 2 variables (ID and A).
The first variable (ID) is the individual (n=258) and each individual has 56,460 records (A)
I would like to write out a "transposed" file (i.e. 258 lines and 56,460 columns).
When I execute the following code:
# Attempt to spread A into one column per record while timing the call.
# This is the version that raises the integer-overflow error quoted below.
system.time(snp1 %>%
#filter(`Sample ID`=='8362974') %>%
select(`Sample ID`,A) %>%
# row_number() is computed over the whole table here (no grouping), so the
# ids keep increasing across individuals instead of restarting per
# `Sample ID`.
mutate(id = row_number()) %>%
#head(n=nsnp) %>%
pivot_wider(names_from=id,
# NOTE(review): `-> T` stores the result in an object called T, which masks
# the T shorthand for TRUE for the rest of the session.
values_from = A)->T)
I got the following error:
Error in rep_len(NA_integer_, n) : invalid 'length.out' value
In addition: Warning message:
In nrow * ncol : NAs produced by integer overflow
Timing stopped at: 28.73 0.62 29.36
If I use only 1 ID it works correctly
Best
Stefano
Does it work if you group the records by individual before calculating the row_number (record ID)?
# made up sample
# Made-up sample: 258 individuals with 56460 records each.
df <- tibble(`Sample ID` = rep(1:258, each = 56460))
df <- mutate(df, A = rnorm(nrow(df)))

# Number the records within each individual, then spread them into one
# column per record number; the result stays grouped by `Sample ID`.
mutate(group_by(df, `Sample ID`), id = row_number()) %>%
  pivot_wider(
    names_from = id,
    values_from = A
  )
# A tibble: 258 x 56,461
# Groups: Sample ID [258]
`Sample ID` `1` `2` `3` `4` `5` `6` `7`
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1.49 0.546 0.0517 -0.480 -0.500 0.266 -1.52
2 2 -0.391 -0.855 -1.28 -0.0277 -0.999 0.617 -0.415
3 3 0.200 0.484 1.08 -0.568 1.16 1.75 -0.143
4 4 0.212 0.371 0.674 0.0481 -1.09 -1.07 0.160
5 5 0.409 1.54 0.931 -0.280 1.27 0.0447 0.426
6 6 -0.936 0.903 -0.0408 0.590 -1.52 -1.14 -0.600
7 7 -1.97 0.336 -0.233 0.488 0.995 -0.933 -1.90
8 8 -0.396 2.12 1.10 0.304 0.290 0.595 -1.32
9 9 -1.31 -0.124 -0.804 -0.447 1.12 -0.721 0.378
10 10 0.977 0.818 1.51 -0.258 -0.00794 0.0386 2.03
# ... with 248 more rows, and 56,453 more variables: ...
A data frame named `have` contains three variables:
from - character - the name of a measure
to - character - the name of another measure
covariance - numeric - the covariance between the two measures
Here's a link to the data. Below is the result of head(have):
from to covariance
a_airportscreener a_airportscreener 4.419285714
a_airportscreener e_airportscreener -1.328928571
a_airportscreener g_airportscreener -3.038928571
a_airportscreener p_airportscreener 0.3292857143
a_airportscreener pres_airportscreener 0.6452857143
a_automechanic a_automechanic 2.635535714
a_automechanic e_automechanic -0.3439285714
I want to create a data frame called need that records the covariances between prefixed versions of the same job title in separate columns. For example, the first row would look like:
job a_a a_e a_g a_p a_pres e_a e_e e_g e_p e_pres g_a g_e g_g g_p g_pres p_a p_e p_g p_p p_pres pres_a pres_e pres_g pres_p pres_pres
airportscreener 4.419 -1.329 -3.039 0.329 0.645 -1.329 2.333 2.441 -1.015 0.659 -3.039 2.441 14.253 3.070 0.977 0.329 -1.015 3.070 6.505 0.366 0.645 0.659 0.977 0.366 0.697
(I rounded the values in have to keep the example of need on the page, but this is not part of the question.)
Try this approach on your complete data
library(tidyverse)
# Turn the covariance matrix into one row per job title with one column per
# prefix pair.
cov_mat %>%
  rownames_to_column() %>%
  pivot_longer(cols = -rowname) %>%
  # key: prefix of the column measure + "_" + prefix of the row measure,
  # where the prefix is everything before the first underscore. The
  # replacement is "" because these patterns have no capture group; the
  # former "\\1" only produced an empty string by accident.
  mutate(key = paste0(sub("_.*", "", name), "_", sub("_.*", "", rowname)),
         # Reduce both measure names to the part matched by the (.*) group.
         rowname = sub(".*_(.*)_.*", "\\1", rowname),
         name = sub(".*_(.*)_.*", "\\1", name)) %>%
  # Keep only the pairs where both reduced names agree.
  filter(rowname == name) %>%
  select(-rowname) %>%
  pivot_wider(names_from = key, values_from = value)
# A tibble: 58 x 26
# name a_a e_a g_a p_a pres_a a_e e_e g_e .....
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 airp… 4.42 -1.33 -3.04 0.329 0.645 -1.33 2.33 2.44
# 2 auto… 2.64 -0.344 6.26 -0.712 -0.595 -0.344 0.499 0.113
# 3 auto… 2.67 -0.466 2.36 -0.106 -0.878 -0.466 0.72 -5.95
# 4 blkj… 2.50 0.529 -6.79 0.0129 -0.0666 0.529 1.56 -8.58
# 5 blkt… 1.04 -0.00143 4.86 0.993 -0.194 -0.00143 0.229 -1.69
# 6 brid… 4.15 2.05 -11.5 -1.21 0.453 2.05 2.05 -9.09
# 7 cart… 1.79 0.458 -4.22 0.451 -0.410 0.458 1.23 3.54
# 8 chem… 2.29 0.479 12.4 -0.0384 -0.164 0.479 0.811 2.15
# 9 clth… 4.10 1.15 -18.9 1.77 0.728 1.15 1.7 -4.00
#10 coag… 2.23 -0.382 -7.79 -0.0190 0.460 -0.382 0.342 4.11
This is not as elegant as @Ronak Shah's answer, but I had been working on something similar and thought it might be worth sharing for someone out there. It also uses pivot_longer and pivot_wider from the latest tidyr.
library(readxl)
library(tidyr)
library(dplyr)
df <- read_excel("cov_data.xlsx")
# Build one row per job with a column for every ordered prefix pair.
need <- df %>%
  # "a_airportscreener" -> from1 = "a", job = "airportscreener"
  separate(from, into = c('from1', 'job'), sep = '_') %>%
  # Keep only the prefix of `to`; the remainder is dropped.
  separate(to, into = 'to1', extra = 'drop', sep = '_') %>%
  # Both orderings of the prefix pair, since the covariance is the same
  # either way. TRUE/FALSE are spelled out because T and F are ordinary
  # variables that can be reassigned.
  unite(comb1, from1, to1, remove = FALSE) %>%
  unite(comb2, to1, from1, remove = TRUE) %>%
  pivot_longer(c(comb1, comb2)) %>%
  dplyr::select(-name) %>%
  # Same-prefix pairs yield two identical rows; keep one.
  distinct() %>%
  pivot_wider(names_from = value, values_from = covariance) %>%
  # job first, then the pair columns in alphabetical order.
  dplyr::select(job, order(colnames(.)))
# A tibble: 58 x 26
job a_a a_e a_g a_p a_pres e_a e_e e_g e_p e_pres g_a g_e g_g g_p g_pres p_a p_e p_g
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 airp… 4.42 -1.33 -3.04 0.329 0.645 -1.33 2.33 2.44 -1.02 0.659 -3.04 2.44 14.3 3.07 0.977 0.329 -1.02 3.07
2 auto… 2.64 -0.344 6.26 -0.712 -0.595 -0.344 0.499 0.113 0.891 0.321 6.26 0.113 203. 5.16 0.645 -0.712 0.891 5.16
3 auto… 2.67 -0.466 2.36 -0.106 -0.878 -0.466 0.72 -5.95 0.431 0.194 2.36 -5.95 252. 4.65 -4.64 -0.106 0.431 4.65
4 blkj… 2.50 0.529 -6.79 0.0129 -0.0666 0.529 1.56 -8.58 -0.703 0.384 -6.79 -8.58 247. 2.11 1.68 0.0129 -0.703 2.11
5 blkt… 1.04 -0.00143 4.86 0.993 -0.194 -0.00143 0.229 -1.69 0.276 -0.0351 4.86 -1.69 260. 14.3 2.44 0.993 0.276 14.3
6 brid… 4.15 2.05 -11.5 -1.21 0.453 2.05 2.05 -9.09 -0.342 0.576 -11.5 -9.09 326. -2.07 0.992 -1.21 -0.342 -2.07
7 cart… 1.79 0.458 -4.22 0.451 -0.410 0.458 1.23 3.54 0.43 -0.0674 -4.22 3.54 478. 10.5 -1.21 0.451 0.43 10.5
8 chem… 2.29 0.479 12.4 -0.0384 -0.164 0.479 0.811 2.15 0.784 0.0469 12.4 2.15 238. 2.58 -2.05 -0.0384 0.784 2.58
9 clth… 4.10 1.15 -18.9 1.77 0.728 1.15 1.7 -4.00 1.65 0.133 -18.9 -4.00 193. -17.1 -6.81 1.77 1.65 -17.1
10 coag… 2.23 -0.382 -7.79 -0.0190 0.460 -0.382 0.342 4.11 0.161 0.0398 -7.79 4.11 444. 1.96 -7.55 -0.0190 0.161 1.96