Converting a list of named list to data frame - r

I have a list returned by sapply which looks like this:
> my_list
[,1] [,2] [,3] [,4]
val 1.73 2.73 4.71 5.27
cost 10.1 8.71 9.95 0.01
time 5.36 5.84 5.68 2.10
I'd like to convert it into a data frame:
id
val
cost
time
1
1.73
10.1
5.36
2
2.73
8.71
5.84
3
4.71
9.95
5.68
4
5.27
0.01
2.10
How can I transform the list into the data frame this way?
Edit: Here is the output of dput(my_list):
structure(list(1.73, 10.1, 5.36,2.73,8.71,5.84,
4.71,9.95,5.68, 5.27, 0.01, 2.10),
dim = c(3L, 4L), dimnames = list(c("val",
"cost", "time"), NULL))

Use t
# Transpose the 3 x 4 list-matrix from the question so that val/cost/time
# become columns, then append a 1-based id column (one id per original column).
t(my_list) |>
  transform(id = seq_len(ncol(my_list)))
val cost time id
1 1.73 10.1 5.36 1
2 2.73 8.71 5.84 2
3 4.71 9.95 5.68 3
4 5.27 0.01 2.1 4

The elements of the matrix are list elements. If we want to turn them into regular vectors, one option is unnest:
library(dplyr)
library(tidyr)
# Transpose so val/cost/time become columns, flatten every list-column into
# an atomic vector, then place a running row id in the first position.
t(my_list) %>%
  as_tibble() %>%
  unnest(cols = where(is.list)) %>%
  mutate(id = row_number(), .before = 1)
-output
# A tibble: 4 × 4
id val cost time
<int> <dbl> <dbl> <dbl>
1 1 1.73 10.1 5.36
2 2 2.73 8.71 5.84
3 3 4.71 9.95 5.68
4 4 5.27 0.01 2.1

Related

avoid repeated unquoting in dplyr non standard evaluation

Suppose we have the following data:
tib <- tibble::tibble(x = 1:10)
Then, suppose we want to make a function that takes a column as input and returns a tibble with several added columns such as:
library(dplyr)
# Add sqrt, reciprocal and log transformations of one column to `data`.
#   data   : data frame / tibble to extend
#   column : name of the column to transform, given as a string
# Returns `data` with the extra columns `sqrt`, `recip` and `log`.
generate_transformations <- function(data, column){
# Turn the string into a symbol so it can be unquoted inside mutate().
transform <- sym(column)
data %>%
mutate(
# Every expression has to unquote the symbol again with !!.
sqrt = sqrt(!!transform),
recip = 1 / !!transform,
log = log(!!transform)
)
}
# Usage is great:
tib %>%
generate_transformations('x')
# A tibble: 10 x 4
x sqrt recip log
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1.41 0.5 0.693
3 3 1.73 0.333 1.10
4 4 2 0.25 1.39
5 5 2.24 0.2 1.61
6 6 2.45 0.167 1.79
7 7 2.65 0.143 1.95
8 8 2.83 0.125 2.08
9 9 3 0.111 2.20
10 10 3.16 0.1 2.30
Now my question is, is there a way to avoid unquoting (!!) transform repeatedly?
Yes, I could, e.g., temporarily rename column and then rename it back after I am done, but that is not my interest in this question.
I am interested if there is a way to produce a variable that does not need the !!.
While it does not work, I was looking for something like:
# Non-working sketch of the desired interface: unquote the column once up
# front, then refer to it as a plain variable in every mutate() expression.
generate_transformations <- function(data, column){
transform <- !!sym(column) # cannot unquote here :(
data %>%
mutate(
sqrt = sqrt(transform),
recip = 1 / transform,
log = log(transform)
)
}
Convert the column name to a string, extract that column from the data once, and use the resulting transform vector inside mutate:
# Add sqrt, reciprocal and log transformations of one column to `data`.
#   data   : data frame / tibble to extend
#   column : column to transform, as a string or a bare name
# Returns `data` with the extra columns `sqrt`, `recip` and `log`.
generate_transformations <- function(data, column) {
  # Normalise `column` to a plain column name.
  col_name <- rlang::as_string(ensym(column))
  # Extract the column once as an ordinary vector, so the expressions below
  # need no unquoting.
  transform <- data[[col_name]]
  mutate(
    data,
    sqrt = sqrt(transform),
    recip = 1 / transform,
    log = log(transform)
  )
}
-testing
tib %>%
generate_transformations('x')
# A tibble: 10 × 4
x sqrt recip log
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1.41 0.5 0.693
3 3 1.73 0.333 1.10
4 4 2 0.25 1.39
5 5 2.24 0.2 1.61
6 6 2.45 0.167 1.79
7 7 2.65 0.143 1.95
8 8 2.83 0.125 2.08
9 9 3 0.111 2.20
10 10 3.16 0.1 2.30
Or create a temporary column and remove it later
# Add sqrt, reciprocal and log transformations of one column to `data`.
#   data   : data frame / tibble to extend
#   column : column to transform, as a string or a bare name
# Returns `data` with the extra columns `sqrt`, `recip` and `log`.
generate_transformations <- function(data, column) {
  col_sym <- rlang::ensym(column)
  mutate(
    data,
    # Unquote once into a helper column that later expressions can reuse.
    transform = !!col_sym,
    sqrt = sqrt(transform),
    recip = 1 / transform,
    log = log(transform),
    # Assigning NULL drops the helper column from the result.
    transform = NULL
  )
}
-testing
tib %>%
generate_transformations('x')
# A tibble: 10 × 4
x sqrt recip log
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1.41 0.5 0.693
3 3 1.73 0.333 1.10
4 4 2 0.25 1.39
5 5 2.24 0.2 1.61
6 6 2.45 0.167 1.79
7 7 2.65 0.143 1.95
8 8 2.83 0.125 2.08
9 9 3 0.111 2.20
10 10 3.16 0.1 2.30
You can do it in a single step if you swap !! for {{}} and use across:
# Apply every function in `funs` to the column selected by `col` and append
# the results as new columns named "<col>_<fn>".
#   d    : data frame / tibble to extend
#   col  : bare column name, forwarded to across() with {{ }}
#   funs : named list of functions or purrr-style lambdas
data_transformations <- function(d, col,
                                 funs = list(sqrt = sqrt, log = log,
                                             recip = ~1/.)) {
  d %>% mutate(across({{ col }}, .fns = funs))
}
# `tib` is the example tibble from the question; `d` exists only as the
# function's parameter name, so it cannot be used at the call site.
tib %>% data_transformations(x)
# A tibble: 10 × 4
x x_sqrt x_log x_recip
<int> <dbl> <dbl> <dbl>
1 1 1 0 1
2 2 1.41 0.693 0.5
3 3 1.73 1.10 0.333
4 4 2 1.39 0.25
5 5 2.24 1.61 0.2
6 6 2.45 1.79 0.167
7 7 2.65 1.95 0.143
8 8 2.83 2.08 0.125
9 9 3 2.20 0.111
10 10 3.16 2.30 0.1
To restore your original column names, use
# Apply every function in `funs` to the column selected by `col` and append
# the results as new columns named after the function only ("{.fn}").
#   d    : data frame / tibble to extend
#   col  : bare column name, forwarded to across() with {{ }}
#   funs : named list of functions or purrr-style lambdas
data_transformations <- function(d, col,
                                 funs = list(sqrt = sqrt, log = log,
                                             recip = ~1/.)) {
  d %>% mutate(across({{ col }}, .fns = funs, .names = "{.fn}"))
}
# `tib` is the example tibble from the question; `d` exists only as the
# function's parameter name, so it cannot be used at the call site.
tib %>% data_transformations(x)
# A tibble: 10 × 4
x sqrt log recip
<int> <dbl> <dbl> <dbl>
1 1 1 0 1
2 2 1.41 0.693 0.5
3 3 1.73 1.10 0.333
4 4 2 1.39 0.25
5 5 2.24 1.61 0.2
6 6 2.45 1.79 0.167
7 7 2.65 1.95 0.143
8 8 2.83 2.08 0.125
9 9 3 2.20 0.111
10 10 3.16 2.30 0.1
To handle multiple columns:
# Apply every function in `funs` to each column selected by `cols` and append
# the results as new columns named "<col>_<fn>".
#   d    : data frame / tibble to extend
#   cols : column selection, forwarded to across() with {{ }}, e.g. c(x, y)
#   funs : named list of functions or purrr-style lambdas
data_transformations <- function(d, cols,
                                 funs = list(sqrt = sqrt, log = log,
                                             recip = ~1/.)) {
  mutate(d, across({{ cols }}, .fns = funs))
}
# Two-column example, restricted to the sqrt and log transformations.
d1 <- tibble(x = 1:10, y = seq(2, 20, 2))
data_transformations(d1, c(x, y), list(sqrt = sqrt, log = log))
# A tibble: 10 × 6
x y x_sqrt x_log y_sqrt y_log
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 1 0 1.41 0.693
2 2 4 1.41 0.693 2 1.39
3 3 6 1.73 1.10 2.45 1.79
4 4 8 2 1.39 2.83 2.08
5 5 10 2.24 1.61 3.16 2.30
6 6 12 2.45 1.79 3.46 2.48
7 7 14 2.65 1.95 3.74 2.64
8 8 16 2.83 2.08 4 2.77
9 9 18 3 2.20 4.24 2.89
10 10 20 3.16 2.30 4.47 3.00

pivot_longer on a mix of matrix columns and regular vector columns

I have a tibble where some columns are matrices. Here's a toy example:
library(dplyr)
library(tidyr)
dat <- structure(list(id = 0:5, matrix_column = structure(c(-1.34333431222985,
-1.54123232044003, -1.7260282725816, -1.8924463753132, -2.0376516335872,
-2.16069643164938, -0.250406602741403, -0.287716094522968, -0.32269823315914,
-0.354360193430544, -0.382155662949252, -0.405883260458378, 1.53709630050992,
1.76715755374983, 1.98313378488307, 2.17881959842109, 2.35072520728221,
2.4974704619887), .Dim = c(6L, 3L)), vector_column = c(10.453112322311,
10.3019556236512, 10.1273409693709, 9.91474471968391, 9.65093549479026,
9.32601906868098)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
The tibble looks like this.
> dat
# A tibble: 6 x 3
id matrix_column[,1] [,2] [,3] vector_column
<int> <dbl> <dbl> <dbl> <dbl>
1 0 -1.34 -0.250 1.54 10.5
2 1 -1.54 -0.288 1.77 10.3
3 2 -1.73 -0.323 1.98 10.1
4 3 -1.89 -0.354 2.18 9.91
5 4 -2.04 -0.382 2.35 9.65
6 5 -2.16 -0.406 2.50 9.33
If I apply pivot_longer from tidyr to the non-id columns, the values in vector_column get replicated to fill the two additional columns required to accommodate matrix_column.
dat %>%
pivot_longer(cols = -id, values_to = "new_column")
# A tibble: 12 x 3
id name new_column[,1] [,2] [,3]
<int> <chr> <dbl> <dbl> <dbl>
1 0 matrix_column -1.34 -0.250 1.54
2 0 vector_column 10.5 10.5 10.5
3 1 matrix_column -1.54 -0.288 1.77
4 1 vector_column 10.3 10.3 10.3
5 2 matrix_column -1.73 -0.323 1.98
6 2 vector_column 10.1 10.1 10.1
7 3 matrix_column -1.89 -0.354 2.18
8 3 vector_column 9.91 9.91 9.91
9 4 matrix_column -2.04 -0.382 2.35
10 4 vector_column 9.65 9.65 9.65
11 5 matrix_column -2.16 -0.406 2.50
12 5 vector_column 9.33 9.33 9.33
Is there a way to have the [,2] and the [,3] columns of new_column to be NA (instead of the same value of [,1]) when name equals vector_column?
Something like
# A tibble: 12 x 3
id name new_column[,1] [,2] [,3]
<int> <chr> <dbl> <dbl> <dbl>
1 0 matrix_column -1.34 -0.250 1.54
2 0 vector_column 10.5 NA NA
3 1 matrix_column -1.54 -0.288 1.77
4 1 vector_column 10.3 NA NA
My real life data have dozens of matrix columns and vector columns.
If you keep the data in its current format (a data frame with an embedded matrix column), you'll keep running into trouble when working with it. I would suggest converting the matrix into a data frame and adding its columns as separate columns.
library(dplyr)
library(tidyr)
dat$matrix_column %>%
data.frame() %>%
bind_cols(dat %>% select(-matrix_column)) %>%
pivot_longer(cols = -id, values_to = "new_column")
# id name new_column
# <int> <chr> <dbl>
# 1 0 X1 -1.34
# 2 0 X2 -0.250
# 3 0 X3 1.54
# 4 0 vector_column 10.5
# 5 1 X1 -1.54
# 6 1 X2 -0.288
# 7 1 X3 1.77
# 8 1 vector_column 10.3
# 9 2 X1 -1.73
#10 2 X2 -0.323
# … with 14 more rows

Using pivot_longer in tidyr with a complex separator [duplicate]

This question already has an answer here:
How to use Pivot_longer to reshape from wide-type data to long-type data with multiple variables
(1 answer)
Closed 2 years ago.
In a previous post here I tried to get the equivalent of an rbind using tidyr::pivot_longer(). This is the data and the solution.
set.seed(1)
df1 <- data.frame(group = rep(letters[1:2],each=3),
day = rep(1:3,2),
var1_mean = round(rnorm(6),2),
var1_sd = round(rnorm(6,5),2),
var2_mean = round(rnorm(6),2),
var2_sd = round(rnorm(6,5),2))
# group day var1_mean var1_sd var2_mean var2_sd
# 1 a 1 -0.63 5.49 -0.62 5.82
# 2 a 2 0.18 5.74 -2.21 5.59
# 3 a 3 -0.84 5.58 1.12 5.92
# 4 b 1 1.60 4.69 -0.04 5.78
# 5 b 2 0.33 6.51 -0.02 5.07
# 6 b 3 -0.82 5.39 0.94 3.01
df1 %>%
pivot_longer(cols = starts_with('var'),
names_to = c('grp', '.value'),
names_sep="_")
# group day grp mean sd
# <fct> <int> <chr> <dbl> <dbl>
# 1 a 1 var1 -0.63 5.49
# 2 a 1 var2 -0.62 5.82
# 3 a 2 var1 0.18 5.74
# 4 a 2 var2 -2.21 5.59
# 5 a 3 var1 -0.84 5.58
# 6 a 3 var2 1.12 5.92
# 7 b 1 var1 1.6 4.69
# 8 b 1 var2 -0.04 5.78
# 9 b 2 var1 0.33 6.51
# 10 b 2 var2 -0.02 5.07
# 11 b 3 var1 -0.82 5.39
# 12 b 3 var2 0.94 3.01
This solution is quite contingent on the naming convention used for the mean and sd variables. If there is a different naming convention, with a more complex separator between the two important nodes of the column names, like so...
df2 <- data.frame(group = rep(letters[1:2],each=3),
day = rep(1:3,2),
mean_var_1 = round(rnorm(6),2),
sd_var_1 = round(rnorm(6,5),2),
mean_var_2 = round(rnorm(6),2),
sd_var_2 = round(rnorm(6,5),2))
df2
# group day mean_var_1 sd_var_1 mean_var_2 sd_var_2
# 1 a 1 0.62 6.36 -0.39 5.70
# 2 a 2 -0.06 4.90 -0.06 5.56
# 3 a 3 -0.16 5.39 1.10 4.31
# 4 b 1 -1.47 4.95 0.76 4.29
# 5 b 2 -0.48 3.62 -0.16 5.36
# 6 b 3 0.42 4.59 -0.25 5.77
How would I achieve a similar result to the first example, with a single mean and sd column and with var_1 and var_2 as the grouping variable?
If your column names are complicated, you can use the names_pattern argument, which lets you specify how each part of a column name is used when reshaping the data to long format.
tidyr::pivot_longer(df2,
cols = contains('var'),
names_to = c('.value', 'grp'),
names_pattern = '(.*?)_(.*)')
# group day grp mean sd
# <chr> <int> <chr> <dbl> <dbl>
# 1 a 1 var_1 0.62 6.36
# 2 a 1 var_2 -0.39 5.7
# 3 a 2 var_1 -0.06 4.9
# 4 a 2 var_2 -0.06 5.56
# 5 a 3 var_1 -0.16 5.39
# 6 a 3 var_2 1.1 4.31
# 7 b 1 var_1 -1.47 4.95
# 8 b 1 var_2 0.76 4.29
# 9 b 2 var_1 -0.48 3.62
#10 b 2 var_2 -0.16 5.36
#11 b 3 var_1 0.42 4.59
#12 b 3 var_2 -0.25 5.77
'(.*?)_(.*)' uses two groups of data where the first group is everything until the first underscore ((.*?)) in the column name and the second group is everything after the underscore following the first group ((.*)).

Multiply part of the data in data.frame by values in another data.frame

Someone here already kindly provided part of the following code:
library(dplyr)
set.seed(12345)
df1 = data.frame(a=c(rep("a",8), rep("b",5), rep("c",7), rep("d",10)),
b=rnorm(30, 6, 2),
c=rnorm(30, 12, 3.5),
d=rnorm(30, 8, 3)
)
df2 = data.frame(b= 1.5,
c= 13,
d= 0.34
)
df1_z <- df1 %>%
group_by(a) %>%
mutate(across(b:d, list(zscore = ~as.numeric(scale(.))))) %>%
ungroup %>%
mutate(total = rowSums(select(., ends_with('zscore'))))
This was exactly what I wanted at the time, but now I would like something slightly different. In df1_z, instead of the values in the last column called "total", I would like this value to be the sum of the multiplications of the values in the _zscore column and the corresponding values in df2, so: b_zscore x 1.5 + c_zscore x 13 + d_zscore x 0.34.
For example, the first value would be 0.6971403 x 1.5 + 0.100595417 x 13 + 0.01790090 x 0.34 = 2.359537177. Expected outcome for the new total column:
total
2.359537177
16.04147765
13.64141872
9.146152274
-3.380574542
-5.55439223
etc...
How to modify above code to get this result in the new "total" column of df1_z?
You could use the crossprod function:
df1 %>%
group_by(a) %>%
mutate(across(b:d, list(zscore = ~as.numeric(scale(.))))) %>%
ungroup %>%
mutate(total = c(crossprod(t(select(., ends_with('zscore'))),t(df2))))
# A tibble: 30 x 8
a b c d b_zscore c_zscore d_zscore total
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a 7.17 14.8 8.45 0.697 0.101 0.0179 2.36
2 a 7.42 19.7 3.97 0.841 1.17 -1.14 16.0
3 a 5.78 19.2 9.66 -0.108 1.05 0.332 13.6
4 a 5.09 17.7 12.8 -0.508 0.732 1.14 9.15
5 a 7.21 12.9 6.24 0.721 -0.329 -0.555 -3.38
6 a 2.36 13.7 2.50 -2.09 -0.146 -1.52 -5.55
7 a 7.26 10.9 10.7 0.749 -0.774 0.593 -8.74
8 a 5.45 6.18 12.8 -0.302 -1.80 1.14 -23.5
9 b 5.43 18.2 9.55 -0.445 1.12 1.34 14.4
10 b 4.16 12.1 4.11 -1.06 0.0776 -1.02 -0.933
# ... with 20 more rows
Another option:
library(tidyverse)
df1 %>%
group_by(a) %>%
mutate(across(b:d, list(zscore = ~as.numeric(scale(.))))) %>%
ungroup %>%
mutate(total = rowSums(map2_dfc(select(., contains('zscore')), df2, `*`)))
Output:
# A tibble: 30 x 8
a b c d b_zscore c_zscore d_zscore total
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a 7.17 14.8 8.45 0.697 0.101 0.0179 2.36
2 a 7.42 19.7 3.97 0.841 1.17 -1.14 16.0
3 a 5.78 19.2 9.66 -0.108 1.05 0.332 13.6
4 a 5.09 17.7 12.8 -0.508 0.732 1.14 9.15
5 a 7.21 12.9 6.24 0.721 -0.329 -0.555 -3.38
6 a 2.36 13.7 2.50 -2.09 -0.146 -1.52 -5.55
7 a 7.26 10.9 10.7 0.749 -0.774 0.593 -8.74
8 a 5.45 6.18 12.8 -0.302 -1.80 1.14 -23.5
9 b 5.43 18.2 9.55 -0.445 1.12 1.34 14.4
10 b 4.16 12.1 4.11 -1.06 0.0776 -1.02 -0.933
# ... with 20 more rows

Provide tibble names in purrr

I would like to know if it is possible to provide column names in the as_tibble function. I know that I could use the rename function to change column names, but I would like to reduce the number of lines I write. Let's say I want my column names to be a1, a2, a3.
> library(purrr)
> library(tidyverse)
> 1:3 %>%
+ map(~ rnorm(104, .x)) %>%
+ map_dfc(~as_tibble(.x))
# A tibble: 104 x 3
value value1 value2
<dbl> <dbl> <dbl>
1 2.91139409 1.44646163 1.298360
2 0.87725704 4.05341889 3.892296
3 0.73230088 2.72506579 3.520865
4 1.02862344 2.09576397 4.009980
5 0.49159059 -1.23746772 3.172201
6 0.24665840 1.80876495 2.927716
7 0.75112051 2.22486452 2.896452
8 -0.06036349 3.63503054 3.218324
9 1.84431314 1.88562406 2.398761
10 0.70866474 0.08947359 3.954770
# ... with 94 more rows
We can call as_tibble inside map_dfc, and then use setNames(paste0("a", seq_len(ncol(.)))) to rename the columns based on the number of columns.
library(tidyverse)
set.seed(123)
1:3 %>%
map_dfc(~as_tibble(rnorm(104, .x))) %>%
setNames(paste0("a", seq_len(ncol(.))))
# A tibble: 104 x 3
a1 a2 a3
<dbl> <dbl> <dbl>
1 0.440 1.05 4.65
2 0.770 1.95 2.95
3 2.56 1.22 3.12
4 1.07 0.332 3.24
5 1.13 1.62 4.23
6 2.72 2.92 2.48
7 1.46 1.42 2.01
8 -0.265 2.61 4.68
9 0.313 0.382 2.56
10 0.554 1.94 2.28
# ... with 94 more rows

Resources