Reshape long data table to list of wide data tables - r

My question is an expansion of the question posed here
How to reshape data from long to wide format so I will phrase it in a similar way.
The difference is that I want to rearrange one long data table into a list of wide data tables.
dat <- data.table(
sim = rep(c(1,2), each=4),
time = rep(1:4, 2),
value1 = rnorm(8),
value2 = rnorm(8)
)
dat
sim time value1 value2
1 1 1 0.3407 0.5167
2 1 2 -0.7033 0.8416
3 1 3 -0.3795 -0.4717
4 1 4 -0.7460 0.8479
5 2 1 0.8981 -0.7163
6 2 2 -0.3347 -0.6849
7 2 3 0.5013 0.8941
8 2 4 -0.1745 0.0795
I want to reshape it so that I have a list of wide data tables named value1, value2 ... value99 etc...
l = list()
l[["value1"]]
sim 1 2 3 4
1 1 0.3407 -0.7033 -0.3795 -0.7460
5 2 -0.8981 -0.3347 -0.5013 -0.1745
l[["value2"]]
sim 1 2 3 4
1 1 0.5167 0.8416 -0.4717 0.8479
5 2 -0.7163 -0.6849 0.8941 0.0795

Two variants.
data.table
library(data.table)
tmp <- dcast(melt(as.data.table(dat), id = c("sim", "time")), sim + variable ~ time)
tmp <- split(tmp, tmp$variable)
tmp <- lapply(tmp, set, i = NULL, j = "variable", value = NULL)
tmp
# $value1
# sim 1 2 3 4
# <num> <num> <num> <num> <num>
# 1: 1 1.0458737762 -0.4845954 0.1891288 0.05100633
# 2: 2 -0.0002406689 1.8093820 -0.8253280 1.14547045
# $value2
# sim 1 2 3 4
# <num> <num> <num> <num> <num>
# 1: 1 0.03157319 -0.8352058 -0.06876365 0.7467717
# 2: 2 -0.42551873 -0.7720822 0.15276411 0.9885968
I often use magrittr::%>% with data.table as well, so that can be converted into
library(data.table)
library(magrittr) # if %>% is not already available
as.data.table(dat) %>%
melt(., id = c("sim", "time")) %>%
dcast(., sim + variable ~ time) %>%
split(., .$variable) %>%
lapply(., set, i = NULL, j = "variable", value = NULL)
# $value1
# sim 1 2 3 4
# <num> <num> <num> <num> <num>
# 1: 1 1.0458737762 -0.4845954 0.1891288 0.05100633
# 2: 2 -0.0002406689 1.8093820 -0.8253280 1.14547045
# $value2
# sim 1 2 3 4
# <num> <num> <num> <num> <num>
# 1: 1 0.03157319 -0.8352058 -0.06876365 0.7467717
# 2: 2 -0.42551873 -0.7720822 0.15276411 0.9885968
tidyverse
library(dplyr)
library(tidyr) # pivot_longer, pivot_wider
dat %>%
pivot_longer(., -c(sim, time)) %>%
pivot_wider(., names_from = time, values_from = value) %>%
split(., .$name) %>%
lapply(., select, -name)
# $value1
# # A tibble: 2 x 5
# sim `1` `2` `3` `4`
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1.05 -0.485 0.189 0.0510
# 2 2 -0.000241 1.81 -0.825 1.15
# $value2
# # A tibble: 2 x 5
# sim `1` `2` `3` `4`
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 0.0316 -0.835 -0.0688 0.747
# 2 2 -0.426 -0.772 0.153 0.989

My solution to this issue would be to create a nested datafrae of the results. I have provided a brief description of the method followed by a reprex.
I would do this by using pivot_wider() and pivot_longer() to reshape the data. pivot_longer is used first to make each row only contain 1 value with a label for the time, simulation and whether it is value one or two. Then using pivot_wider each row will contain the values at each time with a label for the simulation and which set of values they are. (value1 or value2).
Finally we nest the dataframe using nest which stores all the data for each set of values in a dataframe. This can be accessed as an array of dataframes by nested_vals$data if necessary where nested_vals is the object we assigned the nested dataframe to.
library(tidyverse)
#Setup data
dat <- data.frame(
sim = rep(c(1,2), each=4),
time = rep(1:4, 2),
value1 = rnorm(8),
value2 = rnorm(8)
)
# Construct nested dataframe
nested_vals <- dat %>%
# Format dataset in tidy format
pivot_longer(cols = c(value1, value2)) %>%
# Move the name of the data to the beginning of the dataframe
relocate(name) %>%
# Pivot to matrix form as requested (i.e. times as columns, sims as rows)
pivot_wider(id_cols = c(name, sim), names_from = time, values_from = value) %>%
# Nest results by name
nest(-name)
#> Warning: All elements of `...` must be named.
#> Did you want `data = c(sim, `1`, `2`, `3`, `4`)`?
nested_vals
#> # A tibble: 2 x 2
#> name data
#> <chr> <list>
#> 1 value1 <tibble[,5] [2 x 5]>
#> 2 value2 <tibble[,5] [2 x 5]>
nested_vals$data[[2]]
#> # A tibble: 2 x 5
#> sim `1` `2` `3` `4`
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.0639 0.250 -1.28 0.850
#> 2 2 -1.90 0.000421 0.704 -0.164
Created on 2021-04-07 by the reprex package (v2.0.0)

One more way, with a single pipe syntax
library(tidyverse)
dat %>% pivot_longer(c(value1, value2)) %>%
group_split(name) %>% setNames(map(., ~.x[[3]][1])) %>%
map(~ .x %>% pivot_wider(id_cols = sim, names_from = time, values_from = value))
$value1
# A tibble: 2 x 5
sim `1` `2` `3` `4`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -0.851 -0.0484 -0.656 -0.121
2 2 -0.645 1.59 -0.274 0.445
$value2
# A tibble: 2 x 5
sim `1` `2` `3` `4`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1.46 -1.62 -0.672 1.43
2 2 1.65 0.790 0.495 0.162

Another approach:
library(dplyr)
library(tidyr)
wide_dat <- dat %>% pivot_wider(id_cols = sim, names_from = time, values_from = starts_with('value'))
lapply(lapply(split.default(wide_dat[-1], sub('_\\d','',names(wide_dat[-1]))), function(x) cbind(wide_dat[1],x)), setNames, c('sim', 1:4))
$value1
sim 1 2 3 4
1 1 -0.1704969 0.2820143 1.181898 2.2377396
2 2 2.1920534 0.8214070 0.421177 0.7601796
$value2
sim 1 2 3 4
1 1 0.1760887 0.3440053 -0.8435849 0.6729751
2 2 -0.1714095 1.5125986 -0.5739871 -0.9648294

A tidyverse solution could be:
library(dplyr)
library(purrr)
library(tidyr)
dat_longer <- dat %>%
tidyr::pivot_longer(starts_with("value"), names_to="col_name", values_to="values")
list_wide <- purrr::map(unique(dat_longer[["col_name"]]),
~dat_longer %>%
dplyr::filter(col_name==.x) %>%
tidyr::pivot_wider(values_from = "values", names_from="time") %>%
select(-col_name)) %>%
purrr::set_names(unique(dat_longer[["col_name"]]))
$value1
# A tibble: 2 x 5
sim `1` `2` `3` `4`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -0.710 -0.334 -0.370 0.777
2 2 0.130 0.877 1.24 -0.202
$value2
# A tibble: 2 x 5
sim `1` `2` `3` `4`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -0.719 -0.909 0.0821 -0.158
2 2 -0.706 1.51 0.234 1.09

Related

Passing multiple columns from function's argument to group_by

Consider the following example:
library(tidyverse)
df <- tibble(
cat = rep(1:2, times = 4, each = 2),
loc = rep(c("a", "b"), each = 8),
value = rnorm(16)
)
df %>%
group_by(cat, loc) %>%
summarise(mean = mean(value), .groups = "drop")
# # A tibble: 4 x 3
# cat loc mean
# * <int> <chr> <dbl>
# 1 1 a -0.563
# 2 1 b -0.394
# 3 2 a 0.159
# 4 2 b 0.212
I would like to make a function of the last two lines that takes a group argument to pass multiple columns to group_by.
Here's a dummy function that computes the mean values by a group of columns as an example:
group_mean <- function(data, col_value, group) {
data %>%
group_by(across(all_of(group))) %>%
summarise(mean = mean({{col_value}}), .groups = "drop")
}
group_mean(df, value, c("cat", "loc"))
# # A tibble: 4 x 3
# cat loc mean
# * <int> <chr> <dbl>
# 1 1 a -0.563
# 2 1 b -0.394
# 3 2 a 0.159
# 4 2 b 0.212
The function works but I would prefer a tidyselect/rlang approach to avoid quoting column names, like so:
group_mean(df, value, c(cat, loc))
# Error: Problem adding computed columns in `group_by()`.
# x Problem with `mutate()` input `..1`.
# x object 'loc' not found
# ℹ Input `..1` is `across(all_of(c(cat, loc)))`.
Enclosing group in {{}} works for a single column but not for multiple columns. How can I do that?
Consider using ... and then we can have the option to use either quoted or unquoted after converting to symbol with ensym
group_mean <- function(data, col_value, ...) {
data %>%
group_by(!!! ensyms(...)) %>%
summarise(mean = mean({{col_value}}), .groups = "drop")
}
-testing
> group_mean(df, value, cat, loc)
# A tibble: 4 x 3
cat loc mean
<int> <chr> <dbl>
1 1 a 0.327
2 1 b -0.291
3 2 a -0.382
4 2 b -0.320
> group_mean(df, value, 'cat', 'loc')
# A tibble: 4 x 3
cat loc mean
<int> <chr> <dbl>
1 1 a 0.327
2 1 b -0.291
3 2 a -0.382
4 2 b -0.320
If we are already using ... as other arguments, then an option is
group_mean <- function(data, col_value, group) {
grp_lst <- as.list(substitute(group))
if(length(grp_lst)> 1) grp_lst <- grp_lst[-1]
grps <- purrr::map_chr(grp_lst, rlang::as_string)
data %>%
group_by(across(all_of(grps))) %>%
summarise(mean = mean({{col_value}}), .groups = "drop")
}
-testing
> group_mean(df, value, c(cat, loc))
# A tibble: 4 x 3
cat loc mean
<int> <chr> <dbl>
1 1 a 0.327
2 1 b -0.291
3 2 a -0.382
4 2 b -0.320

How to mutate a variable in a tibble based on columns named as numbers in R

I have a tibble with columns named as numbers (e.g. 1). I created a function to compute differences between columns, but I don't know how to do it with that type of columns:
<!-- language-all: lang-r -->
library(tidyverse)
df <- tibble(`1` = c(1,2,3), `2` = c(2,4,6))
# This works
df %>%
mutate(diff = `1` - `2`)
#> # A tibble: 3 x 3
#> `1` `2` diff
#> <dbl> <dbl> <dbl>
#> 1 1 2 -1
#> 2 2 4 -2
#> 3 3 6 -3
# But this doesn't
calc_diffs <- function(x, y){
df %>%
mutate(diff := !!x - !!y)
}
calc_diffs(1, 2)
#> # A tibble: 3 x 3
#> `1` `2` diff
#> <dbl> <dbl> <dbl>
#> 1 1 2 -1
#> 2 2 4 -1
#> 3 3 6 -1
<sup>Created on 2020-10-14 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
We can convert to a symbol and evaluate
calc_diffs <- function(x, y){
df %>%
mutate(diff := !! rlang::sym(x) - !!rlang::sym(y))
}
Then, we just pass a string as argument
calc_diffs("1", "2")
# A tibble: 3 x 3
# `1` `2` diff
# <dbl> <dbl> <dbl>
#1 1 2 -1
#2 2 4 -2
#3 3 6 -3
Column names are strings. We could pass index to subset the column, but here the column name is an unusual name that starts with number. So, either we can wrap it with backreference using paste or just pass a string, convert to symbol and evaluate (!!)
Does this work:
> df <- tibble(`1` = c(1,2,3), `2` = c(2,4,6))
> df
# A tibble: 3 x 2
`1` `2`
<dbl> <dbl>
1 1 2
2 2 4
3 3 6
> calc_diffs <- function(x, y){
+ df %>%
+ mutate(diff = {{x}} - {{y}})
+ }
> calc_diffs(`1`,`2`)
# A tibble: 3 x 3
`1` `2` diff
<dbl> <dbl> <dbl>
1 1 2 -1
2 2 4 -2
3 3 6 -3
>

What is the best way to tidily append many columns?

I'm looking to append 30 columns which give values for gamma distributions by using the tidyverse. Here's an example of the data:
data.frame('rank'=1:3,'shape'=c(16,0.2,4),'rate'=c(13,0.4,0.2))
I'd like to use dgamma(1:30,shape,rate) to append 30 columns to the existing dataframe.
You can use map2() in purrr and unnest_wider() in tidyr.
library(tidyverse)
df %>%
mutate(density = map2(shape, rate, dgamma, x = 1:30)) %>%
unnest_wider(density, names_sep = "_")
Or use rowwise() at first and then mutate() with list().
df %>%
rowwise() %>%
mutate(density = list(dgamma(1:30, shape, rate))) %>%
unnest_wider(density, names_sep = "_")
Both of them give
# # A tibble: 3 x 33
# rank shape rate density_1 density_2 density_3 density_4 density_5 density_6 density_7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 16 13 1.15 0.0852 0.0000843 1.43e-8 9.16e-13 3.19e-17 7.28e-22
# 2 2 0.2 0.4 0.122 0.0468 0.0227 1.21e-2 6.77e- 3 3.92e- 3 2.32e- 3
# 3 3 4 0.2 0.000218 0.00143 0.00395 7.67e-3 1.23e- 2 1.73e- 2 2.26e- 2
# # … with 23 more variables: density_8 <dbl>, density_9 <dbl>, density_10 <dbl>, ..., density_30 <dbl>

left_join on a nested list

I have a nested df x and an unnested df y.
How can I join these two together so that the final output is a a single row with the id and val columns from x and a new column for each of the respective num values in order of appearance, num_1, num_2 ...?
library(tidyverse)
x <- tibble(id = list(letters[1:6]), val = 13)
x
#> # A tibble: 1 x 2
#> id val
#> <list> <dbl>
#> 1 <chr [6]> 13
y <- tibble(id = letters[1:6], num = rnorm(6))
y
#> # A tibble: 6 x 2
#> id num
#> <chr> <dbl>
#> 1 a 0.532
#> 2 b -0.106
#> 3 c -0.105
#> 4 d 0.973
#> 5 e -0.825
#> 6 f -0.951
map2(x, y, left_join, by = 'id')
Error in UseMethod("left_join"): no applicable method for 'left_join' applied to an object of class "list"
Created on 2020-08-14 by the reprex package (v0.3.0)
Edit: I'm looking for something loosely like this while still maintaining the ID column.
x %>%
unnest(id) %>%
left_join(y) %>%
mutate(n = row_number()) %>%
pivot_wider(id_cols = -id,
values_from = num,
names_from = n)
#> Joining, by = "id"
#> # A tibble: 1 x 7
#> val `1` `2` `3` `4` `5` `6`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 13 1.28 -0.387 -0.438 -0.0826 0.164 -1.24
Continuing with your work, you can try the following.
library(dplyr)
library(tidyr)
x %>%
unnest(id) %>%
left_join(y, by = "id") %>%
mutate(name = row_number(), id = list(id)) %>%
pivot_wider(values_from = num, names_glue = "num_{name}")
# # A tibble: 1 x 8
# id val num_1 num_2 num_3 num_4 num_5 num_6
# <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 <chr [6]> 13 1.28 -0.387 -0.438 -0.0826 0.164 -1.24
or
x %>%
mutate(num = map(id, ~ tibble::deframe(y) %>% .[match(names(.), .x)] %>% unname)) %>%
unnest_wider(num, names_sep = "_")
# # A tibble: 1 x 8
# id val num_1 num_2 num_3 num_4 num_5 num_6
# <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 <chr [6]> 13 1.28 -0.387 -0.438 -0.0826 0.164 -1.24
Description of the second solution
deframe() in tibble transforms a two-column data.frame to a named vector, the first column is converted to vector names and the second one is converted to vector values. deframe(y) %>% .[match(names(.), .x)] is equivalent to deframe(y)[match(names(deframe(y)), .x)]. The deframe(y) part appears twice, so I move it to the front of a pipe and use the . symbol to represent it behind the pipe. This line is to match the position of id columns of both data and reorder num column of y.
Based on your y you're not going to have multiple columns but adjusting the example a little, is this what you were aiming for?
x <- tibble(id = list(letters[1:6]), val = 13)
y <- tibble(id = rep(letters[1:6],2), num = rnorm(12),
name = paste0("num_", rep(1:2, each = 6)))
map_dfr(x$id[[1]], ~tibble(id = .x, val = x$val)) %>%
left_join(
pivot_wider(y, names_from = name, values_from = num)
)
#> Joining, by = "id"
#> # A tibble: 6 x 4
#> id val num_1 num_2
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 13 0.609 1.97
#> 2 b 13 0.956 -1.84
#> 3 c 13 0.425 0.297
#> 4 d 13 0.0379 -0.784
#> 5 e 13 -0.532 -0.769
#> 6 f 13 0.538 -1.10

weighted mean in dplyr for multiple columns

I'm trying to calculate the weighted mean for multiple columns using dplyr. at the moment I'm stuck with summarize_each which to me seems to be part of the solution. here's some example code:
library(dplyr)
f2a <- c(1,0,0,1)
f2b <- c(0,0,0,1)
f2c <- c(1,1,1,1)
clustervar <- c("A","B","B","A")
weight <- c(10,20,30,40)
df <- data.frame (f2a, f2b, f2c, clustervar, weight, stringsAsFactors=FALSE)
df
what I am looking for is something like
df %>%
group_by (clustervar) %>%
summarise_each(funs(weighted.mean(weight)), select=cbind(clustervar, f2a:f2c))
The result of this is only:
# A tibble: 2 × 4
clustervar select4 select5 select6
<chr> <dbl> <dbl> <dbl>
1 A 25 25 25
2 B 25 25 25
What am I missing here?
You can use summarise_at to specify which columns you want to operate on:
df %>% group_by(clustervar) %>%
summarise_at(vars(starts_with('f2')),
funs(weighted.mean(., weight)))
#> # A tibble: 2 × 4
#> clustervar f2a f2b f2c
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 1 0.8 1
#> 2 B 0 0.0 1
We can reshape it to 'long' format and then do this
library(tidyverse)
gather(df, Var, Val, f2a:f2c) %>%
group_by(clustervar, Var) %>%
summarise(wt =weighted.mean(Val, weight)) %>%
spread(Var, wt)
Or another option is
df %>%
group_by(clustervar) %>%
summarise_each(funs(weighted.mean(., weight)), matches("^f"))
# A tibble: 2 × 4
# clustervar f2a f2b f2c
# <chr> <dbl> <dbl> <dbl>
# 1 A 1 0.8 1
# 2 B 0 0.0 1
Or with summarise_at and matches (another variation of another post - didn't see the other post while posting)
df %>%
group_by(clustervar) %>%
summarise_at(vars(matches('f2')), funs(weighted.mean(., weight)))
# A tibble: 2 × 4
# clustervar f2a f2b f2c
# <chr> <dbl> <dbl> <dbl>
#1 A 1 0.8 1
#2 B 0 0.0 1
Or another option is data.table
library(data.table)
setDT(df)[, lapply(.SD, function(x) weighted.mean(x, weight)),
by = clustervar, .SDcols = f2a:f2c]
# clustervar f2a f2b f2c
#1: A 1 0.8 1
#2: B 0 0.0 1
NOTE: All four answers are based on legitimate tidyverse/data.table syntax and would get the expected output
We can also create a function that makes use of the syntax from devel version of dplyr (soon to be released 0.6.0). The enquo does the similar job of substitute by taking the input arguments and converting it to quosures. Within the group_by/summarise/mutate, we evalute the quosure by unquoting (UQ or !!) it
wtFun <- function(dat, pat, wtcol, grpcol){
wtcol <- enquo(wtcol)
grpcol <- enquo(grpcol)
dat %>%
group_by(!!grpcol) %>%
summarise_at(vars(matches(pat)), funs(weighted.mean(., !!wtcol)))
}
wtFun(df, "f2", weight, clustervar)
# A tibble: 2 × 4
# clustervar f2a f2b f2c
# <chr> <dbl> <dbl> <dbl>
#1 A 1 0.8 1
#2 B 0 0.0 1

Resources