Element-wise median across lists of data frames in R - r

I have several lists, each one containing many data frames. I would like to calculate the element-wise median across the elements of each data frame inside each list, i.e. the median between element [[1]][1,1] of list1, element [[1]][1,1] of list2, and element [[1]][1,1] of list3, and so on for all elements. The data frames have many columns each, but here is some sample data with only two columns:
set.seed(1)
list1 <- list(data.frame(a = sample.int(10, 4), b = sample.int(10, 4)),
data.frame(a = sample.int(10, 4), b = sample.int(10, 4)))
list2 <- list(data.frame(a = sample.int(10, 4), b = sample.int(10, 4)),
data.frame(a = sample.int(10, 4), b = sample.int(10, 4)))
list3 <- list(data.frame(a = sample.int(10, 4), b = sample.int(10, 4)),
data.frame(a = sample.int(10, 4), b = sample.int(10, 4)))
This is the expected result:
[[1]]
a b
7 4
9 9
7 3
4 6
[[2]]
a b
5 7
8 6
2 6
5 2
Any tips?

Using purrr:
library(purrr)
lsts <- list(list1,list2,list3)
map(transpose(lsts),~map_dfc(transpose(.), ~apply(list2DF(.x),1,median)))
[[1]]
# A tibble: 4 × 2
a b
<int> <int>
1 7 4
2 9 9
3 7 3
4 4 6
[[2]]
# A tibble: 4 × 2
a b
<int> <int>
1 5 7
2 8 6
3 2 6
4 5 2
In Base R, assuming they all have the same structure:
dims <- c(dim(list1[[1]]), length(list1), length(lsts))
d <- apply(array(unlist(lsts), dims), head(seq(dims),-1), median)
asplit(aperm(d, c(1,3,2)), 3)
[[1]]
[,1] [,2]
[1,] 7 4
[2,] 9 9
[3,] 7 3
[4,] 4 6
[[2]]
[,1] [,2]
[1,] 5 7
[2,] 8 6
[3,] 2 6
[4,] 5 2

Here's another (base R) option:
Map(function(...) {
dots1 <- list(...)
out <- do.call(mapply, c(list(FUN=function(...) {
dots2 <- list(...)
apply(do.call(cbind, dots2), 1, median)
}), dots1))
data.frame(out)
}, list1, list2, list3)
# [[1]]
# a b
# 1 7 4
# 2 9 9
# 3 7 3
# 4 4 6
# [[2]]
# a b
# 1 5 7
# 2 8 6
# 3 2 6
# 4 5 2
Certainly not beautiful, but functional.
A related dplyr option:
list(list1, list2, list3) |>
lapply(bind_rows, .id = "id1") |>
bind_rows(.id = "id2") |>
group_by(id1, id2) |>
mutate(rn = row_number()) |>
group_by(id1, rn) |>
summarize(across(c(a, b), ~ median(.))) |>
ungroup() |>
select(-rn) |>
group_nest(id1) |>
pull(data)
# [[1]]
# # A tibble: 4 × 2
# a b
# <int> <int>
# 1 7 4
# 2 9 9
# 3 7 3
# 4 4 6
# [[2]]
# # A tibble: 4 × 2
# a b
# <int> <int>
# 1 5 7
# 2 8 6
# 3 2 6
# 4 5 2

Certainly not the most efficient solution, but one option with tidyverse might be:
map_dfr(mget(ls(pattern = "list")),
function(list_of_lists) imap(list_of_lists,
function(lists, lists_id)
lists %>%
mutate(rowid = row_number(),
lists_id = lists_id))) %>%
group_by(rowid, lists_id) %>%
summarise(across(c(a, b), median))
rowid lists_id a b
<int> <int> <int> <int>
1 1 1 10 3
2 1 2 8 1
3 2 1 5 4
4 2 2 9 8
5 3 1 6 6
6 3 2 6 3
7 4 1 3 2
8 4 2 4 6
If the goal is to return a list:
map_dfr(mget(ls(pattern = "list")),
function(list_of_lists) imap(list_of_lists,
function(lists, lists_id)
lists %>%
mutate(rowid = row_number(),
lists_id = lists_id))) %>%
group_by(rowid, lists_id) %>%
summarise(across(c(a, b), median)) %>%
ungroup() %>%
group_split(lists_id)
[[1]]
# A tibble: 4 × 4
rowid lists_id a b
<int> <int> <int> <int>
1 1 1 10 3
2 2 1 5 4
3 3 1 6 6
4 4 1 3 2
[[2]]
# A tibble: 4 × 4
rowid lists_id a b
<int> <int> <int> <int>
1 1 2 8 1
2 2 2 9 8
3 3 2 6 3
4 4 2 4 6

Here is a tidyverse solution first draft (I am sure that it could be improved):
library(tidyverse)
bind_rows(list1, list2, list3) %>%
mutate(x =rep(1:3, each=8, length.out = n())) %>%
group_by(x) %>%
pivot_wider(names_from = x,
values_from = c(a,b),
values_fn = list) %>%
unnest() %>%
rowwise() %>%
transmute(a = median(c(a_1, a_2, a_3)),
b = median(c(b_1, b_2, b_3))
) %>%
ungroup() %>%
group_by(x = as.integer(gl(n(),4,n()))) %>%
group_split() %>%
map(.,~(.x %>%select(-x)))
[[1]]
# A tibble: 4 × 2
a b
<int> <int>
1 7 4
2 9 9
3 7 3
4 4 6
[[2]]
# A tibble: 4 × 2
a b
<int> <int>
1 5 7
2 8 6
3 2 6
4 5 2

Related

Is there a way to get subdataframes with purrr in magrittr pipes workflow without using data.frame name?

That is, I was interested in doing the same as in the example, but with purrr functions.
tibble(a, b = a * 2, c = 1) %>%
{lapply(X = names(.), FUN = function(.x) select(., 1:.x))}
[[1]]
# A tibble: 5 x 1
a
<int>
1 1
2 2
3 3
4 4
5 5
[[2]]
# A tibble: 5 x 2
a b
<int> <dbl>
1 1 2
2 2 4
3 3 6
4 4 8
5 5 10
[[3]]
# A tibble: 5 x 3
a b c
<int> <dbl> <dbl>
1 1 2 1
2 2 4 1
3 3 6 1
4 4 8 1
5 5 10 1
I only could do it if I named foo <- tibble(a, b = a * 2, c = 1) and inside map I did select(foo, ...), but I wanted to avoid that, since I wanted to mutate the named dataframe in pipe workflow.
Thank you!
You can use map in the following way :
library(dplyr)
library(purrr)
tibble(a = 1:5, b = a * 2, c = 1) %>%
{map(names(.), function(.x) select(., 1:.x))}
Based on your actual use case you can also use imap which will pass column value (.x) along with it's name (.y).
tibble(a = 1:5, b = a * 2, c = 1) %>%
imap(function(.x, .y) select(., 1:.y))
#$a
# A tibble: 5 x 1
# a
# <int>
#1 1
#2 2
#3 3
#4 4
#5 5
#$b
# A tibble: 5 x 2
# a b
# <int> <dbl>
#1 1 2
#2 2 4
#3 3 6
#4 4 8
#5 5 10
#$c
# A tibble: 5 x 3
# a b c
# <int> <dbl> <dbl>
#1 1 2 1
#2 2 4 1
#3 3 6 1
#4 4 8 1
#5 5 10 1

How to create combinations of values of one variable by group using tidyverse in R

I am using the combn function in R to get all the combinations of the values of variable y taking each time 2 values, grouping by the values of x. My expected final result is the tibble c.
But when I try to do it in tidyverse something is (very) wrong.
library(tidyverse)
df <- tibble(x = c(1, 1, 1, 2, 2, 2, 2),
y = c(8, 9, 7, 3, 5, 2, 1))
# This is what I want
a <- combn(df$y[df$x == 1], 2)
a <- rbind(a, rep(1, ncol(a)))
b <- combn(df$y[df$x == 2], 2)
b <- rbind(b, rep(2, ncol(b)))
c <- cbind(a, b)
c <- tibble(c)
c <- t(c)
# but using tidyverse it does not work
df %>% group_by(x) %>% mutate(z = combn(y, 2))
#> Error: Problem with `mutate()` input `z`.
#> x Input `z` can't be recycled to size 3.
#> i Input `z` is `combn(y, 2)`.
#> i Input `z` must be size 3 or 1, not 2.
#> i The error occurred in group 1: x = 1.
Created on 2020-11-18 by the reprex package (v0.3.0)
Try with combn
out = df %>% group_by(x) %>% do(data.frame(t(combn(.$y, 2))))
# A tibble: 9 x 3
# Groups: x [2]
x X1 X2
<dbl> <dbl> <dbl>
1 1 8 9
2 1 8 7
3 1 9 7
4 2 3 5
5 2 3 2
6 2 3 1
7 2 5 2
8 2 5 1
9 2 2 1
If you have dplyr v1.0.2, you can do this
df %>% group_by(x) %>% group_modify(~as_tibble(t(combn(.$y, 2L))))
Output
# A tibble: 9 x 3
# Groups: x [2]
x V1 V2
<dbl> <dbl> <dbl>
1 1 8 9
2 1 8 7
3 1 9 7
4 2 3 5
5 2 3 2
6 2 3 1
7 2 5 2
8 2 5 1
9 2 2 1
An option with summarise and unnest
library(dplyr)
library(tidyr)
df %>%
group_by(x) %>%
summarise(y = list(as.data.frame(t(combn(y, 2)))), .groups = 'drop') %>%
unnest(c(y))
# A tibble: 9 x 3
# x V1 V2
# <dbl> <dbl> <dbl>
#1 1 8 9
#2 1 8 7
#3 1 9 7
#4 2 3 5
#5 2 3 2
#6 2 3 1
#7 2 5 2
#8 2 5 1
#9 2 2 1

Unnest or unchop dataframe containing lists of different lengths

I have a dataframe with several columns containing list columns that I want to unnest (or unchop). BUT, they are different lengths, so the resulting error is Error: No common size for...
Here is a reprex to show what works and doesn't work.
library(tidyr)
library(vctrs)
# This works as expected
df_A <- tibble(
ID = 1:3,
A = as_list_of(list(c(9, 8, 5), c(7,6), c(6, 9)))
)
unchop(df_A, cols = c(A))
# A tibble: 7 x 2
ID A
<int> <dbl>
1 1 9
2 1 8
3 1 5
4 2 7
5 2 6
6 3 6
7 3 9
# This works as expected as the lists are the same lengths
df_AB_1 <- tibble(
ID = 1:3,
A = as_list_of(list(c(9, 8, 5), c(7,6), c(6, 9))),
B = as_list_of(list(c(1, 2, 3), c(4, 5), c(7, 8)))
)
unchop(df_AB_1, cols = c(A, B))
# A tibble: 7 x 3
ID A B
<int> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 3
4 2 7 4
5 2 6 5
6 3 6 7
7 3 9 8
# This does NOT work as the lists are different lengths
df_AB_2 <- tibble(
ID = 1:3,
A = as_list_of(list(c(9, 8, 5), c(7,6), c(6, 9))),
B = as_list_of(list(c(1, 2), c(4, 5, 6), c(7, 8, 9, 0)))
)
unchop(df_AB_2, cols = c(A, B))
# Error: No common size for `A`, size 3, and `B`, size 2.
The output that I would like to achieve for df_AB_2 above is as follows where each list is unchopped and missing values are filled with NA:
# A tibble: 10 x 3
ID A B
<dbl> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 NA
4 2 7 4
5 2 6 5
6 2 NA 6
7 3 6 7
8 3 9 8
9 3 NA 9
10 3 NA 0
I have referenced this issue on Github and StackOverflow here.
Any ideas how to achieve the result above?
Versions
> packageVersion("tidyr")
[1] ‘1.0.0’
> packageVersion("vctrs")
[1] ‘0.2.0.9001’
Here is an idea via dplyr that you can generalise to as many columns as you want,
library(tidyverse)
df_AB_2 %>%
pivot_longer(c(A, B)) %>%
mutate(value = lapply(value, `length<-`, max(lengths(value)))) %>%
pivot_wider(names_from = name, values_from = value) %>%
unnest() %>%
filter(rowSums(is.na(.[-1])) != 2)
which gives,
# A tibble: 10 x 3
ID A B
<int> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 NA
4 2 7 4
5 2 6 5
6 2 NA 6
7 3 6 7
8 3 9 8
9 3 NA 9
10 3 NA 0
Defining a helper function to update the lengths of the element and proceeding with dplyr:
foo <- function(x, len_vec) {
lapply(
seq_len(length(x)),
function(i) {
length(x[[i]]) <- len_vec[i]
x[[i]]
}
)
}
df_AB_2 %>%
mutate(maxl = pmax(lengths(A), lengths(B))) %>%
mutate(A = foo(A, maxl), B = foo(B, maxl)) %>%
unchop(cols = c(A, B)) %>%
select(-maxl)
# A tibble: 10 x 3
ID A B
<int> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 NA
4 2 7 4
5 2 6 5
6 2 NA 6
7 3 6 7
8 3 9 8
9 3 NA 9
10 3 NA 0
Using data.table:
library(data.table)
setDT(df_AB_2)
df_AB_2[, maxl := pmax(lengths(A), lengths(B))]
df_AB_2[, .(unlist(A)[seq_len(maxl)], unlist(B)[seq_len(maxl)]), by = ID]

How to use group_by with summarise and summarise_all?

x y
1 1 1
2 3 2
3 2 3
4 3 4
5 2 5
6 4 6
7 5 7
8 2 8
9 1 9
10 1 10
11 3 11
12 4 12
The above is part of the input.
Let's suppose that it also has a bunch of other columns
I want to:
group_by x
summarise y by sum
And for all other columns, I want to summarise_all by just taking the first value
Here's an approach that breaks it into two problems and combines them:
library(dplyr)
left_join(
# Here we want to treat column y specially
df %>%
group_by(x) %>%
summarize(sum_y = sum(y)),
# Here we exclude y and use a different summation for all the remaining columns
df %>%
group_by(x) %>%
select(-y) %>%
summarise_all(first)
)
# A tibble: 5 x 3
x sum_y z
<int> <int> <int>
1 1 20 1
2 2 16 3
3 3 17 2
4 4 18 2
5 5 7 3
Sample data:
df <- read.table(
header = T,
stringsAsFactors = F,
text="x y z
1 1 1
3 2 2
2 3 3
3 4 4
2 5 1
4 6 2
5 7 3
2 8 4
1 9 1
1 10 2
3 11 3
4 12 4")
library(dplyr)
df1 %>%
group_by(x) %>%
summarise_each(list(avg = mean), -y) %>%
bind_cols(.,{df1 %>%
group_by(x) %>%
summarise_at(vars(y), funs(sum)) %>%
select(-x)
})
#> # A tibble: 5 x 4
#> x r_avg r.1_avg y
#> <int> <dbl> <dbl> <int>
#> 1 1 6.67 6.67 20
#> 2 2 5.33 5.33 16
#> 3 3 5.67 5.67 17
#> 4 4 9 9 18
#> 5 5 7 7 7
Created on 2019-06-20 by the reprex package (v0.3.0)
Data:
df1 <- read.table(text="
r x y
1 1 1
2 3 2
3 2 3
4 3 4
5 2 5
6 4 6
7 5 7
8 2 8
9 1 9
10 1 10
11 3 11
12 4 12", header=T)
df1 <- df1[,c(2,3,1,1)]
library(tidyverse)
df <- tribble(~x, ~y, # making a sample data frame
1, 1,
3, 2,
2, 3,
3, 4,
2, 5,
4, 6,
5, 7,
2, 8,
1, 9,
1, 10,
3, 11,
4, 12)
df <- df %>%
add_column(z = sample(1:nrow(df))) #add another column for the example
df
# If there is only one additional column and you need the first value
df %>%
group_by(x) %>%
summarise(sum_y = sum(y), z_1st = z[1])
# otherwise use summarise_at to address all the other columns
f <- function(x){x[1]} # function to extract the first value
df %>%
group_by(x) %>%
summarise_at(.vars = vars(-c('y')), .funs = f) # exclude column y from the calculations

adding grouping indicator for repeating sequences

I thought this is simple thing but failed and can't find answer from anywhere.
Example data looks like this. I have nro running from 1:x and restarts at random points. I would like to create ind variable which would be 1 for first run and 2 for second...
tbl <- tibble(nro = c(rep(1:3, 1), rep(1:5, 1), rep(1:4, 1)))
End result should look like this:
tibble(nro = c(rep(1:3, 1), rep(1:5, 1), rep(1:4, 1)),
ind = c(rep(1, 3), rep(2, 5), rep(3, 4)))
# A tibble: 12 x 2
nro ind
<int> <dbl>
1 1 1
2 2 1
3 3 1
4 1 2
5 2 2
6 3 2
7 4 2
8 5 2
9 1 3
10 2 3
11 3 3
12 4 3
I thought I could do something with ifelse but failed miserably.
tbl %>%
mutate(ind = ifelse(nro < lag(nro), 1 + lag(ind), 1))
I assume this needs some kind of loop.
for sequences of the same length
You could use group_by on your nro variable and then just take the row_number():
tbl %>%
group_by(nro) %>%
mutate(ind = row_number())
# A tibble: 12 x 2
# Groups: nro [4]
# nro ind
# <int> <int>
# 1 1 1
# 2 2 1
# 3 3 1
# 4 4 1
# 5 1 2
# 6 2 2
# 7 3 2
# 8 4 2
# 9 1 3
# 10 2 3
# 11 3 3
# 12 4 3
for varying length of the sequences
inspired by docendo discimus's comment
tbl <- tibble(nro = c(rep(1:3, 1), rep(1:5, 1), rep(1:4, 1)))
tbl %>%
mutate(ind = cumsum(nro == 1))
However, this is limited to sequences which begin with 1, since only the TRUE values of nro == 1 are cumulated.
thus, you should consider to use this:
tbl %>% mutate(dif = nro - lag(nro)) %>%
mutate(dif = ifelse(is.na(dif), nro, dif)) %>%
mutate(ind = cumsum(dif < 0) + 1) %>%
select(-dif)
# A tibble: 12 x 2
# nro ind
# <int> <dbl>
# 1 1 1
# 2 2 1
# 3 3 1
# 4 1 2
# 5 2 2
# 6 3 2
# 7 4 2
# 8 5 2
# 9 1 3
# 10 2 3
# 11 3 3
# 12 4 3

Resources