R: Apply function over nested lists with varying length - r

I have the following dataframe:
df <- data.frame(id = paste0('id', sample(c(1:4),80000, replace = TRUE)), date = as.Date(rbeta(80000, 0.7, 10) * 100, origin = "2016-01-01"),
variant = sample(c(0:1), 80000, replace = TRUE), type = sample(paste0(LETTERS[1:3],LETTERS[1]), 80000, TRUE), code = sample(letters[1:2], 80000, TRUE),
level = sample(LETTERS[1:8], 80000, TRUE), number = sample(c(1:100), 80000, replace = TRUE) )
Next, I split the dataframe several times and combine them (plus the original df) in a list:
dfs <- split(df,df$id)
df2 <- lapply(dfs, function(x) split(x,x$type))
df3 <- lapply(dfs, function(x) split(x,x$code))
df4 <- lapply(dfs, function(x) split(x,x$level))
df_all <- list(dfs,df2,df3,df4)
Thus, I first split the dataframe by Id, after which their are splitted on several conditions: none,type,code and level. Where "none" means that I don't split it any further.
My first question: is there a faster/cleaner way to achieve this?
Second question: how do I apply a function to each element of this list? It will probably will have something to do with lapply, but I can't figure out how, as the number of nested lists varies. Thus, to make it more clear, I would like to know how to apply my function to:
df_all[[1]]$id1
df_all[[1]]$id2
df_all[[1]]$id3
df_all[[1]]$id4
df_all[[2]]$id1$AA
df_all[[2]]$id1$BA
df_all[[2]]$id1$CA
df_all[[2]]$id2$AA
etc.
My function is as follows:
func <- function(x){
x <- x %>%
group_by(variant) %>%
summarise(H = sum(number)) %>%
ungroup()

If all you wanted to do is group by different combination of variables and summarize, then splitting the groups is probably not a good idea, just modify the function so that you can input different combinations of group by variables like the following:
library(dplyr)
func2 <- function(x, ...){
group_quo = quos(...)
x %>%
group_by(!!!group_quo) %>%
summarize(H = sum(number))
}
Result:
> func2(df, id, variant)
# A tibble: 8 x 3
# Groups: id [?]
id variant H
<fct> <int> <int>
1 id1 0 500192
2 id1 1 508282
3 id2 0 505829
4 id2 1 511855
5 id3 0 502280
6 id3 1 510854
7 id4 0 502621
8 id4 1 510372
> func2(df, id, type, variant)
# A tibble: 24 x 4
# Groups: id, type [?]
id type variant H
<fct> <fct> <int> <int>
1 id1 AA 0 167757
2 id1 AA 1 169025
3 id1 BA 0 166225
4 id1 BA 1 168208
5 id1 CA 0 166210
6 id1 CA 1 171049
7 id2 AA 0 169277
8 id2 AA 1 172240
9 id2 BA 0 168596
10 id2 BA 1 169396
# ... with 14 more rows
etc.
If you're trying to apply something more complex or you want to keep the hierarchical structure of the lists, you can try to use nested data.frames:
library(dplyr)
library(tidyr)
library(purrr)
func <- function(x){
x %>%
group_by(variant) %>%
summarize(H = sum(number))
}
df_nested = df %>%
group_by(id) %>%
nest() %>%
mutate(df1 = data %>% map(func),
df2 = data %>% map(~group_by(., type) %>% nest()),
df3 = data %>% map(~group_by(., code) %>% nest()),
df4 = data %>% map(~group_by(., level) %>% nest())) %>%
mutate_at(vars(df2:df4),
funs(map(., function(x) mutate(x, data = map(data, func)) %>% unnest)))
Result:
> df_nested
# A tibble: 4 x 6
id data df1 df2 df3 df4
<fct> <list> <list> <list> <list> <list>
1 id1 <tibble [19,963 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
2 id3 <tibble [19,946 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
3 id2 <tibble [20,114 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
4 id4 <tibble [19,977 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
> df_nested %>%
+ select(id, data) %>%
+ unnest()
# A tibble: 80,000 x 7
id date variant type code level number
<fct> <date> <int> <fct> <fct> <fct> <int>
1 id1 2016-01-05 1 AA b H 71
2 id1 2016-01-01 0 CA a G 85
3 id1 2016-01-03 0 CA a E 98
4 id1 2016-01-01 1 BA b E 78
5 id1 2016-01-01 1 BA b G 64
6 id1 2016-01-18 1 AA a E 69
7 id1 2016-01-04 1 BA b E 12
8 id1 2016-01-02 0 CA b B 32
9 id1 2016-01-01 1 CA a B 44
10 id1 2016-01-02 0 BA a F 89
# ... with 79,990 more rows
> df_nested %>%
+ select(id, df1) %>%
+ unnest()
# A tibble: 8 x 3
id variant H
<fct> <int> <int>
1 id1 0 500192
2 id1 1 508282
3 id3 0 502280
4 id3 1 510854
5 id2 0 505829
6 id2 1 511855
7 id4 0 502621
8 id4 1 510372

Related

Nest a tibble by column prefix

We do a normal nesting grouping by rows. Mine is different.
I want to create a nested tibble grouping by column prefixes (before the first '_'), preserving the original column names in the nested tibbles.
The current approach works but looks overcomplicated.
tibble(a_1=1:3, a_2=2:4, b_1=3:5) %>%
print() %>%
# A tibble: 3 x 3
# a_1 a_2 b_1
# <int> <int> <int>
# 1 1 2 3
# 2 2 3 4
# 3 3 4 5
pivot_longer(everything()) %>%
nest(data=-name) %>%
mutate(data=map2(data, name, ~rename(.x, '{.y}' := value))) %>%
mutate(gr=str_extract(name, '^[^_]+'), .keep='unused') %>%
nest(data=-gr) %>%
mutate(data=map(data, ~bind_cols(.[[1]]))) %>%
print() %>%
# A tibble: 2 x 2
# gr data
# <chr> <list>
# 1 a <tibble [3 x 2]>
# 2 b <tibble [3 x 1]>
{ .$data[[1]] }
# A tibble: 3 x 2
# a_1 a_2
# <int> <int>
# 1 1 2
# 2 2 3
# 3 3 4
UPD: if possible, tidyverse solution
Using a neat little trick I learned lately you could do:
library(tidyr)
library(dplyr, warn = FALSE)
tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5) %>%
split.default(., gsub("_[0-9]", "", names(.))) %>%
lapply(nest, data = everything()) %>%
bind_rows(.id = "gr")
#> # A tibble: 2 × 2
#> gr data
#> <chr> <list>
#> 1 a <tibble [3 × 2]>
#> 2 b <tibble [3 × 1]>
Another possible solution, based on purrr::map_dfr:
library(tidyverse)
map_dfr(unique(str_remove(names(df), "_\\d+")),
~ tibble(gr = .x, nest(select(df, which(str_detect(names(df), .x))),
data = everything())))
#> # A tibble: 2 × 2
#> gr data
#> <chr> <list>
#> 1 a <tibble [3 × 2]>
#> 2 b <tibble [3 × 1]>
my version, a little more modified, tidyversed version of stepan's answer
tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5) %>%
split.default(str_extract(names(.), "^[^_]+")) %>%
map(nest, data = everything()) %>%
bind_rows(.id = "gr")
Couldn't find an alternative to split.default()

mutate_if, across or map_if where want to process the iteration conditional on value in another field

Simple example:
mydf <- data.frame(
x = 1:3,
y = c(1, 0, 1),
z = 1:3
) %>% group_by(x) %>% nest
mydf %>% mutate(blah = map_dbl(.x = data, ~ .x$z * 2))
Returns:
# A tibble: 3 x 3
# Groups: x [3]
x data blah
<int> <list> <dbl>
1 1 <tibble [1 × 2]> 2
2 2 <tibble [1 × 2]> 4
3 3 <tibble [1 × 2]> 6
I would like to mutate or map conditional on y. If y=1, then process with .x * 2 else (y = 0) then just use NA.
Desired result:
# A tibble: 3 x 3
# Groups: x [3]
x data blah
<int> <list> <dbl>
1 1 <tibble [1 × 2]> 2
2 2 <tibble [1 × 2]> NA
3 3 <tibble [1 × 2]> 6
Should I use mutate_if, mutate_across, map_if? How can I get this result?
In case the OP needs to retain the map model in their real use case, map2() is one possibility...
mydf %>% mutate(blah = map2(x, y, ~ifelse(.y == 1, .x * 2, NA)))
x y blah
1 1 1 2
2 2 0 NA
3 3 1 6

turn time series data into nested data frame where groups are increasing time periods (w/dplyr & tidyr)

I have a dataset with daily counts per year spanning several decades, and I'd like to run a function on different subsets of that data based on an increasing timespan. For example, I'd like to run the function on the first decade of data (1995-2005), then on the first decade + 1 (1995-2006), first decade + 2 (1995-2007), and so on until the end of the time series. This is what I had in mind:
dat <- tibble(
year = rep(1995:2014, each = 30),
count = rpois(600, 5)
)
dat
# A tibble: 600 x 2
year count
<int> <int>
1 1995 8
2 1995 3
3 1995 9
4 1995 2
5 1995 8
6 1995 7
7 1995 3
8 1995 6
9 1995 1
10 1995 7
# … with 590 more rows
with the final product looking like this:
# A tibble: 3 x 2
time_span data
<chr> <list>
1 1995-2004 <tibble [300 × 1]>
2 1995-2005 <tibble [330 × 1]>
3 1995-2006 <tibble [360 × 1]>
...
I would then apply my function to the nested data frame:
dat_nested %>%
mutate(result = map(data, my_function))
I'm struggling to think of a way to create these subsets with dplyr...any suggestions? Thanks!
Here's a way using map :
library(dplyr)
n <- min(dat$year)
purrr::map_df((n+10):max(dat$year),
~dat %>%
filter(between(year, n, .x)) %>%
summarise(year = paste(min(year), max(year), sep = '-'),
data = list(count)))
#If you want dataframe
#data = list(data.frame(count = count))))
# year data
# <chr> <list>
# 1 1995-2005 <int [330]>
# 2 1995-2006 <int [360]>
# 3 1995-2007 <int [390]>
# 4 1995-2008 <int [420]>
# 5 1995-2009 <int [450]>
# 6 1995-2010 <int [480]>
# 7 1995-2011 <int [510]>
# 8 1995-2012 <int [540]>
# 9 1995-2013 <int [570]>
#10 1995-2014 <int [600]>
The result could be directly calculated from the original data frame without the need of an intermediate nested data frame and we show that below; however, if you do want to create a nested data frame anyways then use the same code but use it with
my_function <- base::list
to nest the two columns or with
my_function <- function(x) list(x["count"])
to just nest the count column. The solution only uses dplyr. It does not use tidyr or purrr.
library(dplyr)
my_function <- function(x) sum(x$count) # test function
dat %>%
group_by(year) %>%
summarize(result = my_function(.[.$year <= first(year), ]), .groups = "drop") %>%
mutate(year = paste(first(year), year, sep = "-")) %>%
tail(-9)
giving:
# A tibble: 11 x 2
year result
<chr> <int>
1 1995-2004 1502
2 1995-2005 1647
3 1995-2006 1810
4 1995-2007 1957
5 1995-2008 2106
6 1995-2009 2258
7 1995-2010 2398
8 1995-2011 2547
9 1995-2012 2697
10 1995-2013 2855
11 1995-2014 3016
With my_function <- function(x) list(x["count"]) the output looks like this:
# A tibble: 11 x 2
year result
<chr> <list>
1 1995-2004 <tibble [300 x 1]>
2 1995-2005 <tibble [330 x 1]>
3 1995-2006 <tibble [360 x 1]>
4 1995-2007 <tibble [390 x 1]>
5 1995-2008 <tibble [420 x 1]>
6 1995-2009 <tibble [450 x 1]>
7 1995-2010 <tibble [480 x 1]>
8 1995-2011 <tibble [510 x 1]>
9 1995-2012 <tibble [540 x 1]>
10 1995-2013 <tibble [570 x 1]>
11 1995-2014 <tibble [600 x 1]>
Note
The test input dat in reproducible form is:
set.seed(123)
dat <- data.frame(year = rep(1995:2014, each = 30), count = rpois(600, 5))
Here is my attempt to create a nested data with time-series data on a rolling window basis. (note: rlang usage of var=enquo(str_varname) with !!var may change in the future versions.)
library(dplyr)
library(tidyr)
create_rolling_yr_data <- function(df, year='year', rolling=9,
var_list=c('count'), newvar='rolling') {
year <- enquo(year)
var_list <- enquos(var_list)
df <- df %>% dplyr::select(!!year, !!!var_list)
df_nest <- df %>% group_by(year) %>% nest()
print(df_nest)
list_data <- list()
yrs <- unique(df[[ensym(year)]])
yr_end <- max(yrs) - rolling
for (i in seq_along(yrs)) {
yr <- yrs[i]
if (yr <= yr_end) {
list_data[[i]] <- df %>% filter(year >= yr, year <= (yr+rolling))
} else {
list_data[[i]] <- list()
}
}
df_nest[[newvar]] <- list_data
return(df_nest %>% filter(year <= yr_end))
}
create_rolling_yr_data(dat, year='year', rolling=9,
var_list=c('count'), newvar='rolling')

R collapse column to form numeric list

In R hoe do I collapse column to form another column with numeric lists types.
like we define numeric list as l = c(1,2,3)
df <- read.table(text = "X Y
a 26
a 3
a 24
b 8
b 1
b 4
", header = TRUE)
I am trying this with dplyr but it gives me character list column
> df %>% group_by(X) %>% summarise(lst= paste0(Y, collapse = ","))
# A tibble: 2 x 2
X lst
<fct> <chr>
1 a 26,3,24
2 b 8,1,4
group by X then summarise Y as list
library(dplyr)
out <- df %>%
group_by(X) %>%
summarise(Y = list(Y))
out
# A tibble: 2 x 2
# X Y
# <fct> <list>
#1 a <int [3]>
#2 b <int [3]>
The Y column now looks like this
out$Y
#[[1]]
#[1] 26 3 24
#
#[[2]]
#[1] 8 1 4
nest seems to be another option but this would result in a list column of tibbles (not what you want I think)
df %>%
group_by(X) %>%
nest()
# A tibble: 2 x 2
# X data
# <fct> <list>
#1 a <tibble [3 × 1]>
#2 b <tibble [3 × 1]>
A data.table solution:
library(data.table)
dt <- as.data.table(df)[, list(Y=list(Y)), by="X"]
> dt
X Y
1: a 26, 3,24
2: b 8,1,4
> dt$Y
[[1]]
[1] 26 3 24
[[2]]
[1] 8 1 4

Creating tibble or data frame of tibbles or data frames and other class

Is it possible to create a tibble or data.frame, which has columns that are integers and other columns that are tibbles or data.frames?
E.g.:
library(tibble)
set.seed(1)
df.1 <- tibble(name=sample(LETTERS,20,replace = F),score=sample(1:100,20,replace = F))
df.2 <- tibble(name=sample(LETTERS,20,replace = F),score=sample(1:100,20,replace = F))
And then:
df <- tibble(id=1,rank=2,data=df.1)
which gives this error:
Error: Column `data` must be a 1d atomic vector or a list
I guess df.1 has to be a list for this to work?
Is this what you are looking for? I think the key is the length of each column should be the same, and we need to use list to create a list column to store df.1 and df.2.
df <- tibble(id = 1:2,
rank = 2,
data = list(df.1, df.2))
df
# # A tibble: 2 x 3
# id rank data
# <int> <dbl> <list>
# 1 1 2 <tibble [20 x 2]>
# 2 2 2 <tibble [20 x 2]>
head(df$data[[1]])
# # A tibble: 6 x 2
# name score
# <chr> <int>
# 1 G 94
# 2 J 22
# 3 N 64
# 4 U 13
# 5 E 26
# 6 S 37
head(df$data[[2]])
# # A tibble: 6 x 2
# name score
# <chr> <int>
# 1 V 92
# 2 Q 30
# 3 S 45
# 4 M 33
# 5 L 63
# 6 Y 25
And since the structure of each tibble in the data column are the same. We can use tidyr::unnest to expand the tibble.
library(tidyr)
df_un <- unnest(df)
# # A tibble: 40 x 4
# id rank name score
# <int> <dbl> <chr> <int>
# 1 1 2 G 94
# 2 1 2 J 22
# 3 1 2 N 64
# 4 1 2 U 13
# 5 1 2 E 26
# 6 1 2 S 37
# 7 1 2 W 2
# 8 1 2 M 36
# 9 1 2 L 81
# 10 1 2 B 31
# # ... with 30 more rows
And we can also nest the tibble, making it back to the original format with a list column.
library(dplyr)
df_n <- df_un %>%
group_by(id, rank) %>%
nest() %>%
ungroup()
df_n
# # A tibble: 2 x 3
# id rank data
# <int> <dbl> <list>
# 1 1 2 <tibble [20 x 2]>
# 2 2 2 <tibble [20 x 2]>
# Check if df and df_n are the same
identical(df_n, df)
# [1] TRUE
Using tidyr's nest:
set.seed(1)
df.1 <- data.frame(name=sample(LETTERS,20,replace = F),score=sample(1:100,20,replace = F))
df.2 <- data.frame(name=sample(LETTERS,20,replace = F),score=sample(1:100,20,replace = F))
I can create a tibble where df.1 is nested under id and rank:
library(dplyr)
library(tidyr)
data.frame(id=1,rank=2,data=df.1) %>% nest(-id,-rank)
# A tibble: 1 × 3
id rank data
<dbl> <dbl> <list>
1 1 2 <tibble [20 × 2]>
For having both df.1 and df.2 in a tibble, I'd simply do:
data.frame(id=c(1,2),rank=c(2,1),data=c(df.1,df.2)) %>% nest(-id,-rank)
# A tibble: 2 × 3
id rank data
<dbl> <dbl> <list>
1 1 2 <tibble [10 × 4]>
2 2 1 <tibble [10 × 4]>

Resources