R collapse column to form numeric list - r

In R, how do I collapse a column to form another column of numeric list type?
For example, we define a numeric list as l = c(1,2,3).
df <- read.table(text = "X Y
a 26
a 3
a 24
b 8
b 1
b 4
", header = TRUE)
I am trying this with dplyr, but it gives me a character column rather than a numeric list column:
> df %>% group_by(X) %>% summarise(lst= paste0(Y, collapse = ","))
# A tibble: 2 x 2
X lst
<fct> <chr>
1 a 26,3,24
2 b 8,1,4

group by X then summarise Y as list
library(dplyr)
out <- df %>%
group_by(X) %>%
summarise(Y = list(Y))
out
# A tibble: 2 x 2
# X Y
# <fct> <list>
#1 a <int [3]>
#2 b <int [3]>
The Y column now looks like this
out$Y
#[[1]]
#[1] 26 3 24
#
#[[2]]
#[1] 8 1 4
nest seems to be another option but this would result in a list column of tibbles (not what you want I think)
df %>%
group_by(X) %>%
nest()
# A tibble: 2 x 2
# X data
# <fct> <list>
#1 a <tibble [3 × 1]>
#2 b <tibble [3 × 1]>

A data.table solution:
library(data.table)
dt <- as.data.table(df)[, list(Y=list(Y)), by="X"]
> dt
X Y
1: a 26, 3,24
2: b 8,1,4
> dt$Y
[[1]]
[1] 26 3 24
[[2]]
[1] 8 1 4

Related

Nest a tibble by column prefix

Normally, nesting is done by grouping rows. My case is different.
I want to create a nested tibble grouping by column prefixes (before the first '_'), preserving the original column names in the nested tibbles.
The current approach works but looks overcomplicated.
tibble(a_1=1:3, a_2=2:4, b_1=3:5) %>%
print() %>%
# A tibble: 3 x 3
# a_1 a_2 b_1
# <int> <int> <int>
# 1 1 2 3
# 2 2 3 4
# 3 3 4 5
pivot_longer(everything()) %>%
nest(data=-name) %>%
mutate(data=map2(data, name, ~rename(.x, '{.y}' := value))) %>%
mutate(gr=str_extract(name, '^[^_]+'), .keep='unused') %>%
nest(data=-gr) %>%
mutate(data=map(data, ~bind_cols(.[[1]]))) %>%
print() %>%
# A tibble: 2 x 2
# gr data
# <chr> <list>
# 1 a <tibble [3 x 2]>
# 2 b <tibble [3 x 1]>
{ .$data[[1]] }
# A tibble: 3 x 2
# a_1 a_2
# <int> <int>
# 1 1 2
# 2 2 3
# 3 3 4
UPD: if possible, tidyverse solution
Using a neat little trick I learned lately you could do:
library(tidyr)
library(dplyr, warn = FALSE)
tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5) %>%
split.default(., gsub("_[0-9]", "", names(.))) %>%
lapply(nest, data = everything()) %>%
bind_rows(.id = "gr")
#> # A tibble: 2 × 2
#> gr data
#> <chr> <list>
#> 1 a <tibble [3 × 2]>
#> 2 b <tibble [3 × 1]>
Another possible solution, based on purrr::map_dfr:
library(tidyverse)
map_dfr(unique(str_remove(names(df), "_\\d+")),
~ tibble(gr = .x, nest(select(df, which(str_detect(names(df), .x))),
data = everything())))
#> # A tibble: 2 × 2
#> gr data
#> <chr> <list>
#> 1 a <tibble [3 × 2]>
#> 2 b <tibble [3 × 1]>
my version, a little more modified, tidyversed version of stepan's answer
tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5) %>%
split.default(str_extract(names(.), "^[^_]+")) %>%
map(nest, data = everything()) %>%
bind_rows(.id = "gr")
Couldn't find an alternative to split.default()

How to extract first value from lists in data.frames columns?

This question is similar to R: How to extract a list from a dataframe?
But I could not implement it to my question in an easy way.
weird_df <- data_frame(col1 =c('hello', 'world', 'again'),col_weird = list(list(12,23), list(23,24), NA),col_weird2 = list(list(0,45), list(4,45),list(45,45.45,23)))
weird_df
# A tibble: 3 x 3
col1 col_weird col_weird2
<chr> <list> <list>
1 hello <list [2]> <list [2]>
2 world <list [2]> <list [2]>
3 again <lgl [1]> <list [3]>
>
In the columns col_weird and col_weird2, I want to display only the first value of each list.
col1 col_weird col_weird2
1 hello 12 0
2 world 23 4
3 again NA 45
My real problem has a lot of columns. I tried this (an altered version of the accepted answer in the linked question):
library(tidyr)
library(purrr)
weird_df %>%
mutate(col_weird = map(c(col_weird,col_weird2), toString ) ) %>%
separate(col_weird, into = c("col1"), convert = TRUE) %>%
separate(col_weird2, into = c("col2",convert = T)
One solution would be to write a simple function that extracts the first value from each list in a vector of lists. You can then apply this function to the relevant columns in your data frame.
library(tibble)
#create data
weird_df <- tibble(col1 =c('hello', 'world', 'again'),
col_weird = list(list(12,23), list(23,24), NA),
col_weird2 = list(list(0,45), list(4,45), list(45,45.45,23)))
#function to extract first values from a vector of lists
fnc <- function(x) {
  # Pull the first element out of every entry of `x` (a list whose entries
  # are themselves lists or vectors) and let sapply() simplify the result.
  first_of <- function(entry) {
    entry[[1]]
  }
  sapply(x, FUN = first_of)
}
#apply function to the relevant columns
weird_df[,2:3] <- apply(weird_df[,2:3], MARGIN = 2, FUN = fnc)
weird_df
# A tibble: 3 x 3
col1 col_weird col_weird2
<chr> <dbl> <dbl>
1 hello 12 0
2 world 23 4
3 again NA 45
Here is a dplyr solution
library(dplyr)
weird_df %>% mutate(across(c(col_weird, col_weird2), ~vapply(., `[[`, numeric(1L), 1L)))
Output
# A tibble: 3 x 3
col1 col_weird col_weird2
<chr> <dbl> <dbl>
1 hello 12 0
2 world 23 4
3 again NA 45

In R, a column of my dataframe is populated with other dataframes. I want to return a specific value as a new column in the original dataframe

I downloaded a data set from the web. It's got 6 columns, and the 6th column is filled with other dataframes. So, an example:
id homeTeam homeScore awayTeam away stats
401112436 Louisville 17 Notre Dame 35 <data.frame [4 × 4]>
401112114 Oklahoma 49 Houston 31 <data.frame [4 × 4]>
401114218 USC 31 Fresno State 23 <data.frame [4 × 4]>
I want to create a new column in the original dataframe with the value in row 1, column 2 of the "stats" dataframe for each row.
I added a row_id column with the row number, and tried
df$new_col <- df$stats[[df$row_id]][1,2]
but I'm getting a recursive error. When I hard code a number
df$stats[[1]][1,2]
it returns the correct number. I don't know why it wouldn't work with the row_id value just the same.
We can use pluck from purrr
library(dplyr)
library(purrr)
df %>% mutate(new_col = map_dbl(stats, pluck, 2, 1))
Using a reproducible example :
temp <- data.frame(a = 1:4, b = 2:5)
df <- tibble(a = 1:2, b = 6:7, c = list(temp, temp))
df %>% mutate(new_col = map_dbl(c, purrr::pluck, 2, 1))
# a b c new_col
# <int> <int> <list> <dbl>
#1 1 6 <df[,2] [4 × 2]> 2
#2 2 7 <df[,2] [4 × 2]> 2
With map, we loop over the 'stats' column, extract the second column, first element to create the 'new_col' in mutate and unnest the list element
library(purrr)
library(dplyr)
library(tidyr)
df <- df %>%
mutate(new_col = map(stats, ~ .x[[2]][1])) %>%
unnest(c(new_col))
df
# A tibble: 2 x 4
# a b stats new_col
# <int> <int> <list> <int>
#1 1 6 <df[,2] [4 × 2]> 2
#2 2 7 <df[,2] [4 × 2]> 2
If the column is character, use map_chr, if it is double, use map_dbl or if we don't know the type, then simply use map to return a list column and then unnest
Or in base R
df$new_col <- sapply(df$stats, function(x) x[[2]][1])
data
temp <- data.frame(a = 1:4, b = 2:5)
df <- tibble(a = 1:2, b = 6:7, stats = list(temp, temp))

R: Apply function over nested lists with varying length

I have the following dataframe:
df <- data.frame(id = paste0('id', sample(c(1:4),80000, replace = TRUE)), date = as.Date(rbeta(80000, 0.7, 10) * 100, origin = "2016-01-01"),
variant = sample(c(0:1), 80000, replace = TRUE), type = sample(paste0(LETTERS[1:3],LETTERS[1]), 80000, TRUE), code = sample(letters[1:2], 80000, TRUE),
level = sample(LETTERS[1:8], 80000, TRUE), number = sample(c(1:100), 80000, replace = TRUE) )
Next, I split the dataframe several times and combine them (plus the original df) in a list:
dfs <- split(df,df$id)
df2 <- lapply(dfs, function(x) split(x,x$type))
df3 <- lapply(dfs, function(x) split(x,x$code))
df4 <- lapply(dfs, function(x) split(x,x$level))
df_all <- list(dfs,df2,df3,df4)
Thus, I first split the dataframe by id, after which the pieces are split further on several conditions: none, type, code and level. Here "none" means that I don't split it any further.
My first question: is there a faster/cleaner way to achieve this?
Second question: how do I apply a function to each element of this list? It will probably have something to do with lapply, but I can't figure out how, as the number of nested lists varies. To make it clearer, I would like to know how to apply my function to:
df_all[[1]]$id1
df_all[[1]]$id2
df_all[[1]]$id3
df_all[[1]]$id4
df_all[[2]]$id1$AA
df_all[[2]]$id1$BA
df_all[[2]]$id1$CA
df_all[[2]]$id2$AA
etc.
My function is as follows:
# Sum `number` within each `variant` group of the data frame `x`.
# Returns one row per `variant` with the total in column `H`, ungrouped.
# NOTE(review): relies on dplyr (`%>%`, group_by, summarise, ungroup) being
# attached by the caller - confirm a library(dplyr) call precedes this.
func <- function(x){
  # The pipeline is the function's value; no need to reassign `x` first
  # (the assignment made the result return invisibly).
  x %>%
    group_by(variant) %>%
    summarise(H = sum(number)) %>%
    ungroup()
}
If all you wanted to do is group by different combination of variables and summarize, then splitting the groups is probably not a good idea, just modify the function so that you can input different combinations of group by variables like the following:
library(dplyr)
# Sum `number` in `x` for every combination of the grouping columns
# supplied through `...` (bare column names), storing the total in `H`.
func2 <- function(x, ...) {
  # Capture the grouping columns unevaluated, then splice them into group_by().
  grouping <- quos(...)
  grouped <- group_by(x, !!!grouping)
  summarize(grouped, H = sum(number))
}
Result:
> func2(df, id, variant)
# A tibble: 8 x 3
# Groups: id [?]
id variant H
<fct> <int> <int>
1 id1 0 500192
2 id1 1 508282
3 id2 0 505829
4 id2 1 511855
5 id3 0 502280
6 id3 1 510854
7 id4 0 502621
8 id4 1 510372
> func2(df, id, type, variant)
# A tibble: 24 x 4
# Groups: id, type [?]
id type variant H
<fct> <fct> <int> <int>
1 id1 AA 0 167757
2 id1 AA 1 169025
3 id1 BA 0 166225
4 id1 BA 1 168208
5 id1 CA 0 166210
6 id1 CA 1 171049
7 id2 AA 0 169277
8 id2 AA 1 172240
9 id2 BA 0 168596
10 id2 BA 1 169396
# ... with 14 more rows
etc.
If you're trying to apply something more complex or you want to keep the hierarchical structure of the lists, you can try to use nested data.frames:
library(dplyr)
library(tidyr)
library(purrr)
# Total `number` per `variant` in `x`; the total is returned in column `H`.
func <- function(x) {
  by_variant <- group_by(x, variant)
  summarize(by_variant, H = sum(number))
}
df_nested = df %>%
group_by(id) %>%
nest() %>%
mutate(df1 = data %>% map(func),
df2 = data %>% map(~group_by(., type) %>% nest()),
df3 = data %>% map(~group_by(., code) %>% nest()),
df4 = data %>% map(~group_by(., level) %>% nest())) %>%
mutate_at(vars(df2:df4),
funs(map(., function(x) mutate(x, data = map(data, func)) %>% unnest)))
Result:
> df_nested
# A tibble: 4 x 6
id data df1 df2 df3 df4
<fct> <list> <list> <list> <list> <list>
1 id1 <tibble [19,963 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
2 id3 <tibble [19,946 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
3 id2 <tibble [20,114 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
4 id4 <tibble [19,977 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
> df_nested %>%
+ select(id, data) %>%
+ unnest()
# A tibble: 80,000 x 7
id date variant type code level number
<fct> <date> <int> <fct> <fct> <fct> <int>
1 id1 2016-01-05 1 AA b H 71
2 id1 2016-01-01 0 CA a G 85
3 id1 2016-01-03 0 CA a E 98
4 id1 2016-01-01 1 BA b E 78
5 id1 2016-01-01 1 BA b G 64
6 id1 2016-01-18 1 AA a E 69
7 id1 2016-01-04 1 BA b E 12
8 id1 2016-01-02 0 CA b B 32
9 id1 2016-01-01 1 CA a B 44
10 id1 2016-01-02 0 BA a F 89
# ... with 79,990 more rows
> df_nested %>%
+ select(id, df1) %>%
+ unnest()
# A tibble: 8 x 3
id variant H
<fct> <int> <int>
1 id1 0 500192
2 id1 1 508282
3 id3 0 502280
4 id3 1 510854
5 id2 0 505829
6 id2 1 511855
7 id4 0 502621
8 id4 1 510372

Creating tibble or data frame of tibbles or data frames and other class

Is it possible to create a tibble or data.frame, which has columns that are integers and other columns that are tibbles or data.frames?
E.g.:
library(tibble)
set.seed(1)
df.1 <- tibble(name=sample(LETTERS,20,replace = F),score=sample(1:100,20,replace = F))
df.2 <- tibble(name=sample(LETTERS,20,replace = F),score=sample(1:100,20,replace = F))
And then:
df <- tibble(id=1,rank=2,data=df.1)
which gives this error:
Error: Column `data` must be a 1d atomic vector or a list
I guess df.1 has to be a list for this to work?
Is this what you are looking for? I think the key is the length of each column should be the same, and we need to use list to create a list column to store df.1 and df.2.
df <- tibble(id = 1:2,
rank = 2,
data = list(df.1, df.2))
df
# # A tibble: 2 x 3
# id rank data
# <int> <dbl> <list>
# 1 1 2 <tibble [20 x 2]>
# 2 2 2 <tibble [20 x 2]>
head(df$data[[1]])
# # A tibble: 6 x 2
# name score
# <chr> <int>
# 1 G 94
# 2 J 22
# 3 N 64
# 4 U 13
# 5 E 26
# 6 S 37
head(df$data[[2]])
# # A tibble: 6 x 2
# name score
# <chr> <int>
# 1 V 92
# 2 Q 30
# 3 S 45
# 4 M 33
# 5 L 63
# 6 Y 25
And since the structure of each tibble in the data column is the same, we can use tidyr::unnest to expand the tibble.
library(tidyr)
df_un <- unnest(df)
# # A tibble: 40 x 4
# id rank name score
# <int> <dbl> <chr> <int>
# 1 1 2 G 94
# 2 1 2 J 22
# 3 1 2 N 64
# 4 1 2 U 13
# 5 1 2 E 26
# 6 1 2 S 37
# 7 1 2 W 2
# 8 1 2 M 36
# 9 1 2 L 81
# 10 1 2 B 31
# # ... with 30 more rows
And we can also nest the tibble, making it back to the original format with a list column.
library(dplyr)
df_n <- df_un %>%
group_by(id, rank) %>%
nest() %>%
ungroup()
df_n
# # A tibble: 2 x 3
# id rank data
# <int> <dbl> <list>
# 1 1 2 <tibble [20 x 2]>
# 2 2 2 <tibble [20 x 2]>
# Check if df and df_n are the same
identical(df_n, df)
# [1] TRUE
Using tidyr's nest:
set.seed(1)
df.1 <- data.frame(name=sample(LETTERS,20,replace = F),score=sample(1:100,20,replace = F))
df.2 <- data.frame(name=sample(LETTERS,20,replace = F),score=sample(1:100,20,replace = F))
I can create a tibble where df.1 is nested under id and rank:
library(dplyr)
library(tidyr)
data.frame(id=1,rank=2,data=df.1) %>% nest(-id,-rank)
# A tibble: 1 × 3
id rank data
<dbl> <dbl> <list>
1 1 2 <tibble [20 × 2]>
For having both df.1 and df.2 in a tibble, I'd simply do:
data.frame(id=c(1,2),rank=c(2,1),data=c(df.1,df.2)) %>% nest(-id,-rank)
# A tibble: 2 × 3
id rank data
<dbl> <dbl> <list>
1 1 2 <tibble [10 × 4]>
2 2 1 <tibble [10 × 4]>

Resources