I am unable to get grouped sum in one single step using nest but in 2 steps. How can I use map to loop over data column in the output of nest(). Also suggest a way to include the output column in the existing dataframe.
suppressWarnings(library(tidyverse))
tmp_df <-
data.frame(group = rep(c(2L, 1L), each = 5), b = rep(c(-1, 1), each = 5))
tmp_df1 = tmp_df %>% group_by(group) %>% nest() #step1
map(tmp_df1$data, sum) #step 2
#> [[1]]
#> [1] -5
#>
#> [[2]]
#> [1] 5
I know how to get the sum using group_by.
suppressWarnings(library(tidyverse))
tmp_df <-
data.frame(group = rep(c(2L, 1L), each = 5), b = rep(c(-1, 1), each = 5))
tmp_df %>%
group_by(group) %>%
summarise(sum = sum(b))
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 2 x 2
#> group sum
#> <int> <dbl>
#> 1 1 5
#> 2 2 -5
Created on 2020-08-04 by the reprex package (v0.3.0)
We can use c_across
library(dplyr)
tmp_df %>%
nest_by(group) %>%
mutate(sum = sum(c_across(data)))
-output
# A tibble: 2 x 3
# Rowwise: group
# group data sum
# <int> <list<tbl_df[,1]>> <dbl>
#1 1 [5 × 1] 5
#2 2 [5 × 1] -5
Or just
tmp_df %>%
nest_by(group) %>%
mutate(sum = sum(data))
If you want to use nest you can try map_dbl :
library(tidyverse)
tmp_df %>%
group_by(group) %>%
nest() %>%
mutate(sum = map_dbl(data, ~.x %>% sum))
# group data sum
# <int> <list> <dbl>
#1 2 <tibble [5 × 1]> -5
#2 1 <tibble [5 × 1]> 5
Related
We do a normal nesting grouping by rows. Mine is different.
I want to create a nested tibble grouping by column prefixes (before the first '_'), preserving the original column names in the nested tibbles.
The current approach works but looks overcomplicated.
tibble(a_1=1:3, a_2=2:4, b_1=3:5) %>%
print() %>%
# A tibble: 3 x 3
# a_1 a_2 b_1
# <int> <int> <int>
# 1 1 2 3
# 2 2 3 4
# 3 3 4 5
pivot_longer(everything()) %>%
nest(data=-name) %>%
mutate(data=map2(data, name, ~rename(.x, '{.y}' := value))) %>%
mutate(gr=str_extract(name, '^[^_]+'), .keep='unused') %>%
nest(data=-gr) %>%
mutate(data=map(data, ~bind_cols(.[[1]]))) %>%
print() %>%
# A tibble: 2 x 2
# gr data
# <chr> <list>
# 1 a <tibble [3 x 2]>
# 2 b <tibble [3 x 1]>
{ .$data[[1]] }
# A tibble: 3 x 2
# a_1 a_2
# <int> <int>
# 1 1 2
# 2 2 3
# 3 3 4
UPD: if possible, tidyverse solution
Using a neat little trick I learned lately you could do:
library(tidyr)
library(dplyr, warn = FALSE)
tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5) %>%
split.default(., gsub("_[0-9]", "", names(.))) %>%
lapply(nest, data = everything()) %>%
bind_rows(.id = "gr")
#> # A tibble: 2 × 2
#> gr data
#> <chr> <list>
#> 1 a <tibble [3 × 2]>
#> 2 b <tibble [3 × 1]>
Another possible solution, based on purrr::map_dfr:
library(tidyverse)
map_dfr(unique(str_remove(names(df), "_\\d+")),
~ tibble(gr = .x, nest(select(df, which(str_detect(names(df), .x))),
data = everything())))
#> # A tibble: 2 × 2
#> gr data
#> <chr> <list>
#> 1 a <tibble [3 × 2]>
#> 2 b <tibble [3 × 1]>
my version, a little more modified, tidyversed version of stepan's answer
tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5) %>%
split.default(str_extract(names(.), "^[^_]+")) %>%
map(nest, data = everything()) %>%
bind_rows(.id = "gr")
Couldn't find an alternative to split.default()
I have a list of data frames, each with two columns named "place" and "data".
"place" is a character and "data" is a nested data frame with one numeric column named "value".
For each data frame from the list, I'd like to rename the "value" column of the nested data frame with the value of "place" column.
library(tidyverse)
some_dt = tibble(place = c("a","a", "b","b","c","c"),
value = c(1,2,1,4,5,6))
# here is a list of data frames...
ls_df <-
some_dt %>%
group_by(place) %>%
nest() %>%
split(.$place)
I'm tried:
map2(ls_df$data,
ls_df$place,
~rename(.x, .y = "value"))
or:
map2(ls_df$data,
ls_df$place,
~rename_with(.x, ~ .y, "value"))
but I'm getting an empty list as result.
How can I rename the "value" column with the content of the outer data frame column?
We may loop over the list ('ls_df') with map, extract the 'place' column and then rename the extracted 'data' column with the 'place' value
library(dplyr)
library(purrr)
ls_df2 <- map(ls_df, ~ {
nm <- .x$place
.x$data[[1]] <- .x$data[[1]] %>%
rename_with(~ nm, "value")
.x
})
-checking
> map(ls_df2, ~ .x$data)
$a
$a[[1]]
# A tibble: 2 × 1
a
<dbl>
1 1
2 2
$b
$b[[1]]
# A tibble: 2 × 1
b
<dbl>
1 1
2 4
$c
$c[[1]]
# A tibble: 2 × 1
c
<dbl>
1 5
2 6
Note that when we are splitting the data, it returns a list. Therefore, we cannot access the columns 'data' directly i.e
> ls_df$data
NULL
> ls_df$place
NULL
Or another option is
some_dt %>%
nest_by(place) %>%
mutate(data = data %>%
rename_with(~ place, value) %>%
list(.)) %>%
ungroup
# A tibble: 3 × 2
place data
<chr> <list>
1 a <tibble [2 × 1]>
2 b <tibble [2 × 1]>
3 c <tibble [2 × 1]>
You can also iterate over each list element and then use mutate to rename the nested data frame using the place.
ls_df %>%
modify(~ mutate(.x,
data = map(data,
~ set_names(.x, first(place)))))
In this case, you can actually simplify this further.
ls_df %>%
modify(~ mutate(.x,
data = map2(data, place, set_names)))
# which can collapse down to as simple as this
ls_df %>%
modify(mutate, data = map2(data, place, set_names))
With that approach, you can actually consider whether you actually need the list. The nested tibble may be easier to work with directly.
ls_df %>%
bind_rows() %>%
mutate(data = map2(data, place, set_names))
You could also try something like this:
library(tidyverse)
map(ls_df,
~ map2(.x$place,
.x$data,
~rename(.y,
!!sym(.x) := value)
)
)
#> $a
#> $a[[1]]
#> # A tibble: 2 x 1
#> a
#> <dbl>
#> 1 1
#> 2 2
#>
#>
#> $b
#> $b[[1]]
#> # A tibble: 2 x 1
#> b
#> <dbl>
#> 1 1
#> 2 4
#>
#>
#> $c
#> $c[[1]]
#> # A tibble: 2 x 1
#> c
#> <dbl>
#> 1 5
#> 2 6
You can create a function which renames using base colnames() then map that over all the list elements as follows:
# The fn:
rnm <- function(x) {
colnames(x$data[[1]]) <- x$place
x
}
# Result:
res <- ls_df |> purrr::map(.f = rnm)
# Check if it's the desired output:
res$a$data
# [[1]]
# A tibble: 2 × 1
# a
# <dbl>
# 1 1
# 2 2
This is a follow-up to this question.
I need to be able to group_by() columns in my new nested table. I can't find a purrr function that is does this (although I know a solution exists). I need to group_by in each table to apply additional summarizing functions and fit linear models appropriate. The example here is just a dummy example.
library(tidyverse)
set.seed(2)
N <- 30
df <- tibble(type = rep(c("small","medium","high"), each=N/3),
dummy = rep(c(1,5,10),each=10),
xvals = rep(1:10,3),
A = rnorm(N)*dummy,
B = rnorm(N)*dummy,
C = rnorm(N)*dummy) %>%
mutate(type = factor(type, levels=c("small","medium","high"))) %>%
select(-dummy) %>%
pivot_longer(cols=-c(type,xvals), names_to="metric", values_to = "value") %>%
group_by(type) %>%
group_nest(.key="data")
This produces a tibble with two columns:
df
# A tibble: 3 x 2
type data
<fct> <list>
1 small <tibble [30 x 3]>
2 medium <tibble [30 x 3]>
3 high <tibble [30 x 3]>
This is an example of what I want to do across all the nested tibbles:
df[[2]][[1]] %>%
group_by(metric) %>%
summarize(mean = mean(value))
# A tibble: 3 x 2
metric mean
<chr> <dbl>
1 A 0.211
2 B -0.296
3 C -0.391
After the group_nest, the 'data' is a list column of tibbles and there are only two columns 'type' and 'data'. If we need to create a grouping based on the list column, loop through the list with map and then do the group_by
library(dplyr)
library(tidyr)
library(purrr)
df %>%
mutate(data = map(data, ~ .x %>%
group_by(metric) %>%
summarize(mean = mean(value)))) -> out
out$data[[1]]
# A tibble: 3 x 2
# metric mean
# <chr> <dbl>
#1 A 0.115
#2 B 0.323
#3 C -0.326
NOTE: Output values will be different as there was not set seed
Is there a way to map to any type with purrr::map
library(tidyverse)
library(lubridate)
df <- data_frame(id = c(1, 1, 1, 2, 2, 2),
val = c(1, 2, 3, 1, 2, 3),
date = ymd("2017-01-01") + days(1:6))
df1 <- df %>% nest(-id) %>%
mutate(first_val = map_dbl(data, ~ .$val[1]),
first_day = map(data, ~ .$date[1]))
I would like first_day to be a column of type <date> as in df. I have tried flatten, but this does not work as it coerces the column to numeric.
purrr is type-stable and this takes some getting used to.
In this case, it returns a list where you expect a <date>.
A simple and "stable" solution to you case would be to replace the second map with a map_dbl and have the output turned back to a <date> object using lubridate's as_date, like this:
df3 <- df %>% nest(-id) %>%
mutate(first_val = map_dbl(data, ~ .$val[1]),
first_day = as_date(map_dbl(data, ~ .$date[1])))
You get:
# A tibble: 2 × 4
id data first_val first_day
<dbl> <list> <dbl> <date>
1 <tibble [3 × 2]> 1 2017-01-02
2 <tibble [3 × 2]> 1 2017-01-05
Which is what you wanted (for this example).
EDIT: for any other types (other than <date>) you would have to find a different solution, however, the standard types are covered by the dedicated map_lgl, map_dbl, map_chr, etc.
An alternative to the map_dbl() %>% as_date() is to use unnest() on the output column of interest:
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
df <- data_frame(id = c(1, 1, 1, 2, 2, 2),
val = c(1, 2, 3, 1, 2, 3),
date = ymd("2017-01-01") + days(1:6))
df %>% nest(-id) %>%
mutate(first_val = map_dbl(data, ~ .$val[1]),
first_day = map(data, ~ .$date[1])) %>%
unnest(first_day)
#> # A tibble: 2 x 4
#> id data first_val first_day
#> <dbl> <list> <dbl> <date>
#> 1 1 <tibble [3 × 2]> 1 2017-01-02
#> 2 2 <tibble [3 × 2]> 1 2017-01-05
Created on 2018-11-17 by the reprex package (v0.2.1)
With purrr 1.0.0, you can use map_vec.
map_vec() (along with map2_vec(), and pmap_vec()) handles more
types of vectors. map_vec() extends map_lgl(), map_int(),
map_dbl(), and map_chr() to arbitrary types of vectors, like
dates, factors, and date-times:
df %>% nest(data = -id) %>%
mutate(first_val = map_dbl(data, ~ .$val[1]),
first_day = map_vec(data, ~ .$date[1]))
output
# A tibble: 2 × 4
id data first_val first_day
<dbl> <list> <dbl> <date>
1 1 <tibble [3 × 2]> 1 2017-01-02
2 2 <tibble [3 × 2]> 1 2017-01-05
map_vec will always return a simpler vector with the correct vector class (erroring if there is no common type), but you can also specify it with .ptype:
df %>% nest(data = -id) %>%
mutate(first_val = map_vec(data, ~ .$val[1], .ptype = integer()),
first_day = map_vec(data, ~ .$date[1], .ptype = Date()))
You could rely on purrr's reduce() with c():
library(tidyverse)
library(lubridate)
df <- tibble(id = c(1, 1, 1, 2, 2, 2),
val = c(1, 2, 3, 1, 2, 3),
date = ymd("2017-01-01") + days(1:6))
df1 <- df %>% nest(data = -id) %>%
mutate(first_val = map_dbl(data, ~ .$val[1]),
first_day = reduce(map(data, ~ .$date[1]), c))
Result:
> df1
# A tibble: 2 × 4
id data first_val first_day
<dbl> <list> <dbl> <date>
1 1 <tibble [3 × 2]> 1 2017-01-02
2 2 <tibble [3 × 2]> 1 2017-01-05
I have this tibble which has a list column with vectors in them
df <- data_frame(grp = c("A", "A", "B", "B"),
x = rep(c(list(c(1,2,3)), list(c(4,5,6))), 2))
What I would like to do (preferably within tidyverse) is to perform element wise addition of the vectors inside the lists,
essentially:
c(1,2,3) + c(4,5,6)
# [1] 5 7 9
This:
# A tibble: 4 × 2
grp x
<chr> <list>
A list(c(1,2,3))
A list(c(4,5,6))
B list(c(1,2,3))
B list(c(4,5,6))
Becomes:
# A tibble: 2 × 2
grp y
<chr> <list>
A list(c(5,7,9))
B list(c(5,7,9))
What might be a good approach?
The following should also get you what you need:
dff %>% group_by(grp) %>%
summarise(x = list(Reduce("+",x))) %>%
ungroup()
I hope this helps.
We can try
library(dplyr)
library(tidyr)
r1 <- lengths(df$x)[1]
unnest(df) %>%
group_by(grp) %>%
mutate(grp1 = rep(seq(r1), 2)) %>%
group_by(grp1, add = TRUE) %>%
summarise(x = sum(x)) %>%
group_by(grp) %>%
summarise(x= list(x))
# A tibble: 2 × 2
# grp x
# <chr> <list>
#1 A <dbl [3]>
#2 B <dbl [3]>