`purrr::map` to any type - r

Is there a way to map to any type with purrr::map
library(tidyverse)
library(lubridate)
df <- data_frame(id = c(1, 1, 1, 2, 2, 2),
val = c(1, 2, 3, 1, 2, 3),
date = ymd("2017-01-01") + days(1:6))
df1 <- df %>% nest(-id) %>%
mutate(first_val = map_dbl(data, ~ .$val[1]),
first_day = map(data, ~ .$date[1]))
I would like first_day to be a column of type <date> as in df. I have tried flatten, but this does not work as it coerces the column to numeric.

purrr is type-stable and this takes some getting used to.
In this case, it returns a list where you expect a <date>.
A simple and "stable" solution to you case would be to replace the second map with a map_dbl and have the output turned back to a <date> object using lubridate's as_date, like this:
df3 <- df %>% nest(-id) %>%
mutate(first_val = map_dbl(data, ~ .$val[1]),
first_day = as_date(map_dbl(data, ~ .$date[1])))
You get:
# A tibble: 2 × 4
id data first_val first_day
<dbl> <list> <dbl> <date>
1 <tibble [3 × 2]> 1 2017-01-02
2 <tibble [3 × 2]> 1 2017-01-05
Which is what you wanted (for this example).
EDIT: for any other types (other than <date>) you would have to find a different solution, however, the standard types are covered by the dedicated map_lgl, map_dbl, map_chr, etc.

An alternative to the map_dbl() %>% as_date() is to use unnest() on the output column of interest:
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
df <- data_frame(id = c(1, 1, 1, 2, 2, 2),
val = c(1, 2, 3, 1, 2, 3),
date = ymd("2017-01-01") + days(1:6))
df %>% nest(-id) %>%
mutate(first_val = map_dbl(data, ~ .$val[1]),
first_day = map(data, ~ .$date[1])) %>%
unnest(first_day)
#> # A tibble: 2 x 4
#> id data first_val first_day
#> <dbl> <list> <dbl> <date>
#> 1 1 <tibble [3 × 2]> 1 2017-01-02
#> 2 2 <tibble [3 × 2]> 1 2017-01-05
Created on 2018-11-17 by the reprex package (v0.2.1)

With purrr 1.0.0, you can use map_vec.
map_vec() (along with map2_vec(), and pmap_vec()) handles more
types of vectors. map_vec() extends map_lgl(), map_int(),
map_dbl(), and map_chr() to arbitrary types of vectors, like
dates, factors, and date-times:
df %>% nest(data = -id) %>%
mutate(first_val = map_dbl(data, ~ .$val[1]),
first_day = map_vec(data, ~ .$date[1]))
output
# A tibble: 2 × 4
id data first_val first_day
<dbl> <list> <dbl> <date>
1 1 <tibble [3 × 2]> 1 2017-01-02
2 2 <tibble [3 × 2]> 1 2017-01-05
map_vec will always return a simpler vector with the correct vector class (erroring if there is no common type), but you can also specify it with .ptype:
df %>% nest(data = -id) %>%
mutate(first_val = map_vec(data, ~ .$val[1], .ptype = integer()),
first_day = map_vec(data, ~ .$date[1], .ptype = Date()))

You could rely on purrr's reduce() with c():
library(tidyverse)
library(lubridate)
df <- tibble(id = c(1, 1, 1, 2, 2, 2),
val = c(1, 2, 3, 1, 2, 3),
date = ymd("2017-01-01") + days(1:6))
df1 <- df %>% nest(data = -id) %>%
mutate(first_val = map_dbl(data, ~ .$val[1]),
first_day = reduce(map(data, ~ .$date[1]), c))
Result:
> df1
# A tibble: 2 × 4
id data first_val first_day
<dbl> <list> <dbl> <date>
1 1 <tibble [3 × 2]> 1 2017-01-02
2 2 <tibble [3 × 2]> 1 2017-01-05

Related

Nest a tibble by column prefix

We do a normal nesting grouping by rows. Mine is different.
I want to create a nested tibble grouping by column prefixes (before the first '_'), preserving the original column names in the nested tibbles.
The current approach works but looks overcomplicated.
tibble(a_1=1:3, a_2=2:4, b_1=3:5) %>%
print() %>%
# A tibble: 3 x 3
# a_1 a_2 b_1
# <int> <int> <int>
# 1 1 2 3
# 2 2 3 4
# 3 3 4 5
pivot_longer(everything()) %>%
nest(data=-name) %>%
mutate(data=map2(data, name, ~rename(.x, '{.y}' := value))) %>%
mutate(gr=str_extract(name, '^[^_]+'), .keep='unused') %>%
nest(data=-gr) %>%
mutate(data=map(data, ~bind_cols(.[[1]]))) %>%
print() %>%
# A tibble: 2 x 2
# gr data
# <chr> <list>
# 1 a <tibble [3 x 2]>
# 2 b <tibble [3 x 1]>
{ .$data[[1]] }
# A tibble: 3 x 2
# a_1 a_2
# <int> <int>
# 1 1 2
# 2 2 3
# 3 3 4
UPD: if possible, tidyverse solution
Using a neat little trick I learned lately you could do:
library(tidyr)
library(dplyr, warn = FALSE)
tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5) %>%
split.default(., gsub("_[0-9]", "", names(.))) %>%
lapply(nest, data = everything()) %>%
bind_rows(.id = "gr")
#> # A tibble: 2 × 2
#> gr data
#> <chr> <list>
#> 1 a <tibble [3 × 2]>
#> 2 b <tibble [3 × 1]>
Another possible solution, based on purrr::map_dfr:
library(tidyverse)
map_dfr(unique(str_remove(names(df), "_\\d+")),
~ tibble(gr = .x, nest(select(df, which(str_detect(names(df), .x))),
data = everything())))
#> # A tibble: 2 × 2
#> gr data
#> <chr> <list>
#> 1 a <tibble [3 × 2]>
#> 2 b <tibble [3 × 1]>
my version, a little more modified, tidyversed version of stepan's answer
tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5) %>%
split.default(str_extract(names(.), "^[^_]+")) %>%
map(nest, data = everything()) %>%
bind_rows(.id = "gr")
Couldn't find an alternative to split.default()

Calculate correlation for two data frames for all columns after group_by in R

Sample data:
A <- data.frame(region = c("US","US", "UK","UK","AUS","AUS"), a = c(1,2,3,4,5,8), b = c(4,5,6,7,8,2), c = c(9,6,5,43,2,5))
B <- data.frame(region = c("US","US", "UK","UK","AUS","AUS"),a = c(7,4,3,6,9,81), b = c(9,4,3,7,0,35), c = c(22,5,6,2,9,33))
Expected output:
(x is the correlation for the column between two data frames in the region)
I have tried:
Binding two data frames into one and calculate correlation between two columns in one data frame. It is a bit tedious to type every column names, which also creates too many columns. Is there a simpler way to do this?
If my understanding is not off, then here is a solution using dplyr and tidyr.
library(dplyr)
library(tidyr)
rbind(cbind(set = "A", A), cbind(set = "B", B)) %>%
pivot_longer(-c(set, region)) %>%
group_by(region, name) %>%
summarise(value = cor(value[set == "A"], value[set == "B"]), .groups = "drop") %>%
pivot_wider()
Output
# A tibble: 3 x 4
region a b c
<chr> <dbl> <dbl> <dbl>
1 AUS 1 -1 1
2 UK 1 1 -1
3 US -1 -1 1
This is a little convoluted but it's an alternative way to do it.
library(tidyverse)
A <- data.frame(region = c("US","US", "UK","UK","AUS","AUS"), a = c(1,2,3,4,5,8), b = c(4,5,6,7,8,2), c = c(9,6,5,43,2,5))
B <- data.frame(region = c("US","US", "UK","UK","AUS","AUS"),a = c(7,4,3,6,9,81), b = c(9,4,3,7,0,35), c = c(22,5,6,2,9,33))
(df <- map(list(A, B), ~nest_by(.x, region)) %>%
reduce(inner_join, by = 'region'))
#> # A tibble: 3 × 3
#> # Rowwise: region
#> region data.x data.y
#> <chr> <list<tibble[,3]>> <list<tibble[,3]>>
#> 1 AUS [2 × 3] [2 × 3]
#> 2 UK [2 × 3] [2 × 3]
#> 3 US [2 × 3] [2 × 3]
bind_cols(select(df, region), map2_dfr(df$data.x, df$data.y, ~map2_dfc(.x, .y, ~cor(.x, .y))))
#> # A tibble: 3 × 4
#> # Rowwise: region
#> region a b c
#> <chr> <dbl> <dbl> <dbl>
#> 1 AUS 1 -1 1
#> 2 UK 1 1 -1
#> 3 US -1 -1 1
Created on 2022-01-06 by the reprex package (v2.0.1)

Unnest a list-column of tibbles with different data type (cannot combine double and character)

I would like to unnest a nested tibble, however, I get an error.
Example data:
library(tidyverse)
df <- tribble(
~x, ~y,
1, tibble(a=1, b=2),
2, tibble(a=4:5, b=c("thank","you"),c=1:2))
df
#> # A tibble: 2 x 2
#> x y
#> <dbl> <list>
#> 1 1 <tibble [1 x 2]>
#> 2 2 <tibble [2 x 3]>
df %>% unnest(y)
#> Error: Can't combine `..1$b` <double> and `..2$b` <character>.
Created on 2021-11-03 by the reprex package (v2.0.1)
I think I have to change the data type of all tibbles listed in y to character, but I got stuck with that.
Maybe the following?
library(tidyverse)
df <- tribble(
~x, ~y,
1, tibble(a=1, b=2),
2, tibble(a=4:5, b=c("thank","you"),c=1:2))
df %>%
mutate(y = map(y, ~ mutate(.x, b = as.character(b)))) %>%
unnest(cols = c(y))
#> # A tibble: 3 × 4
#> x a b c
#> <dbl> <dbl> <chr> <int>
#> 1 1 1 2 NA
#> 2 2 4 thank 1
#> 3 2 5 you 2
There might be a more elegant way but this works. Turn the tribble into characters and then you can combine. You can most likely just change column 'b' to the same class and it will be fine.
library(tidyverse)
df <- tribble(
~x, ~y,
1, tibble(a=1, b=2),
2, tibble(a=4:5, b=c("thank","you"),c=1:2))
df$y[[1]] <- purrr::map_df(df$y[[1]], as.character)
df$y[[2]] <- purrr::map_df(df$y[[2]], as.character)
df %>% unnest(y)
Starting with the original data all you really need to do is change the class of b in the first nested tibble to character.
df$y[[1]]$b <- as.character(df$y[[1]]$b )
df %>% unnest(y)

How to get grouped Sum with nest in one step

I am unable to get grouped sum in one single step using nest but in 2 steps. How can I use map to loop over data column in the output of nest(). Also suggest a way to include the output column in the existing dataframe.
suppressWarnings(library(tidyverse))
tmp_df <-
data.frame(group = rep(c(2L, 1L), each = 5), b = rep(c(-1, 1), each = 5))
tmp_df1 = tmp_df %>% group_by(group) %>% nest() #step1
map(tmp_df1$data, sum) #step 2
#> [[1]]
#> [1] -5
#>
#> [[2]]
#> [1] 5
I know how to get the sum using group_by.
suppressWarnings(library(tidyverse))
tmp_df <-
data.frame(group = rep(c(2L, 1L), each = 5), b = rep(c(-1, 1), each = 5))
tmp_df %>%
group_by(group) %>%
summarise(sum = sum(b))
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 2 x 2
#> group sum
#> <int> <dbl>
#> 1 1 5
#> 2 2 -5
Created on 2020-08-04 by the reprex package (v0.3.0)
We can use c_across
library(dplyr)
tmp_df %>%
nest_by(group) %>%
mutate(sum = sum(c_across(data)))
-output
# A tibble: 2 x 3
# Rowwise: group
# group data sum
# <int> <list<tbl_df[,1]>> <dbl>
#1 1 [5 × 1] 5
#2 2 [5 × 1] -5
Or just
tmp_df %>%
nest_by(group) %>%
mutate(sum = sum(data))
If you want to use nest you can try map_dbl :
library(tidyverse)
tmp_df %>%
group_by(group) %>%
nest() %>%
mutate(sum = map_dbl(data, ~.x %>% sum))
# group data sum
# <int> <list> <dbl>
#1 2 <tibble [5 × 1]> -5
#2 1 <tibble [5 × 1]> 5

Why do group_by and group_by_ give different answers when summarizing by two variables?

In the following example, I want to create a summary statistic by two variables. When I do it with dplyr::group_by, I get the correct answer, by when I do it with dplyr::group_by_, it summarizes one level more than I want it to.
library(dplyr)
set.seed(919)
df <- data.frame(
a = c(1, 1, 1, 2, 2, 2),
b = c(3, 3, 4, 4, 5, 5),
x = runif(6)
)
# Gives correct answer
df %>%
group_by(a, b) %>%
summarize(total = sum(x))
# Source: local data frame [4 x 3]
# Groups: a [?]
#
# a b total
# <dbl> <dbl> <dbl>
# 1 1 3 1.5214746
# 2 1 4 0.7150204
# 3 2 4 0.1234555
# 4 2 5 0.8208454
# Wrong answer -- too many levels summarized
df %>%
group_by_(c("a", "b")) %>%
summarize(total = sum(x))
# # A tibble: 2 × 2
# a total
# <dbl> <dbl>
# 1 1 2.2364950
# 2 2 0.9443009
What's going on?
If you want to use a vector of variable names, you can pass it to .dots parameter as:
df %>%
group_by_(.dots = c("a", "b")) %>%
summarize(total = sum(x))
#Source: local data frame [4 x 3]
#Groups: a [?]
# a b total
# <dbl> <dbl> <dbl>
#1 1 3 1.5214746
#2 1 4 0.7150204
#3 2 4 0.1234555
#4 2 5 0.8208454
Or you can use it in the same way as you would do in NSE way:
df %>%
group_by_("a", "b") %>%
summarize(total = sum(x))
#Source: local data frame [4 x 3]
#Groups: a [?]
# a b total
# <dbl> <dbl> <dbl>
#1 1 3 1.5214746
#2 1 4 0.7150204
#3 2 4 0.1234555
#4 2 5 0.8208454

Resources