I need to accumulate a value over nested tibbles.
Here is my simplified example:
# Build the example data and nest it into one tibble per ID.
# NOTE: group_by() followed by nest() leaves `tab` grouped by ID; the printed
# results show "# Groups: ID [2]".
tab <- data.frame(A = c("A","B","A","B"),
ID = c(1,1,2,2),
V1 = c(20,40,25,30),
V2 = c(0.2,0.8,0.3,0.7)
) %>%
group_by(ID) %>%
nest()
In my solution, the accumulate function always uses 1000 as the initial value instead of the value accumulated so far.
# Intended: carry the running total G_i from one nested tibble to the next,
# starting from 1000.
# NOTE(review): `tab` is still grouped by ID here, so mutate() appears to run
# accumulate() once per group, restarting from .init each time (hence 2030 and
# 2035 in the printed result instead of a carried-over total).
tab %>%
mutate(
G_i = purrr::accumulate(data,
function(G_i,data){
out <- data %>%
group_by(A) %>%
mutate(G_i = G_i+G_i*V2/V1)
sum(out$G_i)
},
.init = 1000)%>%
# tail(-1) drops the first element of accumulate()'s result
tail(-1)
)
A tibble: 2 × 3
# Groups: ID [2]
ID data G_i
<dbl> <list> <dbl>
1 1 <tibble [2 × 3]> 2030
2 2 <tibble [2 × 3]> 2035.
The desired output should deliver:
A tibble: 2 × 3
# Groups: ID [2]
ID data G_i
<dbl> <list> <dbl>
1 1 <tibble [2 × 3]> 2030
2 2 <tibble [2 × 3]> 4131.727
Thank you for your help.
As noted by #MikkoMarttila, all you need to do is ungroup prior to using accumulate.
library(tidyverse)
# Remove the ID grouping first so accumulate() walks the whole list-column and
# the running total carries over from one nested tibble to the next.
tab %>%
  ungroup() %>%
  mutate(
    G_i = purrr::accumulate(
      data,
      # `total` is the value accumulated so far, `rows` the current nested
      # tibble; every row contributes total + total * V2 / V1.
      function(total, rows) {
        sum(total + total * rows$V2 / rows$V1)
      },
      .init = 1000
    ) %>%
      tail(-1) # drop the leading .init so the length matches the row count
  )
Output
ID data G_i
<dbl> <list> <dbl>
1 1 <tibble [2 × 3]> 2030.000
2 2 <tibble [2 × 3]> 4131.727
Related
Normally we nest by grouping rows. My case is different.
I want to create a nested tibble grouping by column prefixes (before the first '_'), preserving the original column names in the nested tibbles.
The current approach works but looks overcomplicated.
# Nest columns by name prefix (the text before the first "_"), keeping the
# original column names inside each nested tibble. The commented lines are the
# output of the print() calls.
tibble(a_1=1:3, a_2=2:4, b_1=3:5) %>%
print() %>%
# A tibble: 3 x 3
# a_1 a_2 b_1
# <int> <int> <int>
# 1 1 2 3
# 2 2 3 4
# 3 3 4 5
# go long: one (name, value) pair per cell
pivot_longer(everything()) %>%
# one nested tibble per original column
nest(data=-name) %>%
# rename `value` back to the original column name
mutate(data=map2(data, name, ~rename(.x, '{.y}' := value))) %>%
# derive the prefix and drop `name`
mutate(gr=str_extract(name, '^[^_]+'), .keep='unused') %>%
# collect the per-column tibbles of each prefix
nest(data=-gr) %>%
# bind the per-column tibbles of a prefix side by side
mutate(data=map(data, ~bind_cols(.[[1]]))) %>%
print() %>%
# A tibble: 2 x 2
# gr data
# <chr> <list>
# 1 a <tibble [3 x 2]>
# 2 b <tibble [3 x 1]>
# extract the nested tibble of the first prefix
{ .$data[[1]] }
# A tibble: 3 x 2
# a_1 a_2
# <int> <int>
# 1 1 2
# 2 2 3
# 3 3 4
UPD: if possible, tidyverse solution
Using a neat little trick I learned lately you could do:
library(tidyr)
library(dplyr, warn = FALSE)
# Split the columns by name prefix (everything before the first "_"), nest
# each group of columns into a one-row tibble, and stack the groups with the
# prefix stored in `gr`.
tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5) %>%
  # sub("_.*$", "") keeps only the text before the first underscore, so
  # multi-digit or non-numeric suffixes (a_10, a_x) still fall into group "a".
  split.default(., sub("_.*$", "", names(.))) %>%
  lapply(nest, data = everything()) %>%
  bind_rows(.id = "gr")
#> # A tibble: 2 × 2
#> gr data
#> <chr> <list>
#> 1 a <tibble [3 × 2]>
#> 2 b <tibble [3 × 1]>
Another possible solution, based on purrr::map_dfr:
library(tidyverse)
# Example data from the question; the snippet needs `df` to exist.
df <- tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5)

# Prefix of every column: the text before the first underscore.
prefixes <- str_extract(names(df), "^[^_]+")

# For each distinct prefix, select its columns and nest them into one row.
map_dfr(
  unique(prefixes),
  # Compare prefixes exactly: a regex match on the bare prefix would also pick
  # up columns that merely contain it (e.g. "a" matching "ba_1").
  ~ tibble(
    gr = .x,
    nest(select(df, which(prefixes == .x)), data = everything())
  )
)
#> # A tibble: 2 × 2
#> gr data
#> <chr> <list>
#> 1 a <tibble [3 × 2]>
#> 2 b <tibble [3 × 1]>
My version: a slightly modified, more tidyverse-flavoured take on stepan's answer.
# Group the columns by the part of their name before the first "_", nest each
# group, and stack the results with the prefix in `gr`.
tibble(a_1 = 1:3, a_2 = 2:4, b_1 = 3:5) %>%
  split.default(., str_extract(names(.), "^[^_]+")) %>%
  map(~ nest(.x, data = everything())) %>%
  bind_rows(.id = "gr")
Couldn't find an alternative to split.default()
I am trying to create a new nested column using the data from the min and max values of another nested column.
If I nest the IRIS data by Species and want to create a new nested data frame by the min and max of the Petal.Length for each Species how would I do it?
My code so far: create a function that builds a new data.frame (or expand.grid), then apply it using mutate(...map(...
Code/Data:
# Builds a one-column data frame (`min_to_max`) running from the minimum to
# the maximum Petal.Length in steps of 1.
# NOTE(review): the parameter is named `input`, but the body reads `.x`, which
# is not defined inside this function.
func = function(input){
data.frame(
min_to_max = seq(
from = min(.x$Petal.Length),
to = max(.x$Petal.Length),
by = 1
)
)
}
# Nest iris by Species, then call func() on each nested tibble to build the
# expandDF list-column.
iris %>%
group_by(Species) %>%
nest() %>%
mutate(
expandDF = map(data, ~ func(.x))
)
The function body should use the argument name it declares, i.e. input and not .x
#' Build an evenly spaced grid spanning the range of Petal.Length
#'
#' @param input A data frame with a numeric `Petal.Length` column.
#' @param by Step between consecutive grid values; defaults to 1.
#' @return A data.frame with a single column, `min_to_max`, running from
#'   `min(input$Petal.Length)` towards `max(input$Petal.Length)` in steps of
#'   `by`.
func <- function(input, by = 1) {
  # Read the column once instead of once per bound.
  petal_length <- input$Petal.Length
  data.frame(
    min_to_max = seq(
      from = min(petal_length),
      to = max(petal_length),
      by = by
    )
  )
}
-testing
# One nested tibble per Species; func() is applied to each of them and the
# grouping left over from nest() is removed at the end.
iris %>%
  group_by(Species) %>%
  nest() %>%
  mutate(expandDF = map(data, func)) %>%
  ungroup()
-output
# A tibble: 3 × 3
Species data expandDF
<fct> <list> <list>
1 setosa <tibble [50 × 4]> <df [1 × 1]>
2 versicolor <tibble [50 × 4]> <df [3 × 1]>
3 virginica <tibble [50 × 4]> <df [3 × 1]>
We could also do this without using map i.e with nest_by
# Same result without map(): after nest_by(), `data` inside mutate() is the
# nested tibble of the current row, so the grid can be built directly.
iris %>%
  nest_by(Species) %>%
  mutate(
    expandDF = {
      petal_range <- range(data$Petal.Length)
      list(data.frame(
        min_to_max = seq(from = petal_range[1], to = petal_range[2])
      ))
    }
  ) %>%
  ungroup()
# A tibble: 3 × 3
Species data expandDF
<fct> <list<tibble[,4]>> <list>
1 setosa [50 × 4] <df [1 × 1]>
2 versicolor [50 × 4] <df [3 × 1]>
3 virginica [50 × 4] <df [3 × 1]>
Sample Data
# Named list mixing a tibble (`a`) and a plain data frame (`b`).
ex_list <- list(a = tibble(x = 1:4, y = 5:8),
b = mtcars)
How do I convert this list of tibbles/dataframes into a nested tibble as shown below:
# A tibble: 2 x 2
data_name data
<chr> <list>
1 a <tibble [4 × 2]>
2 b <df [32 × 11]>
Tidy solutions appreciated!
We may use enframe
library(tibble)
# enframe() turns the named list into a two-column tibble: the list names go
# to `name`, the elements to the list-column `value` (see the output below).
enframe(ex_list)
# A tibble: 2 x 2
name value
<chr> <list>
1 a <tibble [4 × 2]>
2 b <df [32 × 11]>
If we need to change the column names, use the name and value arguments
> enframe(ex_list, name = 'data_name', value = 'data')
# A tibble: 2 x 2
data_name data
<chr> <list>
1 a <tibble [4 × 2]>
2 b <df [32 × 11]>
Is this what you want?
library(tidyverse)
# Nest every data frame into a one-row tibble, then stack them; the list names
# become the `data_name` column.
# `data = everything()` is spelled out because nest() warns when it is called
# on an ungrouped data frame without a column specification (tidyr >= 1.0).
lapply(ex_list, nest, data = everything()) %>%
  dplyr::bind_rows(.id = "data_name")
# # A tibble: 2 x 2
# data_name data
# <chr> <list>
# 1 a <tibble [4 x 2]>
# 2 b <tibble [32 x 11]>
#OR map
#map(ex_list, nest, data = everything()) %>%
# bind_rows(.id = "data_name")
I have a dataframe which contains duplicate values in a list column and I want to keep only the first appearance of each unique value.
Let's say I have the following tibble:
# Example with a list-column `y`: rows 1 and 2 hold identical tibbles, row 3
# holds a different one.
df <- tribble(
~x, ~y,
1, tibble(a = 1:2, b = 2:3),
2, tibble(a = 1:2, b = 2:3),
3, tibble(a = 0:1, b = 0:1)
)
df
#> # A tibble: 3 x 2
#> x y
#> <dbl> <list>
#> 1 1 <tibble [2 x 2]>
#> 2 2 <tibble [2 x 2]>
#> 3 3 <tibble [2 x 2]>
The desired outcome is:
desired_df
#> # A tibble: 2 x 2
#> x y
#> <dbl> <list>
#> 1 1 <tibble [2 x 2]>
#> 2 3 <tibble [2 x 2]>
If y weren't a list column, I'd be able to use distinct(df, y, .keep_all = TRUE), but the function doesn't support list columns properly, as shown:
distinct(df, y, .keep_all = TRUE)
#> Warning: distinct() does not fully support columns of type `list`.
#> List elements are compared by reference, see ?distinct for details.
#> This affects the following columns:
#> - `y`
#> # A tibble: 3 x 2
#> x y
#> <dbl> <list>
#> 1 1 <tibble [2 x 2]>
#> 2 2 <tibble [2 x 2]>
#> 3 3 <tibble [2 x 2]>
Is there any "clean" way to achieve what I want?
One option is to use filter with duplicated
library(dplyr)
# duplicated() flags every list element that repeats an earlier one; keep only
# the rows that are not flagged.
filter(df, !duplicated(y))
I have come to an answer, but I think it's quite "wordy" (and I suspect it might be slow as well):
# Give every distinct tibble in `y` an id (its position among the unique
# values), keep the first row per id, then drop the helper column.
df <- df %>%
  mutate(unique_list_id = match(y, unique(y))) %>%
  distinct(unique_list_id, .keep_all = TRUE) %>%
  select(-unique_list_id)
df
#> # A tibble: 2 x 2
#> x y
#> <dbl> <list>
#> 1 1 <tibble [2 x 2]>
#> 2 3 <tibble [2 x 2]>
Here's a dumb example dataframe:
# tibble() replaces data_frame(), which is deprecated in the tibble package.
# The result is one nested tibble of B values per value of A.
df <- tibble(A = c(rep(1, 5), rep(2, 4)), B = 1:9) %>%
  group_by(A) %>%
  nest()
which looks like this:
> df
# A tibble: 2 × 2
A data
<dbl> <list>
1 1 <tibble [5 × 1]>
2 2 <tibble [4 × 1]>
I would like to add a third column called N with entries equal to the number of rows in each nested data_frame in data. I figured this would work:
> df %>%
+ mutate(N = nrow(data))
Error: Unsupported type NILSXP for column "N"
What's going wrong?
Combining dplyr and purrr you could do:
library(tidyverse)
# nrow() is applied to each nested tibble; map_dbl() collects the counts into
# a double vector.
df %>%
  mutate(n = map_dbl(data, ~ nrow(.x)))
#> # A tibble: 2 × 3
#> A data n
#> <dbl> <list> <dbl>
#> 1 1 <tibble [5 × 1]> 5
#> 2 2 <tibble [4 × 1]> 4
I like this approach, because you stay within your usual workflow, creating a new column within mutate, but leveraging the map_*-family, since you need to operate on a list.
You could do:
# rowwise() makes mutate() evaluate nrow(data) once per row, i.e. once per
# nested tibble.
mutate(rowwise(df), N = nrow(data))
Which gives:
#Source: local data frame [2 x 3]
#Groups: <by row>
#
## A tibble: 2 × 3
# A data N
# <dbl> <list> <int>
#1 1 <tibble [5 × 1]> 5
#2 2 <tibble [4 × 1]> 4
With dplyr:
# Counts rows by converting each group's slice of the list-column `data` into
# a data frame.
# NOTE(review): this appears to rely on every value of A occupying exactly one
# row, so that each group's `data` holds a single nested tibble - confirm
# before reusing on data where a group spans several rows.
df %>%
group_by(A) %>%
mutate(N = nrow(data.frame(data)))
A data N
<dbl> <list> <int>
1 1 <tibble [5 × 1]> 5
2 2 <tibble [4 × 1]> 4