consecutive grouped ID list R - r

If I have a df and want to do a grouped ID i would do:
df <- data.frame(id= rep(c(1,8,4), each = 3), score = runif(9))
df %>% group_by(id) %>% mutate(ID = cur_group_id())
following(How to create a consecutive group number answer of #Ronak Shah).
Now I have a list of those dfs and want to give consecutive group numbers, but they shall not start in every lists element new. In other words the ID column in listelement is 1 to 10, and in list two 11 to 15 and so on (so I can´t simply run the same code with lapply).
I guess I could do something like:
names(df)<-c("a", "b")
df<- mapply(cbind,df, "list"=names(df), SIMPLIFY=F)
df <- do.call(rbind, list)
df<-df %>% group_by(id) %>% mutate(ID = cur_group_id())
split(df, list)
but maybe some have more direct, clever ways?

A dplyr way could be using bind_rows as group_split (experimental):
library(dplyr)
df_list |>
bind_rows(.id = "origin") |>
mutate(ID = consecutive_id(id)) |> # If dplyr v.<1.1.0, use ID = cumsum(!duplicated(id))
group_split(origin, .keep = FALSE)
Output:
[[1]]
# A tibble: 9 × 3
id score ID
<dbl> <dbl> <int>
1 1 0.187 1
2 1 0.232 1
3 1 0.317 1
4 8 0.303 2
5 8 0.159 2
6 8 0.0400 2
7 4 0.219 3
8 4 0.811 3
9 4 0.526 3
[[2]]
# A tibble: 9 × 3
id score ID
<dbl> <dbl> <int>
1 3 0.915 4
2 3 0.831 4
3 3 0.0458 4
4 5 0.456 5
5 5 0.265 5
6 5 0.305 5
7 2 0.507 6
8 2 0.181 6
9 2 0.760 6
Data:
set.seed(1234)
df1 <- tibble(id = rep(c(1,8,4), each = 3), score = runif(9))
df2 <- tibble(id = rep(c(3,5,2), each = 3), score = runif(9))
df_list <- list(df1, df2)
Or using cur_group_id() for the group number, this approach, however, gives another order than you expect in your question:
library(dplyr)
df_list |>
bind_rows(.id = "origin") |>
mutate(ID = cur_group_id(), .by = "id") |> # If dplyr v.<1.1.0, use group_by()-notation
group_split(origin, .keep = FALSE)
Output:
[[1]]
# A tibble: 9 × 3
id score ID
<dbl> <dbl> <int>
1 1 0.187 1
2 1 0.232 1
3 1 0.317 1
4 8 0.303 6
5 8 0.159 6
6 8 0.0400 6
7 4 0.219 4
8 4 0.811 4
9 4 0.526 4
[[2]]
# A tibble: 9 × 3
id score ID
<dbl> <dbl> <int>
1 3 0.915 3
2 3 0.831 3
3 3 0.0458 3
4 5 0.456 5
5 5 0.265 5
6 5 0.305 5
7 2 0.507 2
8 2 0.181 2
9 2 0.760 2

Related

How I can calculate correlation between each variable within each group in R using dplyr package?

Let's say i have data frame in R that looks like this :
var2 = c(rep("A",3),rep("B",3),rep("C",3),rep("D",3),rep("E",3),rep("F",3),
rep("H",3),rep("I",3))
y2 = c(-1.23, -0.983, 1.28, -0.268, -0.46, -1.23,
1.87, 0.416, -1.99, 0.289, 1.7, -0.455,
-0.648, 0.376, -0.887,0.534,-0.679,-0.923,
0.987,0.324,-0.783,-0.679,0.326,0.998);length(y2)
group2 = c(rep(1,6),rep(2,6),rep(3,6),rep(1,6))
data2 = tibble(var2,group2,y2)
with output :
# A tibble: 24 × 3
var2 group2 y2
<chr> <dbl> <dbl>
1 A 1 -1.23
2 A 1 -0.983
3 A 1 1.28
4 B 1 -0.268
5 B 1 -0.46
6 B 1 -1.23
7 C 2 1.87
8 C 2 0.416
9 C 2 -1.99
10 D 2 0.289
11 D 2 1.7
12 D 2 -0.455
13 E 3 -0.648
14 E 3 0.376
15 E 3 -0.887
16 F 3 0.534
17 F 3 -0.679
18 F 3 -0.923
19 H 1 0.987
20 H 1 0.324
21 H 1 -0.783
22 I 1 -0.679
23 I 1 0.326
24 I 1 0.998
i want to calculate the correlation of each distinct pair in R within each group using dplyr.
Ideally i want the resulted tibble to look like this (the 4th column to contain the values of each correlation pair):
which ideally must look like this :
group
var1
var2
value
1
A
B
cor(A,B)
1
A
H
cor(A,H)
1
A
I
cor(A,I)
1
B
H
cor(B,H)
1
B
I
cor(B,I)
1
H
I
cor(H,I)
2
C
D
cor(C,D)
3
E
F
cor(E,F)
How i can do that in R ?
Any help ?
A possible solution:
library(tidyverse)
data2 %>%
group_by(group2) %>%
group_split() %>%
map(\(x) x %>% group_by(var2) %>%
group_map(~ data.frame(.x[-1]) %>% set_names(.y)) %>%
bind_cols() %>% cor %>%
{data.frame(row = rownames(.)[row(.)[upper.tri(.)]],
col = colnames(.)[col(.)[upper.tri(.)]],
corr = .[upper.tri(.)])}) %>%
imap_dfr(~ data.frame(group = .y, .x))
#> group row col corr
#> 1 1 A B -0.9949738
#> 2 1 A H -0.9581357
#> 3 1 B H 0.9819901
#> 4 1 A I 0.8533855
#> 5 1 B I -0.9012948
#> 6 1 H I -0.9669093
#> 7 2 C D 0.4690460
#> 8 3 E F -0.1864518
if you are okay with repeating the functions you can do:
fun <- function(x, y){
a <- split(x, y)
col1 <- combn(names(a), 2, paste, collapse = '_')
col2 <- combn(unname(a), 2, do.call, what='cor')
data.frame(vars = col1, cor = col2)
}
data2 %>%
group_by(group2)%>%
summarise(fun(y2, var2), .groups = 'drop')
# A tibble: 8 x 3
# Groups: group2 [3]
group2 vars cor
<dbl> <chr> <dbl>
1 1 A_B -0.995
2 1 A_H -0.958
3 1 A_I 0.853
4 1 B_H 0.982
5 1 B_I -0.901
6 1 H_I -0.967
7 2 C_D 0.469
8 3 E_F -0.186
If you do not want to repeat the functions as the process might be expensive, you can do:
data2 %>%
group_by(group2)%>%
summarise(s=combn(split(y2, var2), 2,
\(x)stack(setNames(cor(x[[1]], x[[2]]), paste(names(x), collapse='_'))),
simplify = FALSE),.groups = 'drop') %>%
unnest(s)
# A tibble: 8 x 3
group2 values ind
<dbl> <dbl> <fct>
1 1 -0.995 A_B
2 1 -0.958 A_H
3 1 0.853 A_I
4 1 0.982 B_H
5 1 -0.901 B_I
6 1 -0.967 H_I
7 2 0.469 C_D
8 3 -0.186 E_F
Another option would be widyr::pairwise_cor which requires to first add an identifier for the "observation":
library(widyr)
library(dplyr)
data2 %>%
group_by(var2, group2) %>%
mutate(obs = row_number()) |>
ungroup() %>%
split(.$group2) %>%
lapply(function(x) widyr::pairwise_cor(x, var2, obs, y2, upper = FALSE)) %>%
bind_rows(.id = "group2")
#> # A tibble: 8 × 4
#> group2 item1 item2 correlation
#> <chr> <chr> <chr> <dbl>
#> 1 1 A B -0.995
#> 2 1 A H -0.958
#> 3 1 B H 0.982
#> 4 1 A I 0.853
#> 5 1 B I -0.901
#> 6 1 H I -0.967
#> 7 2 C D 0.469
#> 8 3 E F -0.186

Join current and previous dataframes in nested tibble using lag() and mutate() to produce a new list-column

I am trying to determine the difference between the set of ids in subsequent pairs of dataframes. The dataframes are derived from an original dataframe split by a grouping variable representing the time period. The results should show the rows of the new ids that occur in the current time period compared to the previous one.
I can accomplish this with a list of dataframes:
library(tidyverse)
set.seed(999)
examp <- tibble(
id = c(replicate(4, sample.int(20, 9))),
year = rep(1:4, each = 9),
val = runif(36)
)
examp %>%
split(.$year) %>%
# note my default, I compare the first year to itself
map2(lag(., default = .[1]), anti_join, by = "id")
$`1`
# A tibble: 0 x 3
# ... with 3 variables: id <int>, year <int>, val <dbl>
$`2`
# A tibble: 3 x 3
id year val
<int> <int> <dbl>
1 5 2 0.450
2 11 2 0.943
3 2 2 0.571
$`3`
# A tibble: 6 x 3
id year val
<int> <int> <dbl>
1 19 3 0.870
2 12 3 0.403
3 9 3 0.331
4 20 3 0.315
5 16 3 0.455
6 17 3 0.699
$`4`
# A tibble: 5 x 3
id year val
<int> <int> <dbl>
1 4 4 0.190
2 11 4 0.0804
3 2 4 0.247
4 1 4 0.619
5 18 4 0.434
But I could not get the same to work using mutate in a nested dataframe:
examp %>%
nest_by(year) %>%
mutate(new = anti_join(data, lag(data), by = "id"))
# A tibble: 4 x 3
# Rowwise: year
year data new$id $val
<int> <list<tibble[,2]>> <int> <dbl>
1 1 [9 x 2] 3 0.0601
2 2 [9 x 2] 1 0.495
3 3 [9 x 2] 17 0.699
4 4 [9 x 2] 18 0.434
Here I could not figure out how to specify the default and the output is unexpected. I expected "new" to be a list-column of dataframes corresponding with those above, which I could then unnest.
I am interested in learning more about working with nested dataframes and any help understanding how to get this to work would be much appreciated. Additionally, if there is another (simple) solution to this general problem, I would be happy to learn about it.
It should be wrapped in a list
library(dplyr)
out <- examp %>%
nest_by(year) %>%
ungroup %>%
mutate(newdat = lag(data, default = data[1])) %>%
rowwise %>%
mutate(new = list(anti_join(data, newdat, by = 'id')))
-output
out$new
[[1]]
# A tibble: 0 x 2
# … with 2 variables: id <int>, val <dbl>
[[2]]
# A tibble: 3 x 2
id val
<int> <dbl>
1 5 0.450
2 11 0.943
3 2 0.571
[[3]]
# A tibble: 6 x 2
id val
<int> <dbl>
1 19 0.870
2 12 0.403
3 9 0.331
4 20 0.315
5 16 0.455
6 17 0.699
[[4]]
# A tibble: 5 x 2
id val
<int> <dbl>
1 4 0.190
2 11 0.0804
3 2 0.247
4 1 0.619
5 18 0.434

Creating a new column using the previous value of a different column and the previous value of itself

how can I create a new column which starting value is 1 and the following values are a multiplication of the previous value of a column (b) and the previous value of itself (d)?
these data are only made up, but have the structure of my data:
> a <- rep(1:10, 3)
> b <- runif(30)
> c <- tibble(a,b)
> c
# A tibble: 30 x 2
a b
<int> <dbl>
1 1 0.945
2 2 0.280
3 3 0.464
4 4 0.245
5 5 0.917
6 6 0.913
7 7 0.144
8 8 0.481
9 9 0.873
10 10 0.754
# ... with 20 more rows
Then I try to calculate column d:
> c <- c %>%
+ group_by(a) %>%
+ mutate(d = accumulate(lag(b, k = 1), `*`, .init = 1))
and it should look like this
# A tibble: 30 x 3
# Groups: a [10]
a b d
<int> <dbl> <dbl>
1 1 0.945 1 <--- b[1] * d[1] = d[2]
2 2 0.280 0.945
3 3 0.464 0.265
4 4 0.245 0.123
5 5 0.917 0.03
#...
But instead I am getting this error message.
Fehler: Column `d` must be length 3 (the group size) or one, not 4
The problem is that when you initialize accumulate with .init = that adds an extra first element of the vector.
You could try this:
library(dplyr)
library(purrr)
c %>%
group_by(a) %>%
mutate(d = accumulate(b[(2:length(b))-1], `*`,.init=1)) %>%
arrange(a)
# a b d
# <int> <dbl> <dbl>
# 1 1 0.266 1
# 2 1 0.206 0.266
# 3 1 0.935 0.0547
# 4 2 0.372 1
# 5 2 0.177 0.372
# … with 25 more rows
Data
library(tibble)
set.seed(1)
a <- rep(1:10, 3)
b <- runif(30)
c <- tibble(a,b)
Using dplyr, I would do this:
c %>%
mutate(d = 1*accumulate(.x = b[-length(b)],
.init = 1,
.f = `*`))
# # A tibble: 30 x 3
# a b d
# <int> <dbl> <dbl>
# 1 1 0.562 1
# 2 2 0.668 0.562
# 3 3 0.100 0.375
# 4 4 0.242 0.0376
# 5 5 0.0646 0.00907
# 6 6 0.373 0.000586
# 7 7 0.664 0.000219
# 8 8 0.915 0.000145
# 9 9 0.848 0.000133
# 10 10 0.952 0.000113
# # ... with 20 more rows

Dplyr filter top and bottom rows by value simultaneously on grouped data

# Sample dataframe
set.seed(123)
d = data.frame(x = runif(120), grp = gl(3, 40))
# Select top_n
d %>%
group_by(grp) %>%
top_n(n=3, wt=x)
How do I select both top and bottom observations within the same pipe?
Have tried the following but does not work
# helper function
my_top_bott = function(x, n, wt) {
x1 = x %>% top_n(n=n, wt=wt)
x2 = x %>% top_n(n=n, wt=-wt)
x = bind_rows(x1, x2)
return(x)
}
# Pipe
d %>%
group_by(grp) %>%
my_top_bott(., n=3, wt=x)
One possibility could be:
d %>%
group_by(grp) %>%
filter(dense_rank(x) <= 3 | dense_rank(desc(x)) <= 3)
x grp
<dbl> <fct>
1 0.0456 1
2 0.957 1
3 0.0421 1
4 0.994 1
5 0.963 1
6 0.0246 1
7 0.858 2
8 0.0458 2
9 0.895 2
10 0.0948 2
11 0.815 2
12 0.000625 2
13 0.103 3
14 0.985 3
15 0.0936 3
16 0.954 3
17 0.0607 3
18 0.954 3
Or a possibility proposed by #IceCreamToucan:
d %>%
group_by(grp) %>%
filter(!between(dense_rank(x), 3 + 1, n() - 3))
Or a possibility involving match():
d %>%
group_by(grp) %>%
filter(!is.na(x[match(x, sort(x)[c(1:3, (n()-2):n())])]))
You could also use the row_number().
d %>%
group_by(grp) %>%
arrange(desc(x)) %>%
filter(row_number() > max(row_number()) - 3 | row_number() <= 3)
x grp
<dbl> <fct>
1 0.995 2
2 0.975 2
3 0.975 1
4 0.974 3
5 0.974 3
6 0.960 1
7 0.960 3
8 0.951 2
9 0.874 1
10 0.127 2
11 0.104 2
12 0.0693 1
13 0.0520 1
14 0.0279 2
15 0.0146 3
16 0.0114 3
17 0.00864 1
18 0.00333 3
Still in the tidyverse, top_n is superceeded by the ranking functions. One possibility would be to use slice.
d %>%
group_by(grp) %>%
arrange(x) %>%
slice(1:3, (n()-2):n(), with_ties = FALSE)
Notice that the sequence is n()-2:n() and that the paranthesis before the start of the sequence is important.

bootstrap by group in tibble

Suppose I have a tibble tbl_
tbl_ <- tibble(id = c(1,1,2,2,3,3), dta = 1:6)
tbl_
# A tibble: 6 x 2
id dta
<dbl> <int>
1 1 1
2 1 2
3 2 3
4 2 4
5 3 5
6 3 6
There are 3 id groups. I want to resample entire id groups 3 times with replacement. For example the resulting tibble can be:
id dta
<dbl> <int>
1 1 1
2 1 2
3 1 1
4 1 2
5 3 5
6 3 6
but not
id dta
<dbl> <int>
1 1 1
2 1 2
3 1 1
4 2 4
5 3 5
6 3 6
or
id dta
<dbl> <int>
1 1 1
2 1 1
3 2 3
4 2 4
5 3 5
6 3 6
Here is one option with sample_n and distinct
library(tidyverse)
distinct(tbl_, id) %>%
sample_n(nrow(.), replace = TRUE) %>%
pull(id) %>%
map_df( ~ tbl_ %>%
filter(id == .x)) %>%
arrange(id)
# A tibble: 6 x 2
# id dta
# <dbl> <int>
#1 1.00 1
#2 1.00 2
#3 1.00 1
#4 1.00 2
#5 3.00 5
#6 3.00 6
An option can be to get the minimum row number for each id. That row number will be used to generate random samples from wiht replace = TRUE.
library(dplyr)
tbl_ %>% mutate(rn = row_number()) %>%
group_by(id) %>%
summarise(minrow = min(rn)) ->min_row
indx <- rep(sample(min_row$minrow, nrow(min_row), replace = TRUE), each = 2) +
rep(c(0,1), 3)
tbl_[indx,]
# # A tibble: 6 x 2
# id dta
# <dbl> <int>
# 1 1.00 1
# 2 1.00 2
# 3 3.00 5
# 4 3.00 6
# 5 2.00 3
# 6 2.00 4
Note: In the above answer the number of rows for each id has been assumed as 2 but this answer can tackle any number of IDs. The hard-coded each=2 and c(0,1) needs to be modified in order to scale it up to handle more than 2 rows for each id

Resources