This has to be simple in base R, but it is driving me crazy with dplyr (which overall has made my life much better!).
Suppose you have the following tibbles:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df1 <- tibble(x=seq(5)*19, a1=seq(5)*1, a2=seq(5)*2, a3=seq(5)*4)
df1
#> # A tibble: 5 x 4
#> x a1 a2 a3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 19 1 2 4
#> 2 38 2 4 8
#> 3 57 3 6 12
#> 4 76 4 8 16
#> 5 95 5 10 20
df2 <- tibble(b1=3, b2=0.5, b3=10)
df2
#> # A tibble: 1 x 3
#> b1 b2 b3
#> <dbl> <dbl> <dbl>
#> 1 3 0.5 10
Created on 2020-06-11 by the reprex package (v0.3.0)
Then I simply want to replace, in df1, a1 with a1/b1, a2 with a2/b2, and so on.
This has to be general enough to handle the case when I have many columns.
Any suggestion is appreciated.
You can use Map:
# divide each column of df1[-1] by the corresponding column of df2
df1[-1] <- Map(`/`, df1[-1], df2)
# A tibble: 5 x 4
# x a1 a2 a3
# <dbl> <dbl> <dbl> <dbl>
#1 19 0.333 4 0.4
#2 38 0.667 8 0.8
#3 57 1 12 1.2
#4 76 1.33 16 1.6
#5 95 1.67 20 2
Or if you want a tidyverse solution you can use map2 from purrr:
df1[-1] <- purrr::map2(df1[-1], df2, `/`)
You can use rowwise() with c_across():
df1 %>%
  rowwise() %>%
  mutate(c_across(a1:a3) / df2, .keep = "unused") %>%
  ungroup()
# # A tibble: 5 x 4
# x b1 b2 b3
# <dbl> <dbl> <dbl> <dbl>
# 1 19 0.333 4 0.4
# 2 38 0.667 8 0.8
# 3 57 1 12 1.2
# 4 76 1.33 16 1.6
# 5 95 1.67 20 2
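Note that the result columns inherit df2's names (b1, b2, b3), as the printed tibble shows. If you want to keep the original a-names, a hedged follow-up (assuming the matching a/b prefix pattern) is a rename:
df1 %>%
  rowwise() %>%
  mutate(c_across(a1:a3) / df2, .keep = "unused") %>%
  ungroup() %>%
  # rename b1/b2/b3 back to a1/a2/a3
  rename_with(~ sub("^b", "a", .x), starts_with("b"))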
Another base R option
# t() turns the columns into rows so the length-3 divisor recycles correctly,
# and the second t() flips the result back
df1[-1] <- t(t(df1[-1]) / unlist(df2))
df1
# # A tibble: 5 x 4
# x a1 a2 a3
# <dbl> <dbl> <dbl> <dbl>
# 1 19 0.333 4 0.4
# 2 38 0.667 8 0.8
# 3 57 1 12 1.2
# 4 76 1.33 16 1.6
# 5 95 1.67 20 2
One solution could be:
bind_cols(select(df1, x),
          sweep(select(df1, -x), 2, FUN = `/`, unlist(df2)))
x a1 a2 a3
<dbl> <dbl> <dbl> <dbl>
1 19 0.333 4 0.4
2 38 0.667 8 0.8
3 57 1 12 1.2
4 76 1.33 16 1.6
5 95 1.67 20 2
Or like this if you have more columns:
df1[,2:4] <- df1[,2:4] / (df2 %>% slice(rep(1:n(), each = nrow(df1))))
# A tibble: 5 x 4
x a1 a2 a3
<dbl> <dbl> <dbl> <dbl>
1 19 0.333 4 0.4
2 38 0.667 8 0.8
3 57 1 12 1.2
4 76 1.33 16 1.6
5 95 1.67 20 2
Another option considers the column names of your variables and pairs each with the number it has to be divided by. The function cur_column() comes in handy inside mutate(across()) - the function you wanted to use:
# list of divisors
l <- as.list(as.numeric(df2[1, ]))

df1 %>%
  mutate(across(starts_with("a"),
                ~ . / l[[na.omit(as.numeric(unlist(strsplit(cur_column(), "[^[:digit:]]"))))]]))
Output
# A tibble: 5 x 4
# x a1 a2 a3
# <dbl> <dbl> <dbl> <dbl>
# 1 19 0.333 4 0.4
# 2 38 0.667 8 0.8
# 3 57 1 12 1.2
# 4 76 1.33 16 1.6
# 5 95 1.67 20 2
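A hedged simplification of the same cur_column() idea: pair each a-column with its b-counterpart by name instead of parsing out the digit (this assumes the a*/b* naming seen in df1 and df2):
df1 %>%
  mutate(across(starts_with("a"),
                # look up the matching divisor column in df2 by swapping the prefix
                ~ .x / df2[[sub("^a", "b", cur_column())]]))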
An option with base R
# col() gives each cell's column index, so the unlisted divisors recycle column-wise
df1[-1] <- df1[-1]/unlist(df2)[col(df1[-1])]
I have longitudinal data of body weights for over 100K participants. The time points of the weight measurements differ between participants. I want to know the average time difference between the 1st and 2nd measurement, the 2nd and 3rd measurement, and so on. I also want to know how many people (or what % of people) have 3 body weight measurements, and likewise for 4, 5, 6, 7, 8, etc. How can I find these answers in R?
Perhaps something like this:
library(dplyr, warn.conflicts = F)
set.seed(1)
# generate some sample data
dates <- seq(as.Date("2000-01-01"), by = "day", length.out = 500)
sample_data <- tibble(
  participant_id = sample(1:1000, size = 5000, replace = T),
  meas_date = sample(dates, size = 5000, replace = T)) %>%
  arrange(participant_id, meas_date)
sample_data
#> # A tibble: 5,000 × 2
#> participant_id meas_date
#> <int> <date>
#> 1 1 2000-01-18
#> 2 1 2000-02-28
#> 3 1 2000-05-15
#> 4 1 2001-02-01
#> 5 2 2000-05-11
#> 6 3 2000-01-22
#> 7 3 2000-03-27
#> 8 3 2000-04-17
#> 9 3 2000-09-23
#> 10 3 2000-12-13
#> # … with 4,990 more rows
# periods between each measurement for each participant
meas_periods <- sample_data %>%
  group_by(participant_id) %>%
  mutate(meas_n = row_number(),
         date_diff = meas_date - lag(meas_date)) %>%
  ungroup()
meas_periods
#> # A tibble: 5,000 × 4
#> participant_id meas_date meas_n date_diff
#> <int> <date> <int> <drtn>
#> 1 1 2000-01-18 1 NA days
#> 2 1 2000-02-28 2 41 days
#> 3 1 2000-05-15 3 77 days
#> 4 1 2001-02-01 4 262 days
#> 5 2 2000-05-11 1 NA days
#> 6 3 2000-01-22 1 NA days
#> 7 3 2000-03-27 2 65 days
#> 8 3 2000-04-17 3 21 days
#> 9 3 2000-09-23 4 159 days
#> 10 3 2000-12-13 5 81 days
#> # … with 4,990 more rows
# average period between meas_n-1 and meas_n
meas_periods %>%
  group_by(meas_n) %>%
  summarise(mean_duration = mean(date_diff))
#> # A tibble: 13 × 2
#> meas_n mean_duration
#> <int> <drtn>
#> 1 1 NA days
#> 2 2 88.54102 days
#> 3 3 86.16762 days
#> 4 4 76.21154 days
#> 5 5 69.11392 days
#> 6 6 67.16798 days
#> 7 7 50.67089 days
#> 8 8 50.91111 days
#> 9 9 49.89873 days
#> 10 10 48.70588 days
#> 11 11 51.00000 days
#> 12 12 26.25000 days
#> 13 13 66.00000 days
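Since the first measurement has no predecessor, meas_n = 1 shows NA days above; a hedged tweak, if those rows just add noise, is to drop them (or pass na.rm = TRUE to mean()) before averaging:
meas_periods %>%
  filter(meas_n > 1) %>%   # keep only measurements that have a previous one
  group_by(meas_n) %>%
  summarise(mean_duration = mean(date_diff))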
# number and percentage of participants gone through meas_n measurements
meas_periods %>%
  count(meas_n, name = "participant_n") %>%
  mutate(percent = participant_n/max(participant_n))
#> # A tibble: 13 × 3
#> meas_n participant_n percent
#> <int> <int> <dbl>
#> 1 1 996 1
#> 2 2 963 0.967
#> 3 3 877 0.881
#> 4 4 728 0.731
#> 5 5 553 0.555
#> 6 6 381 0.383
#> 7 7 237 0.238
#> 8 8 135 0.136
#> 9 9 79 0.0793
#> 10 10 34 0.0341
#> 11 11 12 0.0120
#> 12 12 4 0.00402
#> 13 13 1 0.00100
Created on 2022-11-02 with reprex v2.0.2
I am trying to assign quantile groups to stacked data such that, for each category of data (r1 and r2 in my example), I can classify the values into 5 groups. I can manage to do this using ntile() as follows.
library(dplyr)
library(reshape2)

r1 <- rnorm(10, 0, 1)
r2 <- rnorm(10, 2, 4)
df <- cbind(r1, r2)
df <- melt(df)
df <- df %>% group_by(Var2) %>% mutate(group = ntile(value, 5))
However, what should I do if I want to exclude the top and bottom 10% when assigning the groups? Ideally, I would like to keep those top and bottom values in the output table with their group code showing as "NA".
Thanks to anyone who can help!
Your question is a little ambiguous. It is not clear whether you wish to exclude the top and bottom 10% from the quintile calculation (so that you are getting equal quintiles of the 10th-90th centiles of the original data), or whether you want to compute the quintiles first on all the data and then exclude the first and last 10%. Doing it the second way will give you smaller 1st and 5th quintiles, so I assume you mean the first method:
df %>%
  group_by(Var2) %>%
  mutate(group = ntile(value, 10)) %>%
  mutate(group = ntile(ifelse(group %% 9 == 1, NA, value), 5))
#> # A tibble: 20 x 4
#> # Groups: Var2 [2]
#> Var1 Var2 value group
#> <int> <fct> <dbl> <int>
#> 1 1 r1 -0.626 1
#> 2 2 r1 0.184 2
#> 3 3 r1 -0.836 NA
#> 4 4 r1 1.60 NA
#> 5 5 r1 0.330 3
#> 6 6 r1 -0.820 1
#> 7 7 r1 0.487 3
#> 8 8 r1 0.738 5
#> 9 9 r1 0.576 4
#> 10 10 r1 -0.305 2
#> 11 1 r2 8.05 NA
#> 12 2 r2 3.56 2
#> 13 3 r2 -0.485 1
#> 14 4 r2 -6.86 NA
#> 15 5 r2 6.50 5
#> 16 6 r2 1.82 1
#> 17 7 r2 1.94 2
#> 18 8 r2 5.78 4
#> 19 9 r2 5.28 3
#> 20 10 r2 4.38 3
Just in case, here is how you would achieve the second method:
df %>%
  group_by(Var2) %>%
  mutate(group = ntile(value, 5)) %>%
  mutate(group = ifelse(ntile(value, 10) %% 9 == 1, NA, group))
#> # A tibble: 20 x 4
#> # Groups: Var2 [2]
#> Var1 Var2 value group
#> <int> <fct> <dbl> <int>
#> 1 1 r1 -0.626 2
#> 2 2 r1 0.184 3
#> 3 3 r1 -0.836 NA
#> 4 4 r1 1.60 NA
#> 5 5 r1 0.330 3
#> 6 6 r1 -0.820 1
#> 7 7 r1 0.487 4
#> 8 8 r1 0.738 5
#> 9 9 r1 0.576 4
#> 10 10 r1 -0.305 2
#> 11 1 r2 8.05 NA
#> 12 2 r2 3.56 3
#> 13 3 r2 -0.485 1
#> 14 4 r2 -6.86 NA
#> 15 5 r2 6.50 5
#> 16 6 r2 1.82 2
#> 17 7 r2 1.94 2
#> 18 8 r2 5.78 4
#> 19 9 r2 5.28 4
#> 20 10 r2 4.38 3
Created on 2022-02-19 by the reprex package (v2.0.1)
Setup and data used
library(dplyr)
library(reshape2)
set.seed(1)
r1 <- rnorm(10,0,1)
r2 <- rnorm(10,2,4)
df <- cbind(r1,r2)
df <- melt(df)
Given a data.frame:
library(tidyverse)
set.seed(0)
df <- tibble(A = 1:10, B = rnorm(10), C = rbinom(10,2,0.6))
var <- "B"
I'd like to filter the data frame by the highest values of the variable named in var. Logically, I'd do either:
df %>%
  slice_max({{ var }}, n = 5)
#> # A tibble: 1 × 3
#> A B C
#> <int> <dbl> <int>
#> 1 1 1.26 1
df %>%
  slice_max(!! var, n = 5)
#> # A tibble: 1 × 3
#> A B C
#> <int> <dbl> <int>
#> 1 1 1.26 1
But neither interpolation is working... what am I missing here?
Expected output would be the same as:
df %>%
  slice_max(B, n = 5)
#> # A tibble: 5 × 3
#> A B C
#> <int> <dbl> <int>
#> 1 10 2.40 0
#> 2 3 1.33 2
#> 3 4 1.27 1
#> 4 1 1.26 1
#> 5 5 0.415 2
I think you need to use the newer .data version as outlined here:
df %>%
  slice_max(.data[[var]], n = 5)
#> # A tibble: 5 × 3
#> A B C
#> <int> <dbl> <int>
#> 1 10 2.40 0
#> 2 3 1.33 2
#> 3 4 1.27 1
#> 4 1 1.26 1
#> 5 5 0.415 2
I am puzzled as to why your approach gets only the first row, though!
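A hedged guess at that single-row behaviour: both {{ var }} and !! var inject the literal string "B", a length-1 constant, so slice_max() ranks that one value and returns a single row, just as the literal form does:
# equivalent to what the embraced/unquoted forms evaluate to;
# the constant has length 1, hence one row comes back
df %>%
  slice_max("B", n = 5)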
We may convert the string to a symbol with sym() and evaluate it (!!):
library(dplyr)
df %>%
  slice_max(!! rlang::sym(var), n = 5)
Output:
# A tibble: 5 × 3
A B C
<int> <dbl> <int>
1 10 2.40 0
2 3 1.33 2
3 4 1.27 1
4 1 1.26 1
5 5 0.415 2
I want to perform several operations on multiple columns, and I can use dplyr::across() to do it this way:
library(tidyverse)
df = tibble(x=1:5, p1=x*2, p2=x*4, p3=x*5)
r1 = df %>%
  mutate(across(starts_with("p"), c(inf=~.x-1, sup=~.x+1)))
r1
#> # A tibble: 5 x 10
#> x p1 p2 p3 p1_inf p1_sup p2_inf p2_sup p3_inf p3_sup
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 4 5 1 3 3 5 4 6
#> 2 2 4 8 10 3 5 7 9 9 11
#> 3 3 6 12 15 5 7 11 13 14 16
#> 4 4 8 16 20 7 9 15 17 19 21
#> 5 5 10 20 25 9 11 19 21 24 26
names(r1)
#> [1] "x" "p1" "p2" "p3" "p1_inf" "p1_sup" "p2_inf" "p2_sup"
#> [9] "p3_inf" "p3_sup"
However, this is not very scalable if the function calculates a lot of things as it would be evaluated twice.
Instead, it would be nice if I could use a function that calculates the things that need to be calculated, and then returns a list of the 2 (or more) results.
For instance, consider this example:
# perform heavy calculation on x2 and return 2 flavours of it
f = function(x) {
  x2=x #wow, such heavy, very calculate
  Sys.sleep(1)
  data.frame(inf=x2-10, sup=x2+10)
}
r2 = df %>%
  mutate(across(starts_with("p"), f, .names="{.col}_{.fn}"))
r2
#> # A tibble: 5 x 7
#> x p1 p2 p3 p1_1$inf $sup p2_1$inf $sup p3_1$inf $sup
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 4 5 -8 12 -6 14 -5 15
#> 2 2 4 8 10 -6 14 -2 18 0 20
#> 3 3 6 12 15 -4 16 2 22 5 25
#> 4 4 8 16 20 -2 18 6 26 10 30
#> 5 5 10 20 25 0 20 10 30 15 35
names(r2)
#> [1] "x" "p1" "p2" "p3" "p1_1" "p2_1" "p3_1"
map_chr(r2, class)
#> x p1 p2 p3 p1_1 p2_1
#> "integer" "numeric" "numeric" "numeric" "data.frame" "data.frame"
#> p3_1
#> "data.frame"
Created on 2021-10-25 by the reprex package (v2.0.1)
Using rbind() instead of data.frame() would yield the same result with slightly different names (p1_1$inf becomes p1_1[,"inf"]) and a different class (data.frame becomes c("matrix", "array")).
Moreover, when using a single function, {.fn} resolves to the position of the function, so there are naming issues.
I also tried to unnest() the result but with no success.
Is there a way to get the exact result of my first output using a function in across()?
Maybe this will help you?
library(tidyverse)
f = function(x, y) {
  x2=x
  tibble(!!paste0(y, '_inf') := x2-10,
         !!paste0(y, '_sup') := x2+10)
}
imap_dfc(select(df, starts_with('p')), f)
# p1_inf p1_sup p2_inf p2_sup p3_inf p3_sup
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 -8 12 -6 14 -5 15
#2 -6 14 -2 18 0 20
#3 -4 16 2 22 5 25
#4 -2 18 6 26 10 30
#5 0 20 10 30 15 35
Bind to the original df:
bind_cols(df %>% select(-starts_with('p')),
          imap_dfc(select(df, starts_with('p')), f))
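Note that this drops the original p columns, whereas the desired r1 output kept them; a hedged variant that keeps them simply binds to the full df:
# keep x and the original p columns alongside the new inf/sup columns
bind_cols(df, imap_dfc(select(df, starts_with('p')), f))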
Actually, since you have completed your heavy calculations, resulting in a nested data frame, all you need is to convert it to a flat form; maybe some mutate()s plus a rename could help?
r2 <- df %>%
  mutate(across(2:4, f, .names = "{.col}_{.fn}")) %>%
  mutate(across(5:7, .names = "{.col}_inf", .fns = ~ .x[,1])) %>%
  mutate(across(5:7, .names = "{.col}_sup", .fns = ~ .x[,2])) %>%
  rename_with(.fn = ~ gsub("_1_", "_", .x)) %>%
  select(-contains("_1"))
r2
# A tibble: 5 x 10
x p1 p2 p3 p1_inf p2_inf p3_inf p1_sup p2_sup p3_sup
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 4 5 -8 -6 -5 12 14 15
2 2 4 8 10 -6 -2 0 14 18 20
3 3 6 12 15 -4 2 5 16 22 25
4 4 8 16 20 -2 6 10 18 26 30
5 5 10 20 25 0 10 15 20 30 35
Actually, this has been considered in an issue on dplyr's GitHub: https://github.com/tidyverse/dplyr/issues/5563#issuecomment-721769342.
There, @romainfrancois gives an incredibly useful solution in the form of this unpackross() function:
library(tidyverse)
f = function(x) tibble(inf=x-10, sup=x+10)
unpackross = function(...) {
  out = across(...)
  tidyr::unpack(out, names(out), names_sep = "_")
}
df = tibble(x=1:5, p1=x*2, p2=x*4, p3=x*5)
r2 = df %>%
  mutate(unpackross(starts_with("p"), f, .names="{.col}_{.fn}"))
r2
#> # A tibble: 5 x 10
#> x p1 p2 p3 p1_1_inf p1_1_sup p2_1_inf p2_1_sup p3_1_inf p3_1_sup
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 4 5 -8 12 -6 14 -5 15
#> 2 2 4 8 10 -6 14 -2 18 0 20
#> 3 3 6 12 15 -4 16 2 22 5 25
#> 4 4 8 16 20 -2 18 6 26 10 30
#> 5 5 10 20 25 0 20 10 30 15 35
names(r2)
#> [1] "x" "p1" "p2" "p3" "p1_1_inf" "p1_1_sup"
#> [7] "p2_1_inf" "p2_1_sup" "p3_1_inf" "p3_1_sup"
map_chr(r2, class)
#> x p1 p2 p3 p1_1_inf p1_1_sup p2_1_inf p2_1_sup
#> "integer" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
#> p3_1_inf p3_1_sup
#> "numeric" "numeric"
Created on 2021-10-26 by the reprex package (v2.0.1)
Hopefully, there will be an unpack parameter in across() one day! (add a +1 on my suggestion here if you agree)
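If you also want the p1_inf-style names of the first output (without the positional _1_ part), a hedged cleanup reusing the gsub idea from the earlier answer:
# strip the positional function index from the unpacked names
r2 %>% rename_with(~ gsub("_1_", "_", .x))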
I came across a problem that forced me to use a loop instead of my preferred dplyr pipe flow.
I want to group rows based on consecutive observations of the same value.
For example, if the first four observations of type equal a, the first four observations should be assigned to the same group. Order matters, so I can't just use dplyr::group_by() and dplyr::summarize().
The code below should explain the problem fairly well. I was wondering if anyone could propose a less verbose way to do this, preferably using tidyverse packages, and not data.tables.
library(tidyverse)
# Create some test data
df <- tibble(
  id = 1:20,
  type = c(rep("a", 5), rep("b", 5), rep("a", 5), rep("b", 5)),
  val = runif(20)
)
df
#> # A tibble: 20 x 3
#> id type val
#> <int> <chr> <dbl>
#> 1 1 a 0.0606
#> 2 2 a 0.501
#> 3 3 a 0.974
#> 4 4 a 0.0833
#> 5 5 a 0.752
#> 6 6 b 0.0450
#> 7 7 b 0.367
#> 8 8 b 0.649
#> 9 9 b 0.846
#> 10 10 b 0.896
#> 11 11 a 0.178
#> 12 12 a 0.295
#> 13 13 a 0.206
#> 14 14 a 0.233
#> 15 15 a 0.851
#> 16 16 b 0.179
#> 17 17 b 0.801
#> 18 18 b 0.326
#> 19 19 b 0.269
#> 20 20 b 0.584
# Solve problem with a loop
count <- 1
df$consec_group <- NA
for (i in 1:nrow(df)) {
  current <- df$type[i]
  lag <- ifelse(i == 1, NA, df$type[i - 1])
  lead <- ifelse(i == nrow(df), NA, df$type[i + 1])
  if (lead %>% is.na) {
    df$consec_group[i] <- ifelse(current == lag, count, count + 1)
  } else {
    df$consec_group[i] <- count
    if (current != lead) count <- count + 1
  }
}
df
#> # A tibble: 20 x 4
#> id type val consec_group
#> <int> <chr> <dbl> <dbl>
#> 1 1 a 0.0606 1
#> 2 2 a 0.501 1
#> 3 3 a 0.974 1
#> 4 4 a 0.0833 1
#> 5 5 a 0.752 1
#> 6 6 b 0.0450 2
#> 7 7 b 0.367 2
#> 8 8 b 0.649 2
#> 9 9 b 0.846 2
#> 10 10 b 0.896 2
#> 11 11 a 0.178 3
#> 12 12 a 0.295 3
#> 13 13 a 0.206 3
#> 14 14 a 0.233 3
#> 15 15 a 0.851 3
#> 16 16 b 0.179 4
#> 17 17 b 0.801 4
#> 18 18 b 0.326 4
#> 19 19 b 0.269 4
#> 20 20 b 0.584 4
Created on 2019-03-14 by the reprex package (v0.2.1)
This grouping of consecutive type occurrences is really just an intermediate step. My endgame is to manipulate val for a given consec_group based on the values of val that occurred within the previous consec_group. Advice on relevant packages would be appreciated.
You say "no data.tables", but are you sure? It's so *** fast and easy (in this case)...
library(data.table)
setDT(df)[, groupid := rleid(type)][]
# id type val groupid
# 1: 1 a 0.624078793 1
# 2: 2 a 0.687361541 1
# 3: 3 a 0.817702740 1
# 4: 4 a 0.669857208 1
# 5: 5 a 0.100977936 1
# 6: 6 b 0.418275823 2
# 7: 7 b 0.660119857 2
# 8: 8 b 0.876015209 2
# 9: 9 b 0.473562143 2
# 10: 10 b 0.284474633 2
# 11: 11 a 0.034154862 3
# 12: 12 a 0.391760387 3
# 13: 13 a 0.383107868 3
# 14: 14 a 0.729583433 3
# 15: 15 a 0.006288375 3
# 16: 16 b 0.530179235 4
# 17: 17 b 0.802643704 4
# 18: 18 b 0.409618633 4
# 19: 19 b 0.309363642 4
# 20: 20 b 0.021918512 4
If you insist on using the tidyverse/dplyr, you can (of course) still use the rleid() function as follows:
df %>% mutate( groupid = data.table::rleid(type) )
Benchmarks on a larger sample:
library(tidyverse)
library(data.table)
# Create some large test data
df <- tibble(
  id = 1:200000,
  type = sample(letters[1:26], 200000, replace = TRUE),
  val = runif(200000)
)
dt <- as.data.table(df)
microbenchmark::microbenchmark(
  dplyr.rleid = df %>% mutate(groupid = data.table::rleid(type)),
  data.table.rleid = dt[, groupid := rleid(type)][],
  rle = df %>% mutate(ID_rleid = {ID_rleid = rle(type); rep(seq_along(ID_rleid$lengths), ID_rleid$lengths)}),
  rle2 = df %>% mutate(ID_rleid = with(rle(type), rep(seq_along(lengths), lengths))),
  transform = transform(df, ID = with(rle(df$type), rep(seq_along(lengths), lengths))),
  times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# dplyr.rleid 3.153626 3.278049 3.410363 3.444949 3.502792 3.582626 10
# data.table.rleid 2.965639 3.065959 3.173992 3.145643 3.259672 3.507009 10
# rle 13.059774 14.042797 24.364176 26.126176 29.460561 36.874054 10
# rle2 12.641319 13.553846 30.951152 24.698338 34.139786 102.791719 10
# transform 12.330717 22.419128 22.725242 25.532084 26.187634 26.702794 10
You can use a rleid()-like possibility like this:
df %>%
  mutate(ID_rleid = {ID_rleid = rle(type); rep(seq_along(ID_rleid$lengths), ID_rleid$lengths)})
id type val ID_rleid
<int> <chr> <dbl> <int>
1 1 a 0.0430 1
2 2 a 0.858 1
3 3 a 0.504 1
4 4 a 0.318 1
5 5 a 0.469 1
6 6 b 0.144 2
7 7 b 0.173 2
8 8 b 0.0706 2
9 9 b 0.958 2
10 10 b 0.557 2
11 11 a 0.358 3
12 12 a 0.973 3
13 13 a 0.982 3
14 14 a 0.177 3
15 15 a 0.599 3
16 16 b 0.627 4
17 17 b 0.454 4
18 18 b 0.682 4
19 19 b 0.690 4
20 20 b 0.713 4
Or a modification (originally proposed by @d.b) that makes it more handy:
df %>%
  mutate(ID_rleid = with(rle(type), rep(seq_along(lengths), lengths)))
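As for the endgame mentioned in the question (manipulating val based on the previous consec_group's values), one hedged sketch, using a made-up "scale by the previous group's mean" rule as a stand-in for the real calculation:
library(dplyr)

grouped <- df %>%
  mutate(consec_group = with(rle(type), rep(seq_along(lengths), lengths)))

# one summary row per group, shifted so each group sees the previous group's value
prev_stats <- grouped %>%
  group_by(consec_group) %>%
  summarise(prev_mean = mean(val)) %>%
  mutate(prev_mean = lag(prev_mean))

grouped %>%
  left_join(prev_stats, by = "consec_group") %>%
  mutate(val_scaled = val / prev_mean)  # hypothetical manipulation; NA for the first group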