Excluding outliers in ntile() - r

I am trying to assign quantile groups to stacked data such that, for each category (r1 and r2 in my example), I can classify the values into 5 groups. I can manage to do this using ntile() as follows.
library(dplyr)
library(reshape2)

r1 <- rnorm(10, 0, 1)
r2 <- rnorm(10, 2, 4)
df <- cbind(r1, r2)
df <- melt(df)  # long format: Var1, Var2, value
df <- df %>% group_by(Var2) %>% mutate(group = ntile(value, 5))
However, what should I do if I want to exclude the top and bottom 10% when forming the groups? Ideally, I would like to keep those top and bottom values in the output table with their group code showing as NA.
Thanks to anyone who can help!

Your question is a little ambiguous. It is not clear whether you wish to exclude the top and bottom 10% from the quintile calculation (so that you are getting equal quintiles of the 10-90th centiles of the original data), or whether you want to do the quintiles first on all the data, then exclude the first and last 10%. Doing it the second way will give you smaller 1st and 5th quintiles, so I assume you mean the first method:
df %>%
  group_by(Var2) %>%
  # rank into deciles first so the top and bottom 10% can be identified
  mutate(group = ntile(value, 10)) %>%
  # blank out deciles 1 and 10, then recompute quintiles on what remains
  mutate(group = ntile(ifelse(group %% 9 == 1, NA, value), 5))
#> # A tibble: 20 x 4
#> # Groups: Var2 [2]
#> Var1 Var2 value group
#> <int> <fct> <dbl> <int>
#> 1 1 r1 -0.626 1
#> 2 2 r1 0.184 2
#> 3 3 r1 -0.836 NA
#> 4 4 r1 1.60 NA
#> 5 5 r1 0.330 3
#> 6 6 r1 -0.820 1
#> 7 7 r1 0.487 3
#> 8 8 r1 0.738 5
#> 9 9 r1 0.576 4
#> 10 10 r1 -0.305 2
#> 11 1 r2 8.05 NA
#> 12 2 r2 3.56 2
#> 13 3 r2 -0.485 1
#> 14 4 r2 -6.86 NA
#> 15 5 r2 6.50 5
#> 16 6 r2 1.82 1
#> 17 7 r2 1.94 2
#> 18 8 r2 5.78 4
#> 19 9 r2 5.28 3
#> 20 10 r2 4.38 3
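The group %% 9 == 1 test picks out exactly the bottom and top deciles, because 1 and 10 are the only values in 1:10 that leave remainder 1 when divided by 9:
(1:10) %% 9 == 1
#>  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE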
Just in case, here is how you would achieve the second method:
df %>%
  group_by(Var2) %>%
  # quintiles on all of the data first
  mutate(group = ntile(value, 5)) %>%
  # then blank out the rows falling in the top or bottom decile
  mutate(group = ifelse(ntile(value, 10) %% 9 == 1, NA, group))
#> # A tibble: 20 x 4
#> # Groups: Var2 [2]
#> Var1 Var2 value group
#> <int> <fct> <dbl> <int>
#> 1 1 r1 -0.626 2
#> 2 2 r1 0.184 3
#> 3 3 r1 -0.836 NA
#> 4 4 r1 1.60 NA
#> 5 5 r1 0.330 3
#> 6 6 r1 -0.820 1
#> 7 7 r1 0.487 4
#> 8 8 r1 0.738 5
#> 9 9 r1 0.576 4
#> 10 10 r1 -0.305 2
#> 11 1 r2 8.05 NA
#> 12 2 r2 3.56 3
#> 13 3 r2 -0.485 1
#> 14 4 r2 -6.86 NA
#> 15 5 r2 6.50 5
#> 16 6 r2 1.82 2
#> 17 7 r2 1.94 2
#> 18 8 r2 5.78 4
#> 19 9 r2 5.28 4
#> 20 10 r2 4.38 3
Created on 2022-02-19 by the reprex package (v2.0.1)
Setup and data used
library(dplyr)
library(reshape2)
set.seed(1)
r1 <- rnorm(10,0,1)
r2 <- rnorm(10,2,4)
df <- cbind(r1,r2)
df <- melt(df)

Related

Flag run-length of grouped intervals

I have a dataframe grouped by grp:
df <- data.frame(
v = rnorm(25),
grp = c(rep("A",10), rep("B",15)),
size = 2)
I want to flag the run-length of intervals determined by size. For example, for grp == "A", size is 2, and the number of rows is 10. So the interval should have length 10/2 = 5. This code, however, creates intervals with length 2:
df %>%
group_by(grp) %>%
mutate(
interval = (row_number() -1) %/% size)
# A tibble: 25 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <dbl>
1 -0.166 A 2 0
2 -1.12 A 2 0
3 0.941 A 2 1
4 -0.913 A 2 1
5 0.486 A 2 2
6 -1.80 A 2 2
7 -0.370 A 2 3
8 -0.209 A 2 3
9 -0.661 A 2 4
10 -0.177 A 2 4
# … with 15 more rows
How can I flag the correct run-length of the size-determined intervals? The desired output is this:
# A tibble: 25 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <dbl>
1 -0.166 A 2 0
2 -1.12 A 2 0
3 0.941 A 2 0
4 -0.913 A 2 0
5 0.486 A 2 0
6 -1.80 A 2 1
7 -0.370 A 2 1
8 -0.209 A 2 1
9 -0.661 A 2 1
10 -0.177 A 2 1
# … with 15 more rows
If I interpreted your question correctly, this small change should do the trick?
df %>%
  group_by(grp) %>%
  mutate(
    # zero-based interval index: n() / size rows per interval
    interval = (row_number() - 1) %/% (n() / size))
You can use gl:
df %>%
group_by(grp) %>%
mutate(interval = gl(first(size), ceiling(n() / first(size)))[1:n()])
Output
# A tibble: 26 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <fct>
1 -1.12 A 2 1
2 3.04 A 2 1
3 0.235 A 2 1
4 -0.0333 A 2 1
5 -2.73 A 2 1
6 -0.0998 A 2 1
7 0.976 A 2 2
8 0.414 A 2 2
9 0.912 A 2 2
10 1.98 A 2 2
11 1.17 A 2 2
12 -0.509 B 2 1
13 0.704 B 2 1
14 -0.198 B 2 1
15 -0.538 B 2 1
16 -2.86 B 2 1
17 -0.790 B 2 1
18 0.488 B 2 1
19 2.17 B 2 1
20 0.501 B 2 2
21 0.620 B 2 2
22 -0.966 B 2 2
23 0.163 B 2 2
24 -2.08 B 2 2
25 0.485 B 2 2
26 0.697 B 2 2
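For reference, gl(n, k) generates a factor with n levels, each repeated k times; the [1:n()] subset trims the excess when the group size is not an exact multiple of size:
gl(2, 5)
#>  [1] 1 1 1 1 1 2 2 2 2 2
#> Levels: 1 2
gl(2, ceiling(15 / 2))[1:15]
#>  [1] 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
#> Levels: 1 2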

bind_rows of multiple tibbles in 2 named lists using imap: how to subset properly

This post seems relevant/similar but not quite the same issue: `dplyr::bind_rows` not working while combining listed tibbles
I'm trying to bind the rows of multiple tibbles in 2 named lists. I've been using imap to iterate across list elements because it retains the names (map doesn't, it seems). However, bind_rows doesn't seem to work here for some reason: it creates weird column names. It only seems to work with an unnamed list, but then I lose the names.
library(tidyverse)
# named lists
list1 <- list(
tibble(a = 1:4, b = LETTERS[1:4]),
tibble(a = 5:8, b = LETTERS[5:8]),
tibble(a = 9:12, b = LETTERS[9:12])
) %>% set_names(c("X","Y","Z"))
list2 <- list(
tibble(a = 3:6, b = LETTERS[3:6], c = 3:6 / 2),
tibble(a = 7:10, b = LETTERS[7:10], c = 7:10 / 2),
tibble(a = 11:14, b = LETTERS[11:14], c = 11:14 / 2)
) %>% set_names(c("X","Y","Z"))
# subsetting a named list with bind_rows creates weird column names
list1 %>% imap(~ {
.x %>% bind_rows(list2[.y])
})
#> $X
#> # A tibble: 8 × 3
#> a b X$a $b $c
#> <int> <chr> <int> <chr> <dbl>
#> 1 1 A NA NA NA
#> 2 2 B NA NA NA
#> 3 3 C NA NA NA
#> 4 4 D NA NA NA
#> 5 NA NA 3 C 1.5
#> 6 NA NA 4 D 2
#> 7 NA NA 5 E 2.5
#> 8 NA NA 6 F 3
#>
#> $Y
#> # A tibble: 8 × 3
#> a b Y$a $b $c
#> <int> <chr> <int> <chr> <dbl>
#> 1 5 E NA NA NA
#> 2 6 F NA NA NA
#> 3 7 G NA NA NA
#> 4 8 H NA NA NA
#> 5 NA NA 7 G 3.5
#> 6 NA NA 8 H 4
#> 7 NA NA 9 I 4.5
#> 8 NA NA 10 J 5
#>
#> $Z
#> # A tibble: 8 × 3
#> a b Z$a $b $c
#> <int> <chr> <int> <chr> <dbl>
#> 1 9 I NA NA NA
#> 2 10 J NA NA NA
#> 3 11 K NA NA NA
#> 4 12 L NA NA NA
#> 5 NA NA 11 K 5.5
#> 6 NA NA 12 L 6
#> 7 NA NA 13 M 6.5
#> 8 NA NA 14 N 7
# subsetting unnamed lists works but loses names
list1 <- list1 %>% unname()
list2 <- list2 %>% unname()
list1 %>% imap(~ {
.x %>% bind_rows(list2[.y])
})
#> [[1]]
#> # A tibble: 8 × 3
#> a b c
#> <int> <chr> <dbl>
#> 1 1 A NA
#> 2 2 B NA
#> 3 3 C NA
#> 4 4 D NA
#> 5 3 C 1.5
#> 6 4 D 2
#> 7 5 E 2.5
#> 8 6 F 3
#>
#> [[2]]
#> # A tibble: 8 × 3
#> a b c
#> <int> <chr> <dbl>
#> 1 5 E NA
#> 2 6 F NA
#> 3 7 G NA
#> 4 8 H NA
#> 5 7 G 3.5
#> 6 8 H 4
#> 7 9 I 4.5
#> 8 10 J 5
#>
#> [[3]]
#> # A tibble: 8 × 3
#> a b c
#> <int> <chr> <dbl>
#> 1 9 I NA
#> 2 10 J NA
#> 3 11 K NA
#> 4 12 L NA
#> 5 11 K 5.5
#> 6 12 L 6
#> 7 13 M 6.5
#> 8 14 N 7
You should use double brackets [[ to extract an element of a list. What you get with a single bracket [ is still a list. You can check the difference between list2["X"] and list2[["X"]].
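For instance:
class(list2["X"])
#> [1] "list"
class(list2[["X"]])
#> [1] "tbl_df" "tbl" "data.frame"
A one-element named list evidently looks to bind_rows like a data frame whose single column X is itself a data frame, which is where the packed X$a $b $c headers above come from. Subsetting with [[ fixes it: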
list1 %>% imap(~ {
.x %>% bind_rows(list2[[.y]])
})
Output
$X
# A tibble: 8 × 3
a b c
<int> <chr> <dbl>
1 1 A NA
2 2 B NA
3 3 C NA
4 4 D NA
5 3 C 1.5
6 4 D 2
7 5 E 2.5
8 6 F 3
$Y
# A tibble: 8 × 3
a b c
<int> <chr> <dbl>
1 5 E NA
2 6 F NA
3 7 G NA
4 8 H NA
5 7 G 3.5
6 8 H 4
7 9 I 4.5
8 10 J 5
$Z
# A tibble: 8 × 3
a b c
<int> <chr> <dbl>
1 9 I NA
2 10 J NA
3 11 K NA
4 12 L NA
5 11 K 5.5
6 12 L 6
7 13 M 6.5
8 14 N 7
Another possible solution, based on purrr::map2:
library(tidyverse)
map2(list1, list2, bind_rows)
#> $X
#> # A tibble: 8 x 3
#> a b c
#> <int> <chr> <dbl>
#> 1 1 A NA
#> 2 2 B NA
#> 3 3 C NA
#> 4 4 D NA
#> 5 3 C 1.5
#> 6 4 D 2
#> 7 5 E 2.5
#> 8 6 F 3
#>
#> $Y
#> # A tibble: 8 x 3
#> a b c
#> <int> <chr> <dbl>
#> 1 5 E NA
#> 2 6 F NA
#> 3 7 G NA
#> 4 8 H NA
#> 5 7 G 3.5
#> 6 8 H 4
#> 7 9 I 4.5
#> 8 10 J 5
#>
#> $Z
#> # A tibble: 8 x 3
#> a b c
#> <int> <chr> <dbl>
#> 1 9 I NA
#> 2 10 J NA
#> 3 11 K NA
#> 4 12 L NA
#> 5 11 K 5.5
#> 6 12 L 6
#> 7 13 M 6.5
#> 8 14 N 7
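Note that map2 pairs the two lists by position and keeps the names of the first list. If the lists might not be in the same order, you could align them by name first (a sketch):
map2(list1, list2[names(list1)], bind_rows)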

How can I add a row vector to a tibble in R?

I have a tibble in R that has 11 observations, one for every month apart from June, which has 0.
My data frame (tibble) looks like this :
library(tidyverse)
A = c(1,2,3,4,5,7,8,9,10,11,12)
B = rnorm(11,0,1)
Data = tibble(A,B);Data
But I want to add the 0 observation for June to this time series.
Something like :
d = c(6,0);d
newdata = rbind(Data,d)
order(newdata$A)
but the 12 (December) appears. Any help?
Two approaches:
(1) We can use add_row for this. However, d must be named, and we need to splice it into add_row with the triple-bang !!! operator. Then we can arrange the data so that the months are sorted from 1 to 12. Of course, you can specify the row directly in add_row, as in @Chris's answer, without the need for an external vector.
library(dplyr)
A = c(1,2,3,4,5,7,8,9,10,11,12)
B = rnorm(11,0,1)
Data = tibble(A,B)
d = c(A = 6, B = 0)
newdata <- Data %>%
add_row(!!! d) %>%
arrange(A)
# check
newdata
#> # A tibble: 12 x 2
#> A B
#> <dbl> <dbl>
#> 1 1 1.22
#> 2 2 0.0729
#> 3 3 0.597
#> 4 4 -1.26
#> 5 5 0.928
#> 6 6 0
#> 7 7 -1.08
#> 8 8 0.704
#> 9 9 -0.119
#> 10 10 -0.462
#> 11 11 -0.00388
#> 12 12 1.56
order(newdata$A)
#> [1] 1 2 3 4 5 6 7 8 9 10 11 12
(2) We can use tidyr::complete, as suggested by @Ronak in the comments, although we use a slightly different specification with full_seq:
library(tidyr)
Data %>%
complete(A = full_seq(A, 1), fill = list(B = 0))
#> # A tibble: 12 x 2
#> A B
#> <dbl> <dbl>
#> 1 1 -0.258
#> 2 2 -1.18
#> 3 3 -0.165
#> 4 4 0.775
#> 5 5 0.926
#> 6 6 0
#> 7 7 0.343
#> 8 8 1.10
#> 9 9 0.359
#> 10 10 0.934
#> 11 11 -0.444
#> 12 12 0.184
Created on 2021-09-21 by the reprex package (v2.0.1)
You can define the additional row in add_row:
library(dplyr)
Data %>%
add_row(A = 6, B = 0) %>%
arrange(A)
# A tibble: 12 x 2
A B
<dbl> <dbl>
1 1 -0.547
2 2 -0.564
3 3 -0.890
4 4 -0.477
5 5 -0.998
6 6 0
7 7 -0.776
8 8 0.0645
9 9 0.959
10 10 -0.110
11 11 -0.511
12 12 -0.911
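As an aside, add_row can also place the new row at a given position via its .before argument, which avoids the arrange step (a sketch, relying on the existing rows already being in month order):
Data %>%
add_row(A = 6, B = 0, .before = 6)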

Dividing selected columns by vector in dplyr

This has to be simple in base R, but it is driving me crazy with dplyr (which overall has made my life much better!).
Suppose you have the following tibbles
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df1 <- tibble(x=seq(5)*19, a1=seq(5)*1, a2=seq(5)*2, a3=seq(5)*4)
df1
#> # A tibble: 5 x 4
#> x a1 a2 a3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 19 1 2 4
#> 2 38 2 4 8
#> 3 57 3 6 12
#> 4 76 4 8 16
#> 5 95 5 10 20
df2 <- tibble(b1=3, b2=0.5, b3=10)
df2
#> # A tibble: 1 x 3
#> b1 b2 b3
#> <dbl> <dbl> <dbl>
#> 1 3 0.5 10
Created on 2020-06-11 by the reprex package (v0.3.0)
Then I simply want to replace, in df1, a1 with a1/b1, a2 with a2/b2, and so on.
This has to be general enough to handle the case when I have many columns.
Any suggestion is appreciated.
You can use Map:
df1[-1] <- Map(`/`, df1[-1], df2)
# A tibble: 5 x 4
# x a1 a2 a3
# <dbl> <dbl> <dbl> <dbl>
#1 19 0.333 4 0.4
#2 38 0.667 8 0.8
#3 57 1 12 1.2
#4 76 1.33 16 1.6
#5 95 1.67 20 2
Or if you want a tidyverse solution, you can use map2 from purrr:
df1[-1] <- purrr::map2(df1[-1], df2, `/`)
You can use rowwise() with c_across()
df1 %>%
rowwise() %>%
mutate(c_across(a1:a3) / df2, .keep = "unused") %>%
ungroup()
# # A tibble: 5 x 4
# x b1 b2 b3
# <dbl> <dbl> <dbl> <dbl>
# 1 19 0.333 4 0.4
# 2 38 0.667 8 0.8
# 3 57 1 12 1.2
# 4 76 1.33 16 1.6
# 5 95 1.67 20 2
Another base R option
df1[-1] <- t(t(df1[-1]) / unlist(df2))
df1
# # A tibble: 5 x 4
# x a1 a2 a3
# <dbl> <dbl> <dbl> <dbl>
# 1 19 0.333 4 0.4
# 2 38 0.667 8 0.8
# 3 57 1 12 1.2
# 4 76 1.33 16 1.6
# 5 95 1.67 20 2
One solution could be:
bind_cols(select(df1, x),
sweep(select(df1, -x), 2, FUN = `/`, unlist(df2)))
x a1 a2 a3
<dbl> <dbl> <dbl> <dbl>
1 19 0.333 4 0.4
2 38 0.667 8 0.8
3 57 1 12 1.2
4 76 1.33 16 1.6
5 95 1.67 20 2
Or like this if you have more columns:
df1[,2:4] <- df1[,2:4] / df2 %>% slice(rep(1:n(), each = nrow(df1)))
# A tibble: 5 x 4
x a1 a2 a3
<dbl> <dbl> <dbl> <dbl>
1 19 0.333 4 0.4
2 38 0.667 8 0.8
3 57 1 12 1.2
4 76 1.33 16 1.6
5 95 1.67 20 2
Another option considers the column names of your variables and pairs each one with the number it has to be divided by. The function cur_column() comes in handy inside mutate(across()), the function you wanted to use:
# list of divisors, in the same order as the a-columns
l <- as.list(as.numeric(df2[1, ]))
df1 %>%
  mutate(across(starts_with("a"),
                # extract the digit from the column name ("a2" -> 2)
                # and use it to index the matching divisor in l
                ~ . / l[[na.omit(as.numeric(unlist(strsplit(cur_column(), "[^[:digit:]]"))))]]))
Output
# A tibble: 5 x 4
# x a1 a2 a3
# <dbl> <dbl> <dbl> <dbl>
# 1 19 0.333 4 0.4
# 2 38 0.667 8 0.8
# 3 57 1 12 1.2
# 4 76 1.33 16 1.6
# 5 95 1.67 20 2
An option with base R, indexing the divisors by column position:
df1[-1] <- df1[-1] / unlist(df2)[col(df1[-1])]
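If the a/b suffixes always line up, a name-based variant of the across idea is also possible (a sketch; it assumes every a-column in df1 has a matching b-column in df2):
df1 %>%
mutate(across(starts_with("a"), ~ .x / df2[[sub("^a", "b", cur_column())]]))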

Grouping by consecutive value occurrences

I came across a problem that forced me to use a loop instead of my preferred dplyr pipe flow.
I want to group rows based on consecutive observations of the same value.
For example, if the first four observations of type equal a, the first four observations should assigned to the same group. Order matters, so I can't dplyr::group_by and dplyr::summarize.
The code below should explain the problem fairly well. I was wondering if anyone could propose a less verbose way to do this, preferably using tidyverse packages, and not data.tables.
library(tidyverse)
# Create some test data
df <- tibble(
id = 1:20,
type = c(rep("a", 5), rep("b", 5), rep("a", 5), rep("b", 5)),
val = runif(20)
)
df
#> # A tibble: 20 x 3
#> id type val
#> <int> <chr> <dbl>
#> 1 1 a 0.0606
#> 2 2 a 0.501
#> 3 3 a 0.974
#> 4 4 a 0.0833
#> 5 5 a 0.752
#> 6 6 b 0.0450
#> 7 7 b 0.367
#> 8 8 b 0.649
#> 9 9 b 0.846
#> 10 10 b 0.896
#> 11 11 a 0.178
#> 12 12 a 0.295
#> 13 13 a 0.206
#> 14 14 a 0.233
#> 15 15 a 0.851
#> 16 16 b 0.179
#> 17 17 b 0.801
#> 18 18 b 0.326
#> 19 19 b 0.269
#> 20 20 b 0.584
# Solve problem with a loop
count <- 1
df$consec_group <- NA
for (i in 1:nrow(df)) {
current <- df$type[i]
lag <- ifelse(i == 1, NA, df$type[i - 1])
lead <- ifelse(i == nrow(df), NA, df$type[i + 1])
if (lead %>% is.na) {
df$consec_group[i] <- ifelse(current == lag, count, count + 1)
} else {
df$consec_group[i] <- count
if (current != lead) count <- count + 1
}
}
df
#> # A tibble: 20 x 4
#> id type val consec_group
#> <int> <chr> <dbl> <dbl>
#> 1 1 a 0.0606 1
#> 2 2 a 0.501 1
#> 3 3 a 0.974 1
#> 4 4 a 0.0833 1
#> 5 5 a 0.752 1
#> 6 6 b 0.0450 2
#> 7 7 b 0.367 2
#> 8 8 b 0.649 2
#> 9 9 b 0.846 2
#> 10 10 b 0.896 2
#> 11 11 a 0.178 3
#> 12 12 a 0.295 3
#> 13 13 a 0.206 3
#> 14 14 a 0.233 3
#> 15 15 a 0.851 3
#> 16 16 b 0.179 4
#> 17 17 b 0.801 4
#> 18 18 b 0.326 4
#> 19 19 b 0.269 4
#> 20 20 b 0.584 4
Created on 2019-03-14 by the reprex package (v0.2.1)
This grouping of consecutive type occurrences is really just an intermediate step. My endgame is to manipulate val for a given consec_group, based on the values of val that occurred within the previous consec_group. Advice on relevant packages would be appreciated.
You say "no data.tables", but are you sure? It's so *** fast and easy (in this case)...
library(data.table)
setDT(df)[, groupid := rleid(type)][]
# id type val groupid
# 1: 1 a 0.624078793 1
# 2: 2 a 0.687361541 1
# 3: 3 a 0.817702740 1
# 4: 4 a 0.669857208 1
# 5: 5 a 0.100977936 1
# 6: 6 b 0.418275823 2
# 7: 7 b 0.660119857 2
# 8: 8 b 0.876015209 2
# 9: 9 b 0.473562143 2
# 10: 10 b 0.284474633 2
# 11: 11 a 0.034154862 3
# 12: 12 a 0.391760387 3
# 13: 13 a 0.383107868 3
# 14: 14 a 0.729583433 3
# 15: 15 a 0.006288375 3
# 16: 16 b 0.530179235 4
# 17: 17 b 0.802643704 4
# 18: 18 b 0.409618633 4
# 19: 19 b 0.309363642 4
# 20: 20 b 0.021918512 4
If you insist on using the tidyverse/dplyr, you can (of course) still use the rleid function as follows:
df %>% mutate( groupid = data.table::rleid(type) )
Benchmarks on a larger sample:
library(tidyverse)
library(data.table)
# Create some large test data
df <- tibble(
id = 1:200000,
type = sample(letters[1:26], 200000, replace = TRUE),
val = runif(200000)
)
dt <- as.data.table(df)
microbenchmark::microbenchmark(
dplyr.rleid = df %>% mutate( groupid = data.table::rleid(type) ),
data.table.rleid = dt[, groupid := rleid(type)][],
rle = df %>% mutate(ID_rleid = {ID_rleid = rle(type); rep(seq_along(ID_rleid$lengths), ID_rleid$lengths)}),
rle2 = df %>% mutate(ID_rleid = with(rle(type), rep(seq_along(lengths), lengths))),
transform = transform(df, ID = with(rle(df$type), rep(seq_along(lengths), lengths))),
times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# dplyr.rleid 3.153626 3.278049 3.410363 3.444949 3.502792 3.582626 10
# data.table.rleid 2.965639 3.065959 3.173992 3.145643 3.259672 3.507009 10
# rle 13.059774 14.042797 24.364176 26.126176 29.460561 36.874054 10
# rle2 12.641319 13.553846 30.951152 24.698338 34.139786 102.791719 10
# transform 12.330717 22.419128 22.725242 25.532084 26.187634 26.702794 10
You can build a rleid()-like group id with base R's rle() like this:
df %>%
mutate(ID_rleid = {ID_rleid = rle(type); rep(seq_along(ID_rleid$lengths), ID_rleid$lengths)})
id type val ID_rleid
<int> <chr> <dbl> <int>
1 1 a 0.0430 1
2 2 a 0.858 1
3 3 a 0.504 1
4 4 a 0.318 1
5 5 a 0.469 1
6 6 b 0.144 2
7 7 b 0.173 2
8 8 b 0.0706 2
9 9 b 0.958 2
10 10 b 0.557 2
11 11 a 0.358 3
12 12 a 0.973 3
13 13 a 0.982 3
14 14 a 0.177 3
15 15 a 0.599 3
16 16 b 0.627 4
17 17 b 0.454 4
18 18 b 0.682 4
19 19 b 0.690 4
20 20 b 0.713 4
Or a modification (originally proposed by @d.b) that makes it handier:
df %>%
mutate(ID_rleid = with(rle(type), rep(seq_along(lengths), lengths)))
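As for the asker's endgame of manipulating val based on the values in the previous consec_group: once the group id exists, one way is to summarise per group, lag the summary, and join it back. A sketch, assuming (purely for illustration) that each group's val should be scaled by the previous group's mean:
library(dplyr)
grouped <- df %>%
mutate(consec_group = with(rle(type), rep(seq_along(lengths), lengths)))
group_stats <- grouped %>%
group_by(consec_group) %>%
summarise(prev_group_mean = mean(val)) %>%
mutate(prev_group_mean = lag(prev_group_mean))
grouped %>%
left_join(group_stats, by = "consec_group") %>%
mutate(val_scaled = val / prev_group_mean) # NA for the first group, by construction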
