Assign ID to consecutive groups column r - r

I would like to produce a column in a data.frame that counts the consecutive id of the groups (s column in dummy df)
dummy_df = data.frame(s = c("a", "a", "b","b", "b", "c","c", "a", "a", "c", "c","a","a"),
desired_output= c(1,1,1,1,1,1,1,2,2,2,2,3,3))
dummy_df$rleid_output = rleid(dummy_df$s)
dummy_df
s desired_output rleid_output
1 a 1 1
2 a 1 1
3 b 1 2
4 b 1 2
5 b 1 2
6 c 1 3
7 c 1 3
8 a 2 4
9 a 2 4
10 c 2 5
11 c 2 5
12 a 3 6
13 a 3 6
I would say it's similar to what rleid() does but restarting the counting when a new group is seen. However, I can't find a way to do it in such straight way. Thanks.

You can do:
dummy_df$out <- with(rle(dummy_df$s), rep(ave(lengths, values, FUN = seq_along), lengths))
Result:
s desired_output out
1 a 1 1
2 a 1 1
3 b 1 1
4 b 1 1
5 b 1 1
6 c 1 1
7 c 1 1
8 a 2 2
9 a 2 2
10 c 2 2
11 c 2 2
12 a 3 3
13 a 3 3

If you are willing to use data.table (rleid is part of the package), you can do it in two steps as follows:
library(data.table)
dummy_df = data.frame(s = c("a", "a", "b", "b", "b", "c", "c", "a", "a", "c", "c", "a", "a"))
# cast data.frame to data.table
setDT(dummy_df)
# create auxiliary variable
dummy_df[, rleid_output := rleid(s)]
# obtain desired output
dummy_df[, desired_output := rleid(rleid_output), by = "s"]
# end result
dummy_df
#> s rleid_output desired_output
#> 1: a 1 1
#> 2: a 1 1
#> 3: b 2 1
#> 4: b 2 1
#> 5: b 2 1
#> 6: c 3 1
#> 7: c 3 1
#> 8: a 4 2
#> 9: a 4 2
#> 10: c 5 2
#> 11: c 5 2
#> 12: a 6 3
#> 13: a 6 3
Created on 2020-10-16 by the reprex package (v0.3.0)

you can try a tidyverse in combination with the base R rle function
library(tidyverse)
rle(dummy_df$s) %>%
with(., data.frame(a=.$length, b=.$value)) %>%
group_by(b) %>%
mutate(n = 1:n()) %>%
with(., rep(n, times=a)) %>%
bind_cols(dummy_df, res=.)
s desired_output res
1 a 1 1
2 a 1 1
3 b 1 1
4 b 1 1
5 b 1 1
6 c 1 1
7 c 1 1
8 a 2 2
9 a 2 2
10 c 2 2
11 c 2 2
12 a 3 3
13 a 3 3

Related

filling NA with values from another table

I have the following datasets in RStudio:
df =
a b
1 A
1 NA
1 A
1 NA
2 C
2 NA
2 B
3 A
3 NA
3 C
3 D
and fill_with =
a b
1 A
2 B
3 C
How do I fill the NA values in df in the b column according to the a column?
Ex: a=1, b=NA, then I look at the table fill_with at a=1, and I see that I should fill it with b=A.
In the end it should look the following way:
df =
a b
1 A
1 A
1 A
1 A
2 C
2 B
2 B
3 A
3 C
3 C
3 D
We can use ifelse
df$b <- ifelse(is.na(df$b) ,
fill_with$b[match(df$a , fill_with$a)] , df$b)
Output
a b
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
library(tidyverse)
df <- read_table("a b
1 A
1 NA
1 A
1 NA
2 C
2 NA
2 B
3 A
3 NA
3 C
3 D")
df %>%
group_by(a) %>%
fill(b, .direction = "updown")
# A tibble: 11 x 2
# Groups: a [3]
a b
<dbl> <chr>
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
Base R
tmp=which(is.na(df$b))
df$b[tmp]=fill_with$b[match(df$a,fill_with$a)[tmp]]
a b
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
library(tidyverse)
df <- data.frame(
stringsAsFactors = FALSE,
a = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3),
b = c("A", NA, "A", NA, "C", NA, "B", "A", NA, "C", "D")
)
fill_with <- data.frame(
stringsAsFactors = FALSE,
a = c(1L, 2L, 3L),
b = c("A", "B", "C")
)
rows_update(x = df, y = fill_with, by = "a")
#> a b
#> 1 1 A
#> 2 1 A
#> 3 1 A
#> 4 1 A
#> 5 2 B
#> 6 2 B
#> 7 2 B
#> 8 3 C
#> 9 3 C
#> 10 3 C
#> 11 3 C
Created on 2022-08-22 with reprex v2.0.2

How to create unique data frame rows from inputs of 2 lists

In this example I have a list with 4 values (Lot) and another with 3 (Method).
Lot <- c("A", "B", "C", "D")
Method <- c(1,2,3)
I need to create a data frame with a Lot and Method column where their values are repeated so each row is unique. I need it to look like this:
# Lot Method
# 1 A 1
# 2 A 2
# 3 A 3
# 4 B 1
# 5 B 2
# 6 B 3
# 7 C 1
# 8 C 2
# 9 C 3
# 10 D 1
# 11 D 2
# 12 D 3
How can this be done without creating 2 long repetitive lists like this:
Lot <- c("A","A","A", "B","B","B","C","C","C","D","D","D")
Method <- c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3)
Using expand.grid you could do
expand.grid(
Lot = c("A", "B", "C", "D"),
Method = c(1,2,3)
)
#> Lot Method
#> 1 A 1
#> 2 B 1
#> 3 C 1
#> 4 D 1
#> 5 A 2
#> 6 B 2
#> 7 C 2
#> 8 D 2
#> 9 A 3
#> 10 B 3
#> 11 C 3
#> 12 D 3
Or to get the right order we could do (thanks to #onyambu for pointing that out):
rev(expand.grid(
Method = c(1,2,3),
Lot = c("A", "B", "C", "D")
))
#> Lot Method
#> 1 A 1
#> 5 A 2
#> 9 A 3
#> 2 B 1
#> 6 B 2
#> 10 B 3
#> 3 C 1
#> 7 C 2
#> 11 C 3
#> 4 D 1
#> 8 D 2
#> 12 D 3
Or using the tidyverse you could do:
tidyr::expand_grid(
Lot = c("A", "B", "C", "D"),
Method = c(1,2,3)
)
#> # A tibble: 12 × 2
#> Lot Method
#> <chr> <dbl>
#> 1 A 1
#> 2 A 2
#> 3 A 3
#> 4 B 1
#> 5 B 2
#> 6 B 3
#> 7 C 1
#> 8 C 2
#> 9 C 3
#> 10 D 1
#> 11 D 2
#> 12 D 3
With data.table, we can use CJ:
library(data.table)
CJ(Lot = c("A", "B", "C", "D"),
Method = c(1, 2, 3))
Output
Lot Method
1: A 1
2: A 2
3: A 3
4: B 1
5: B 2
6: B 3
7: C 1
8: C 2
9: C 3
10: D 1
11: D 2
12: D 3

Group by cumulative sums with conditions

In this dataframe:
df <- data.frame(
ID = c("C", "B", "B", "B", NA, "C", "A", NA, "B", "B", "B")
)
I'd like to group the rows using cumsum with two conditions: (i) cumsum should not continue if is.na(ID) and (ii) it should not continue if the next ID value is the same as the prior. I do meet condition (i) with this:
df %>%
group_by(grp = cumsum(!is.na(ID)))
# A tibble: 11 x 2
# Groups: grp [9]
ID grp
<chr> <int>
1 C 1
2 B 2
3 B 3
4 B 4
5 NA 4
6 C 5
7 A 6
8 NA 6
9 B 7
10 B 8
11 B 9
but I don't know how to implement condition (ii) too, to obtain the desired result:
1 C 1
2 B 2
3 B 2
4 B 2
5 NA 2
6 C 3
7 A 4
8 NA 4
9 B 5
10 B 5
11 B 5
I tried it with this but I doesn't work:
df %>%
group_by(grp = cumsum(!is.na(ID) |!lag(ID,1) == ID))
Use na.locf0 from zoo to fill in the NAs and then apply rleid from data.table:
library(data.table)
library(zoo)
rleid(na.locf0(df$ID))
## [1] 1 2 2 2 2 3 4 4 5 5 5
Using tidyr and dplyr, you could do:
df %>%
mutate(grp = fill(., ID) %>% pull(),
grp = cumsum(grp != lag(grp, default = first(grp))))
ID grp
1 C 0
2 B 1
3 B 1
4 B 1
5 <NA> 1
6 C 2
7 A 3
8 <NA> 3
9 B 4
10 B 4
11 B 4
Using rle
library(zoo)
with(rle(na.locf0(df$ID)), rep(seq_along(values), lengths))
#[1] 1 2 2 2 2 3 4 4 5 5 5

Expand dataframe by ID to generate a special column

I have the following dataframe
df<-data.frame("ID"=c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
'A_Frequency'=c(1,2,3,4,5,1,2,3,4,5),
'B_Frequency'=c(1,2,NA,4,6,1,2,5,6,7))
The dataframe appears as follows
ID A_Frequency B_Frequency
1 A 1 1
2 A 2 2
3 A 3 NA
4 A 4 4
5 A 5 6
6 B 1 1
7 B 2 2
8 B 3 5
9 B 4 6
10 B 5 7
I Wish to create a new dataframe df2 from df that looks as follows
ID CFreq
1 A 1
2 A 2
3 A 3
4 A 4
5 A 5
6 A 6
7 B 1
8 B 2
9 B 3
10 B 4
11 B 5
12 B 6
13 B 7
The new dataframe has a column CFreq that takes unique values from A_Frequency, B_Frequency and groups them by ID. Then it ignores the NA values and generates the CFreq column
I have tried dplyr but am unable to get the required response
df2<-df%>%group_by(ID)%>%select(ID, A_Frequency,B_Frequency)%>%
mutate(Cfreq=unique(A_Frequency, B_Frequency))
This yields the following which is quite different
ID A_Frequency B_Frequency Cfreq
<fct> <dbl> <dbl> <dbl>
1 A 1 1 1
2 A 2 2 2
3 A 3 NA 3
4 A 4 4 4
5 A 5 6 5
6 B 1 1 1
7 B 2 2 2
8 B 3 5 3
9 B 4 6 4
10 B 5 7 5
Request someone to help me here
gather function from tidyr package will be helpful here:
library(tidyverse)
df %>%
gather(x, CFreq, -ID) %>%
select(-x) %>%
na.omit() %>%
unique() %>%
arrange(ID, CFreq)
A different tidyverse possibility could be:
df %>%
nest(A_Frequency, B_Frequency, .key = C_Frequency) %>%
mutate(C_Frequency = map(C_Frequency, function(x) unique(x[!is.na(x)]))) %>%
unnest()
ID C_Frequency
1 A 1
2 A 2
3 A 3
4 A 4
5 A 5
9 A 6
10 B 1
11 B 2
12 B 3
13 B 4
14 B 5
18 B 6
19 B 7
Base R approach would be to split the dataframe based on ID and for every list we count the number of unique enteries and create a sequence based on that.
do.call(rbind, lapply(split(df, df$ID), function(x) data.frame(ID = x$ID[1] ,
CFreq = seq_len(length(unique(na.omit(unlist(x[-1]))))))))
# ID CFreq
#A.1 A 1
#A.2 A 2
#A.3 A 3
#A.4 A 4
#A.5 A 5
#A.6 A 6
#B.1 B 1
#B.2 B 2
#B.3 B 3
#B.4 B 4
#B.5 B 5
#B.6 B 6
#B.7 B 7
This will also work when A_Frequency B_Frequency has characters in them or some other random numbers instead of sequential numbers.
In tidyverse we can do
library(tidyverse)
df %>%
group_split(ID) %>%
map_dfr(~ data.frame(ID = .$ID[1],
CFreq= seq_len(length(unique(na.omit(flatten_chr(.[-1])))))))
A data.table option
library(data.table)
cols <- c('A_Frequency', 'B_Frequency')
out <- setDT(df)[, .(CFreq = sort(unique(unlist(.SD)))),
.SDcols = cols,
by = ID]
out
# ID CFreq
# 1: A 1
# 2: A 2
# 3: A 3
# 4: A 4
# 5: A 5
# 6: A 6
# 7: B 1
# 8: B 2
# 9: B 3
#10: B 4
#11: B 5
#12: B 6
#13: B 7

Conditional statement within group

I have a dataframe in which I want to make a new column with values based on condition within groups. So for the dataframe below, I want to make a new column n_actions which gives
Cond1. for the whole group GROUP the number 2 if a 6 appears in column STEP
Cond 2. for the whole group GROUP the number 3 if a 9 appears in column STEP
Cond 3. if not a 6 or 9 appears within column STEP for the GROUP, then 1
#dataframe start
dataframe <- data.frame(group = c("A", "A", "A", "B", "B", "B", "B", "B", "B", "C", "C", "C", "D", "D", "D", "D", "D", "D", "D", "D", "D"),
step = c(1, 2, 3, 1, 2, 3, 4, 5, 6, 1, 2, 3, 1, 2, 3, 4, 5, 6, 7, 8, 9))
# dataframe desired
dataframe$n_actions <- c(rep(1, 3), rep(2, 6,), rep(1, 3), rep(3, 9))
Try out:
library(dplyr)
dataframe %>%
group_by(group) %>%
mutate(n_actions = ifelse(9 %in% step, 3,
ifelse(6 %in% step, 2, 1)))
# A tibble: 21 x 3
# Groups: group [4]
group step n_actions
<fctr> <dbl> <dbl>
1 A 1 1
2 A 2 1
3 A 3 1
4 B 1 2
5 B 2 2
6 B 3 2
7 B 4 2
8 B 5 2
9 B 6 2
10 C 1 1
# ... with 11 more rows
Another way with dplyr's case_when:
library(dplyr)
dataframe %>%
group_by(group) %>%
mutate(
n_actions1 = case_when(
9 %in% step ~ 3,
6 %in% step ~ 2,
TRUE ~ 1
)
)
Output:
# A tibble: 21 x 3
# Groups: group [4]
group step n_actions
<fct> <dbl> <dbl>
1 A 1 1
2 A 2 1
3 A 3 1
4 B 1 2
5 B 2 2
6 B 3 2
7 B 4 2
8 B 5 2
9 B 6 2
10 C 1 1
11 C 2 1
12 C 3 1
13 D 1 3
14 D 2 3
15 D 3 3
16 D 4 3
17 D 5 3
18 D 6 3
19 D 7 3
20 D 8 3
21 D 9 3
You could divide the maximum value per group by %/% 3, it seems.
dataframe <- transform(dataframe,
n_actions2 = ave(step, group, FUN = function(x) max(x) %/% 3))
dataframe
# group step n_actions n_actions2
#1 A 1 1 1
#2 A 2 1 1
#3 A 3 1 1
#4 B 1 2 2
#5 B 2 2 2
#6 B 3 2 2
#7 B 4 2 2
#8 B 5 2 2
#9 B 6 2 2
#10 C 1 1 1
#11 C 2 1 1
#12 C 3 1 1
#13 D 1 3 3
#14 D 2 3 3
#15 D 3 3 3
#16 D 4 3 3
#17 D 5 3 3
#18 D 6 3 3
#19 D 7 3 3
#20 D 8 3 3
#21 D 9 3 3

Resources