Group by cumulative sums with conditions - r

In this dataframe:
df <- data.frame(
  ID = c("C", "B", "B", "B", NA, "C", "A", NA, "B", "B", "B")
)
I'd like to group the rows using cumsum with two conditions: (i) the cumulative sum should not advance if is.na(ID), and (ii) it should not advance if the ID value is the same as the prior one. I can meet condition (i) with this:
df %>%
  group_by(grp = cumsum(!is.na(ID)))
# A tibble: 11 x 2
# Groups: grp [9]
ID grp
<chr> <int>
1 C 1
2 B 2
3 B 3
4 B 4
5 NA 4
6 C 5
7 A 6
8 NA 6
9 B 7
10 B 8
11 B 9
but I don't know how to implement condition (ii) too, to obtain the desired result:
1 C 1
2 B 2
3 B 2
4 B 2
5 NA 2
6 C 3
7 A 4
8 NA 4
9 B 5
10 B 5
11 B 5
I tried this, but it doesn't work:
df %>%
  group_by(grp = cumsum(!is.na(ID) | !lag(ID, 1) == ID))

Use na.locf0 from zoo to fill in the NAs and then apply rleid from data.table:
library(data.table)
library(zoo)
rleid(na.locf0(df$ID))
## [1] 1 2 2 2 2 3 4 4 5 5 5
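If you prefer to keep everything in a dplyr pipeline, the same idea drops straight into group_by() (a minimal sketch, assuming the df from the question):
library(dplyr)
library(data.table)  # for rleid()
library(zoo)         # for na.locf0()
df %>%
  group_by(grp = rleid(na.locf0(ID)))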

Using tidyr and dplyr, you could do:
df %>%
  mutate(grp = fill(., ID) %>% pull(),
         grp = cumsum(grp != lag(grp, default = first(grp))))
ID grp
1 C 0
2 B 1
3 B 1
4 B 1
5 <NA> 1
6 C 2
7 A 3
8 <NA> 3
9 B 4
10 B 4
11 B 4
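Note that this numbering starts at 0 rather than 1; if you need it to match the desired output exactly, adding 1 to the cumulative sum is enough (a small tweak of the code above):
df %>%
  mutate(grp = fill(., ID) %>% pull(),
         grp = cumsum(grp != lag(grp, default = first(grp))) + 1)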

Using rle
library(zoo)
with(rle(na.locf0(df$ID)), rep(seq_along(values), lengths))
#[1] 1 2 2 2 2 3 4 4 5 5 5

Related

How can I stack my dataset so each observation relates to all other observations but itself?

I would like to stack my dataset so that every observation relates to all other observations except itself.
Suppose I have the following dataset:
df <- data.frame(id = c("a", "b", "c", "d"),
                 x1 = c(1, 2, 3, 4))
df
id x1
1 a 1
2 b 2
3 c 3
4 d 4
I would like observation a to be related to b, c, and d. And the same for every other observation. The result should look like something like this:
id x1 id2 x2
1 a 1 b 2
2 a 1 c 3
3 a 1 d 4
4 b 2 a 1
5 b 2 c 3
6 b 2 d 4
7 c 3 a 1
8 c 3 b 2
9 c 3 d 4
10 d 4 a 1
11 d 4 b 2
12 d 4 c 3
So observation a is related to b, c, and d; observation b is related to a, c, and d; and so on. Any ideas?
Another option:
library(dplyr)
left_join(df, df, by = character()) %>%
  filter(id.x != id.y)
Or
output <- merge(df, df, by = NULL)
output <- output[output$id.x != output$id.y, ]
Thanks @ritchie-sacramento, I didn't know about the by = NULL option for merge() before, and thanks @zephryl for the by = character() option for dplyr joins.
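In dplyr 1.1.0 and later, by = character() is superseded by an explicit cross join, which reads a bit more clearly (a sketch of the same idea):
library(dplyr)
cross_join(df, df) %>%
  filter(id.x != id.y)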
tidyr::expand_grid() accepts data frames, which can then be filtered to remove rows that share the id:
library(tidyr)
library(dplyr)
expand_grid(df, df, .name_repair = make.unique) %>%
  filter(id != id.1)
# A tibble: 12 × 4
id x1 id.1 x1.1
<chr> <dbl> <chr> <dbl>
1 a 1 b 2
2 a 1 c 3
3 a 1 d 4
4 b 2 a 1
5 b 2 c 3
6 b 2 d 4
7 c 3 a 1
8 c 3 b 2
9 c 3 d 4
10 d 4 a 1
11 d 4 b 2
12 d 4 c 3
You can use combn() to get all combinations of row indices, then assemble your dataframe from those (note that the rows come out in a different order than the desired output, so sort afterwards if the order matters):
rws <- cbind(combn(nrow(df), 2), combn(nrow(df), 2, rev))
df2 <- cbind(df[rws[1, ], ], df[rws[2, ], ])
# clean up row and column names
rownames(df2) <- 1:nrow(df2)
colnames(df2) <- c("id", "x1", "id2", "x2")
df2
id x1 id2 x2
1 a 1 b 2
2 a 1 c 3
3 a 1 d 4
4 b 2 c 3
5 b 2 d 4
6 c 3 d 4
7 b 2 a 1
8 c 3 a 1
9 d 4 a 1
10 c 3 b 2
11 d 4 b 2
12 d 4 c 3

Assign ID to consecutive groups column in R

I would like to produce a column in a data.frame that counts, for each group, how many runs of that group have been seen so far (the s column in the dummy df below; desired_output is what I want):
library(data.table)  # for rleid()
dummy_df <- data.frame(s = c("a", "a", "b", "b", "b", "c", "c", "a", "a", "c", "c", "a", "a"),
                       desired_output = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3))
dummy_df$rleid_output <- rleid(dummy_df$s)
dummy_df
s desired_output rleid_output
1 a 1 1
2 a 1 1
3 b 1 2
4 b 1 2
5 b 1 2
6 c 1 3
7 c 1 3
8 a 2 4
9 a 2 4
10 c 2 5
11 c 2 5
12 a 3 6
13 a 3 6
I would say it's similar to what rleid() does, but counting runs within each group value rather than globally. However, I can't find a straightforward way to do it. Thanks.
You can do:
dummy_df$out <- with(rle(dummy_df$s), rep(ave(lengths, values, FUN = seq_along), lengths))
Result:
s desired_output out
1 a 1 1
2 a 1 1
3 b 1 1
4 b 1 1
5 b 1 1
6 c 1 1
7 c 1 1
8 a 2 2
9 a 2 2
10 c 2 2
11 c 2 2
12 a 3 3
13 a 3 3
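To see how the one-liner works, it can help to look at its pieces separately (a sketch using the same dummy_df):
r <- rle(dummy_df$s)
r$values   # "a" "b" "c" "a" "c" "a"  -- one entry per run
r$lengths  #  2   3   2   2   2   2   -- length of each run
# ave() numbers the runs within each value: the 1st, 2nd, ... time that value appears as a run
ave(r$lengths, r$values, FUN = seq_along)   # 1 1 1 2 2 3
# rep() expands those run-level counters back to one entry per row
rep(ave(r$lengths, r$values, FUN = seq_along), r$lengths)  # 1 1 1 1 1 1 1 2 2 2 2 3 3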
If you are willing to use data.table (rleid is part of the package), you can do it in two steps as follows:
library(data.table)
dummy_df = data.frame(s = c("a", "a", "b", "b", "b", "c", "c", "a", "a", "c", "c", "a", "a"))
# cast data.frame to data.table
setDT(dummy_df)
# create auxiliary variable
dummy_df[, rleid_output := rleid(s)]
# obtain desired output
dummy_df[, desired_output := rleid(rleid_output), by = "s"]
# end result
dummy_df
#> s rleid_output desired_output
#> 1: a 1 1
#> 2: a 1 1
#> 3: b 2 1
#> 4: b 2 1
#> 5: b 2 1
#> 6: c 3 1
#> 7: c 3 1
#> 8: a 4 2
#> 9: a 4 2
#> 10: c 5 2
#> 11: c 5 2
#> 12: a 6 3
#> 13: a 6 3
You can try the tidyverse in combination with the base R rle() function:
library(tidyverse)
rle(dummy_df$s) %>%
  with(., data.frame(a = .$lengths, b = .$values)) %>%
  group_by(b) %>%
  mutate(n = 1:n()) %>%
  with(., rep(n, times = a)) %>%
  bind_cols(dummy_df, res = .)
s desired_output res
1 a 1 1
2 a 1 1
3 b 1 1
4 b 1 1
5 b 1 1
6 c 1 1
7 c 1 1
8 a 2 2
9 a 2 2
10 c 2 2
11 c 2 2
12 a 3 3
13 a 3 3

Merging datasets by id and maintaining one row for each id

I am attempting to merge two datasets belonging to a single id with a larger dataset.
However, I am having trouble merging the two single-row datasets into a single row within the larger dataset.
Is there a simple way to merge with dplyr and only overwrite values if they are NAs?
My data:
df1 <- data.frame(id=1:5, b=6:10, c=c("a", "b", "c", "d", "e"), d=c(NA, 1,2,3, 4))
df2 <- data.frame(id=6, b=2, c="f", d=NA_real_)
df3 <- data.frame(id=6, b=NA_real_, c=NA_character_, d=5, e="a")
> df1
id b c d
1 1 6 a NA
2 2 7 b 1
3 3 8 c 2
4 4 9 d 3
5 5 10 e 4
> df2
id b c d
1 6 2 f NA
> df3
id b c d e
1 6 NA <NA> 5 a
My attempt:
merge1 <- dplyr::full_join(df1, df2) %>% full_join(df3)
Desired output:
output <- data.frame(id=1:6, b=c(6:10,2), c=c("a", "b", "c", "d", "e", "f"), d=c(NA, 1,2,3, 4, 5), e=c(NA,NA, NA, NA, NA, "a"))
> output
id b c d e
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f 5 a
As opposed to:
id b c d e
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f NA <NA>
7 6 NA <NA> 5 a
Thank you
You can try:
library(dplyr)
list(df1, df2, df3) %>%
  bind_rows() %>%
  group_by(id) %>%
  summarise_all(~ first(na.omit(.)))
id b c d e
<dbl> <dbl> <chr> <dbl> <fct>
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f 5 a
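summarise_all() is superseded in current dplyr; the same logic written with across() would be (a sketch):
library(dplyr)
list(df1, df2, df3) %>%
  bind_rows() %>%
  group_by(id) %>%
  summarise(across(everything(), ~ first(na.omit(.x))))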
You can try:
library(tidyverse)
df1 %>%
  mutate_if(is.factor, as.character) %>%
  bind_rows(mutate_if(df2, is.factor, as.character)) %>%
  left_join(select(df3, id, d, e), by = "id") %>%
  mutate(d = ifelse(is.na(d.x), d.y, d.x)) %>%
  select(-d.x, -d.y)

Apply a custom function after group_by using dplyr in R

How do I apply a function after group_by using dplyr to remove the groups with 2 or more consecutive NAs? I have written a function that returns TRUE or FALSE depending on whether a column in a dataframe has 2 or more consecutive NAs:
# function for determining if ts contains consecutive NAs
is.na.contiguous <- function(df, consecutive) {
  na.rle <- rle(is.na(df$b))
  na.rle$values <- na.rle$values & na.rle$lengths >= consecutive
  any(na.rle$values)
}
# example df
d <- structure(list(a = c(1, 2, 3, 4, 5, 6, 7, 8),
                    b = c(1, 2, 2, NA, NA, 2, NA, 2),
                    c = c(1, 1, 1, 2, 2, 2, 3, 3)),
               class = "data.frame", row.names = c(NA, -8L))
head(d)
a b c
1 1 1 1
2 2 2 1
3 3 2 1
4 4 NA 2
5 5 NA 2
6 6 2 2
7 7 NA 3
8 8 2 3
# test function
is.na.contiguous(d,2)
TRUE # column b has 2 consecutive NAs
is.na.contiguous(d,3)
FALSE # column b does not have 3 consecutive NAs
Now how do I apply this function to each group in the dataframe? Below is what I have tried:
d %>% group_by(c) %>% mutate(consecNA = is.na.contiguous(.,2)) %>% as.data.frame()
a b c consecNA
1 1 1 1 TRUE
2 2 2 1 TRUE
3 3 2 1 TRUE
4 4 NA 2 TRUE
5 5 NA 2 TRUE
6 6 2 2 TRUE
7 7 NA 3 TRUE
8 8 2 3 TRUE
What am I doing wrong?
Instead of passing the entire dataframe to is.na.contiguous, pass only the column values. That makes it simple to apply per group, and it also becomes flexible if you want to do the same for a different column.
is.na.contiguous <- function(x, consecutive) {
  na.rle <- rle(is.na(x))
  na.rle$values <- na.rle$values & na.rle$lengths >= consecutive
  any(na.rle$values)
}
library(dplyr)
d %>%
  group_by(c) %>%
  filter(!is.na.contiguous(b, 2))
# a b c
# <dbl> <dbl> <dbl>
#1 1 1 1
#2 2 2 1
#3 3 2 1
#4 7 NA 3
#5 8 2 3
An option would be to use rleid from data.table on the logical vector is.na(b), and use that to drop the runs that have at least 2 rows and consist entirely of NAs:
library(data.table)
i1 <- setDT(d)[, .I[!(.N >=2 & all(is.na(b)))], rleid(is.na(b))]$V1
d[i1]
#   a b c
#1: 1 1 1
#2: 2 2 1
#3: 3 2 1
#4: 6 2 2
#5: 7 NA 3
#6: 8 2 3
Or if we need to also group by 'c'
setDT(d)[d[, .I[sum(is.na(b)) <2], .(grp = rleid(is.na(b)), c)]$V1]
or with tidyverse
library(dplyr)
d %>%
  group_by(grp = rleid(is.na(b))) %>%
  filter(!(n() >= 2 & all(is.na(b))))
# A tibble: 6 x 4
# Groups: grp [4]
# a b c grp
# <dbl> <dbl> <dbl> <int>
#1 1 1 1 1
#2 2 2 1 1
#3 3 2 1 1
#4 6 2 2 3
#5 7 NA 3 4
#6 8 2 3 5
Another option is to take the sum of the logical vector within each run and check that it is less than 2:
d %>%
  group_by(c, grp = rleid(is.na(b))) %>%
  filter(sum(is.na(b)) < 2)
If we are using the function from the OP (rewritten to take a vector):
is.na.contiguous <- function(x, consecutive) {
  na.rle <- rle(is.na(x))
  with(na.rle, any(values & lengths >= consecutive))
}
d %>%
  group_by(c) %>%
  mutate(consecNA = is.na.contiguous(b, 2))
# A tibble: 8 x 4
# Groups: c [3]
# a b c consecNA
# <dbl> <dbl> <dbl> <lgl>
#1 1 1 1 FALSE
#2 2 2 1 FALSE
#3 3 2 1 FALSE
#4 4 NA 2 TRUE
#5 5 NA 2 TRUE
#6 6 2 2 TRUE
#7 7 NA 3 FALSE
#8 8 2 3 FALSE

Expand dataframe by ID to generate a special column

I have the following dataframe
df<-data.frame("ID"=c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
'A_Frequency'=c(1,2,3,4,5,1,2,3,4,5),
'B_Frequency'=c(1,2,NA,4,6,1,2,5,6,7))
The dataframe appears as follows
ID A_Frequency B_Frequency
1 A 1 1
2 A 2 2
3 A 3 NA
4 A 4 4
5 A 5 6
6 B 1 1
7 B 2 2
8 B 3 5
9 B 4 6
10 B 5 7
I Wish to create a new dataframe df2 from df that looks as follows
ID CFreq
1 A 1
2 A 2
3 A 3
4 A 4
5 A 5
6 A 6
7 B 1
8 B 2
9 B 3
10 B 4
11 B 5
12 B 6
13 B 7
The new dataframe has a column CFreq that takes the unique values of A_Frequency and B_Frequency within each ID, ignoring the NA values.
I have tried dplyr but am unable to get the required result:
df2 <- df %>%
  group_by(ID) %>%
  select(ID, A_Frequency, B_Frequency) %>%
  mutate(Cfreq = unique(A_Frequency, B_Frequency))
This yields the following, which is quite different:
ID A_Frequency B_Frequency Cfreq
<fct> <dbl> <dbl> <dbl>
1 A 1 1 1
2 A 2 2 2
3 A 3 NA 3
4 A 4 4 4
5 A 5 6 5
6 B 1 1 1
7 B 2 2 2
8 B 3 5 3
9 B 4 6 4
10 B 5 7 5
Could someone help me here?
The gather function from the tidyr package will be helpful here:
library(tidyverse)
df %>%
  gather(x, CFreq, -ID) %>%
  select(-x) %>%
  na.omit() %>%
  unique() %>%
  arrange(ID, CFreq)
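gather() is superseded in current tidyr; an equivalent with pivot_longer() would be (a sketch, using the tidyverse packages already loaded above):
df %>%
  pivot_longer(-ID, values_to = "CFreq") %>%
  select(-name) %>%
  drop_na() %>%
  distinct() %>%
  arrange(ID, CFreq)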
A different tidyverse possibility could be:
df %>%
  nest(A_Frequency, B_Frequency, .key = C_Frequency) %>%
  mutate(C_Frequency = map(C_Frequency, function(x) unique(x[!is.na(x)]))) %>%
  unnest()
ID C_Frequency
1 A 1
2 A 2
3 A 3
4 A 4
5 A 5
9 A 6
10 B 1
11 B 2
12 B 3
13 B 4
14 B 5
18 B 6
19 B 7
A base R approach would be to split the dataframe by ID and, for every list element, count the number of unique entries and create a sequence of that length.
do.call(rbind, lapply(split(df, df$ID), function(x)
  data.frame(ID = x$ID[1],
             CFreq = seq_len(length(unique(na.omit(unlist(x[-1]))))))))
# ID CFreq
#A.1 A 1
#A.2 A 2
#A.3 A 3
#A.4 A 4
#A.5 A 5
#A.6 A 6
#B.1 B 1
#B.2 B 2
#B.3 B 3
#B.4 B 4
#B.5 B 5
#B.6 B 6
#B.7 B 7
This will also work when A_Frequency and B_Frequency contain characters or arbitrary numbers instead of sequential numbers.
In tidyverse we can do
library(tidyverse)
df %>%
  group_split(ID) %>%
  map_dfr(~ data.frame(ID = .$ID[1],
                       CFreq = seq_len(length(unique(na.omit(flatten_chr(.[-1])))))))
A data.table option
library(data.table)
cols <- c('A_Frequency', 'B_Frequency')
out <- setDT(df)[, .(CFreq = sort(unique(unlist(.SD)))),
                 .SDcols = cols,
                 by = ID]
out
# ID CFreq
# 1: A 1
# 2: A 2
# 3: A 3
# 4: A 4
# 5: A 5
# 6: A 6
# 7: B 1
# 8: B 2
# 9: B 3
#10: B 4
#11: B 5
#12: B 6
#13: B 7
