cumsum is.na with rle ignoring consecutive NAs - r

Simple problem. Let's say I have the following data:
library(tidyverse)
df <- data.frame(group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2),
variable = c(NA, "a", NA, "b", "c", NA, NA, NA, NA, "a", NA, "c", NA, NA, "d", NA, NA, "a"))
df
group variable
1 1 <NA>
2 1 a
3 1 <NA>
4 1 b
5 1 c
6 1 <NA>
7 1 <NA>
8 1 <NA>
9 1 <NA>
10 1 a
11 1 <NA>
12 1 c
13 1 <NA>
14 1 <NA>
15 1 d
16 2 <NA>
17 2 <NA>
18 2 a
I just want to count missing values using cumsum(is.na(variable)), but count each run of consecutive missing values only once, so my desired output would look like:
group variable newvariable
1 1 <NA> 1
2 1 a 1
3 1 <NA> 2
4 1 b 2
5 1 c 2
6 1 <NA> 3
7 1 <NA> 3
8 1 <NA> 3
9 1 <NA> 3
10 1 a 3
11 1 <NA> 4
12 1 c 4
13 1 <NA> 5
14 1 <NA> 5
15 1 d 5
16 2 <NA> 1
17 2 <NA> 1
18 2 a 1
I think I need to incorporate rle into my code:
df %>%
group_by(group, na_group = {na_group = rle(variable); rep(seq_along(na_group$lengths), na_group$lengths)}) %>%
mutate(newvariable = cumsum((is.na(variable)))) #?
Maybe map over groups could work. Any suggestions please?
Refs:
Identify sets of NA in a vector
Count consecutive values in groups with condition with dplyr and rle

# Per group, number each row by how many runs of NA have started so far.
df %>%
group_by(group) %>%
# rle() splits is.na(variable) into runs; cumsum(values) goes up by one at
# every NA run, and rep(..., lengths) expands that per-run counter back to
# one value per row.
mutate(new = with(rle(is.na(variable)), rep(cumsum(values), lengths))) %>%
ungroup()

Another option is to use diff with cumsum on the logical vector
library(data.table)
setDT(df)[, new := cumsum(c(TRUE, diff(is.na(variable)) > 0) ), group ]
Or with dplyr
library(dplyr)
df %>%
group_by(group) %>%
mutate(new = cumsum(c(TRUE, diff(is.na(variable)) > 0)))
# A tibble: 18 x 3
# Groups: group [2]
# group variable new
# <dbl> <fct> <int>
# 1 1 <NA> 1
# 2 1 a 1
# 3 1 <NA> 2
# 4 1 b 2
# 5 1 c 2
# 6 1 <NA> 3
# 7 1 <NA> 3
# 8 1 <NA> 3
# 9 1 <NA> 3
#10 1 a 3
#11 1 <NA> 4
#12 1 c 4
#13 1 <NA> 5
#14 1 <NA> 5
#15 1 d 5
#16 2 <NA> 1
#17 2 <NA> 1
#18 2 a 1

Related

filling NA with values from another table

I have the following datasets in RStudio:
df =
a b
1 A
1 NA
1 A
1 NA
2 C
2 NA
2 B
3 A
3 NA
3 C
3 D
and fill_with =
a b
1 A
2 B
3 C
How do I fill the NA values in df in the b column according to the a column?
Ex: a=1, b=NA, then I look at the table fill_with at a=1, and I see that I should fill it with b=A.
In the end it should look the following way:
df =
a b
1 A
1 A
1 A
1 A
2 C
2 B
2 B
3 A
3 C
3 C
3 D
We can use ifelse
# Where b is missing, take the b of the fill_with row whose a matches this
# row's a; otherwise keep the existing b.
df$b <- ifelse(is.na(df$b) ,
fill_with$b[match(df$a , fill_with$a)] , df$b)
Output
a b
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
library(tidyverse)
df <- read_table("a b
1 A
1 NA
1 A
1 NA
2 C
2 NA
2 B
3 A
3 NA
3 C
3 D")
df %>%
group_by(a) %>%
fill(b, .direction = "updown")
# A tibble: 11 x 2
# Groups: a [3]
a b
<dbl> <chr>
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
Base R
# Positions of the rows whose b is missing.
tmp <- which(is.na(df$b))
# Look up only those rows' a values in fill_with and write the matched b back.
df$b[tmp] <- fill_with$b[match(df$a[tmp], fill_with$a)]
a b
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
library(tidyverse)
df <- data.frame(
stringsAsFactors = FALSE,
a = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3),
b = c("A", NA, "A", NA, "C", NA, "B", "A", NA, "C", "D")
)
fill_with <- data.frame(
stringsAsFactors = FALSE,
a = c(1L, 2L, 3L),
b = c("A", "B", "C")
)
# rows_patch() only replaces values that are NA in `x`. rows_update() would
# overwrite every matched row and clobber the existing "C", "A" and "D".
rows_patch(x = df, y = fill_with, by = "a")
#> a b
#> 1 1 A
#> 2 1 A
#> 3 1 A
#> 4 1 A
#> 5 2 C
#> 6 2 B
#> 7 2 B
#> 8 3 A
#> 9 3 C
#> 10 3 C
#> 11 3 D
Created on 2022-08-22 with reprex v2.0.2

Group by cumulative sums with conditions

In this dataframe:
df <- data.frame(
ID = c("C", "B", "B", "B", NA, "C", "A", NA, "B", "B", "B")
)
I'd like to group the rows using cumsum with two conditions: (i) cumsum should not continue if is.na(ID) and (ii) it should not continue if the next ID value is the same as the prior. I do meet condition (i) with this:
df %>%
group_by(grp = cumsum(!is.na(ID)))
# A tibble: 11 x 2
# Groups: grp [9]
ID grp
<chr> <int>
1 C 1
2 B 2
3 B 3
4 B 4
5 NA 4
6 C 5
7 A 6
8 NA 6
9 B 7
10 B 8
11 B 9
but I don't know how to implement condition (ii) too, to obtain the desired result:
1 C 1
2 B 2
3 B 2
4 B 2
5 NA 2
6 C 3
7 A 4
8 NA 4
9 B 5
10 B 5
11 B 5
I tried it with this, but it doesn't work:
df %>%
group_by(grp = cumsum(!is.na(ID) |!lag(ID,1) == ID))
Use na.locf0 from zoo to fill in the NAs and then apply rleid from data.table:
library(data.table)
library(zoo)
rleid(na.locf0(df$ID))
## [1] 1 2 2 2 2 3 4 4 5 5 5
Using tidyr and dplyr, you could do:
df %>%
mutate(grp = fill(., ID) %>% pull(),
grp = cumsum(grp != lag(grp, default = first(grp))))
ID grp
1 C 0
2 B 1
3 B 1
4 B 1
5 <NA> 1
6 C 2
7 A 3
8 <NA> 3
9 B 4
10 B 4
11 B 4
Using rle
library(zoo)
with(rle(na.locf0(df$ID)), rep(seq_along(values), lengths))
#[1] 1 2 2 2 2 3 4 4 5 5 5

Merging datasets by id and maintain one row for each id

I am attempting to merge 2 datasets belonging to a single id with a larger dataset.
However, I am having trouble merging the two single row datasets into a single row within the larger dataset.
Is there a simple way to merge with dplyr and only overwrite values if they are NA's?
My data:
df1 <- data.frame(id=1:5, b=6:10, c=c("a", "b", "c", "d", "e"), d=c(NA, 1,2,3, 4))
df2 <- data.frame(id=6, b=2, c="f", d=NA_real_)
df3 <- data.frame(id=6, b=NA_real_, c=NA_character_, d=5, e="a")
> df1
id b c d
1 1 6 a NA
2 2 7 b 1
3 3 8 c 2
4 4 9 d 3
5 5 10 e 4
> df2
id b c d
1 6 2 f NA
> df3
id b c d e
1 6 NA <NA> 5 a
My attempt:
merge1 <- dplyr::full_join(df1, df2) %>% full_join(df3)
Desired output:
output <- data.frame(id=1:6, b=c(6:10,2), c=c("a", "b", "c", "d", "e", "f"), d=c(NA, 1,2,3, 4, 5), e=c(NA,NA, NA, NA, NA, "a"))
> output
id b c d e
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f 5 a
As opposed to:
id b c d e
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f NA <NA>
7 6 NA <NA> 5 a
Thank you
You can try:
list(df1, df2, df3) %>%
bind_rows() %>%
group_by(id) %>%
summarise_all(~ first(na.omit(.)))
id b c d e
<dbl> <dbl> <chr> <dbl> <fct>
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f 5 a
you can try
library(tidyverse)
df1 %>%
mutate_if(is.factor, as.character) %>%
bind_rows(mutate_if(df2, is.factor, as.character)) %>%
left_join(select(df3, id, d, e), by = "id") %>%
mutate(d= ifelse(is.na(d.x), d.y, d.x)) %>%
select(-d.x, -d.y)

Apply a custom function after group_by using dplyr in R

How do I apply a function after group_by using dplyr to remove those groups with 2 or more consecutive NAs? I have written a function that outputs True or False whether a column in a dataframe has 2 or more NAs:
# function for determining if ts contains consecutive NAs
is.na.contiguous <- function(df, consecutive) {
  # Run-length encode the NA mask of column `b` of the supplied data frame.
  runs <- rle(is.na(df$b))
  # TRUE when some run of NAs is at least `consecutive` long.
  any(runs$values & runs$lengths >= consecutive)
}
# example df
# Example data: `b` has two consecutive NAs (rows 4-5) and one isolated NA
# (row 7); `c` is the grouping column.
d <- data.frame(
  a = c(1, 2, 3, 4, 5, 6, 7, 8),
  b = c(1, 2, 2, NA, NA, 2, NA, 2),
  c = c(1, 1, 1, 2, 2, 2, 3, 3)
)
head(d)
a b c
1 1 1 1
2 2 2 1
3 3 2 1
4 4 NA 2
5 5 NA 2
6 6 2 2
7 7 NA 3
8 8 2 3
# test function
is.na.contiguous(d,2)
TRUE # column b has 2 consecutive NAs
is.na.contiguous(d,3)
FALSE # column b does not have 3 consecutive NAs
Now how do I apply this function to each group in the dataframe? Below is what I have tried:
d %>% group_by(c) %>% mutate(consecNA = is.na.contiguous(.,2)) %>% as.data.frame()
a b c consecNA
1 1 1 1 TRUE
2 2 2 1 TRUE
3 3 2 1 TRUE
4 4 NA 2 TRUE
5 5 NA 2 TRUE
6 6 2 2 TRUE
7 7 NA 3 TRUE
8 8 2 3 TRUE
What am I doing wrong?
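Instead of passing the entire data frame to is.na.contiguous, pass only the column values. In your attempt, `.` is the whole data frame rather than the current group, so column b was checked across all groups at once and every row got TRUE. A function that takes a vector is simple to apply per group, and it can be reused for any other column.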
is.na.contiguous <- function(x, consecutive) {
  # Run-length encode the NA mask of the vector.
  runs <- rle(is.na(x))
  # Flag the runs that are NA and at least `consecutive` long.
  is_long_na_run <- runs$values & runs$lengths >= consecutive
  any(is_long_na_run)
}
library(dplyr)
d %>%
group_by(c) %>%
filter(!is.na.contiguous(b, 2))
# a b c
# <dbl> <dbl> <dbl>
#1 1 1 1
#2 2 2 1
#3 3 2 1
#4 7 NA 3
#5 8 2 3
An option would be to use rleid from data.table on the logical vector (is.na(b)), and use that to subset the groups having number of rows greater than or equal to 2 and if all the elements are NA
library(data.table)
i1 <- setDT(d)[, .I[!(.N >=2 & all(is.na(b)))], rleid(is.na(b))]$V1
d[i1]
#. a b c
#1: 1 1 1
#2: 2 2 1
#3: 3 2 1
#4: 6 2 2
#5: 7 NA 3
#6: 8 2 3
Or if we need to also group by 'c'
setDT(d)[d[, .I[sum(is.na(b)) <2], .(grp = rleid(is.na(b)), c)]$V1]
or with tidyverse
library(dplyr)
d %>%
group_by(grp = rleid(is.na(b))) %>%
filter(!(n() >=2 & all(is.na(b))))
# A tibble: 6 x 4
# Groups: grp [4]
# a b c grp
# <dbl> <dbl> <dbl> <int>
#1 1 1 1 1
#2 2 2 1 1
#3 3 2 1 1
#4 6 2 2 3
#5 7 NA 3 4
#6 8 2 3 5
Or another option is to get the sum of logical vector and check if it is less than 2
d %>%
group_by(c, grp = rleid(is.na(b))) %>%
filter(sum(is.na(b))<2)
If we are using the function from OP
#' Does a vector contain a run of consecutive NAs?
#'
#' @param x A vector to scan for missing values.
#' @param consecutive Minimum run length of NAs to look for.
#' @return `TRUE` if `x` has at least `consecutive` NAs in a row,
#'   otherwise `FALSE`.
is.na.contiguous <- function(x, consecutive) {
  # Inside with(), `values` and `lengths` are the components of the rle
  # object, so no temporary or `$` access is needed.
  with(rle(is.na(x)), any(values & lengths >= consecutive))
}
d %>%
group_by(c) %>%
mutate(consecNA = is.na.contiguous(b, 2))
# A tibble: 8 x 4
# Groups: c [3]
# a b c consecNA
# <dbl> <dbl> <dbl> <lgl>
#1 1 1 1 FALSE
#2 2 2 1 FALSE
#3 3 2 1 FALSE
#4 4 NA 2 TRUE
#5 5 NA 2 TRUE
#6 6 2 2 TRUE
#7 7 NA 3 FALSE
#8 8 2 3 FALSE

Variable/column selection in tidyr fill()

Suppose a df with some missing values like this:
ID col_A_1 col_A_2 col_B_1 col_B_2
1 1 1 NA NA a
2 1 2 NA 1 b
3 1 3 1 2 c
4 1 4 2 3 d
5 1 NA 3 4 e
6 2 NA 1 5 f
7 2 NA 2 6 g
8 2 1 3 7 h
9 2 2 4 8 <NA>
10 2 3 5 NA <NA>
I want to fill the missing values using tidyr fill(), however, only the missing values in columns containing A.
I was able to achieve it using:
library(dplyr)
library(tidyr)
df %>%
group_by(ID) %>%
fill(names(.)[grepl("A", names(.))], .direction = "up") %>%
fill(names(.)[grepl("A", names(.))], .direction = "down") %>%
ungroup()
ID col_A_1 col_A_2 col_B_1 col_B_2
<dbl> <int> <int> <int> <chr>
1 1 1 1 NA a
2 1 2 1 1 b
3 1 3 1 2 c
4 1 4 2 3 d
5 1 4 3 4 e
6 2 1 1 5 f
7 2 1 2 6 g
8 2 1 3 7 h
9 2 2 4 8 <NA>
10 2 3 5 NA <NA>
however, I'm looking for other variable/column selection possibilities inside tidyr fill().
Sample data:
df <- data.frame(ID = c(rep(1, 5), rep(2, 5)),
col_A_1 = c(1:4, NA, NA, NA, 1:3),
col_A_2 = c(NA, NA, 1:3, 1:5),
col_B_1 = c(NA, 1:8, NA),
col_B_2 = c(letters[1:8], NA, NA),
stringsAsFactors = FALSE)
fill() accepts tidyselect helpers such as matches(), so the columns can be selected directly:
library(tidyverse)
df %>%
group_by(ID) %>%
fill(matches('A'), .direction = 'up') %>%
fill(matches('A'), .direction = 'down')
# A tibble: 10 x 5
# Groups: ID [2]
# ID col_A_1 col_A_2 col_B_1 col_B_2
# <dbl> <int> <int> <int> <chr>
# 1 1 1 1 NA a
# 2 1 2 1 1 b
# 3 1 3 1 2 c
# 4 1 4 2 3 d
# 5 1 4 3 4 e
# 6 2 1 1 5 f
# 7 2 1 2 6 g
# 8 2 1 3 7 h
# 9 2 2 4 8 <NA>
#10 2 3 5 NA <NA>

Resources