Apply a custom function after group_by using dplyr in R - r

How do I apply a function after group_by using dplyr to remove those groups with 2 or more consecutive NAs? I have written a function that outputs True or False whether a column in a dataframe has 2 or more NAs:
# function for determining if ts contains consecutive NAs
is.na.contiguous <- function(df, consecutive) {
na.rle <- rle(is.na(df$b))
na.rle$values <- na.rle$values & na.rle$lengths >= consecutive
any(na.rle$values)
}
# example df
d = structure(list(a = c(1, 2, 3, 4, 5, 6, 7, 8), b = c(1, 2, 2,
+ NA, NA, 2, NA, 2), c = c(1, 1, 1, 2, 2, 2, 3, 3)), class = "data.frame", row.names = c(NA,
+ -8L))
head(d)
a b c
1 1 1 1
2 2 2 1
3 3 2 1
4 4 NA 2
5 5 NA 2
6 6 2 2
7 7 NA 3
8 8 2 3
# test function
is.na.contiguous(d,2)
TRUE # column b has 2 consecutive NAs
is.na.contiguous(d,3)
FALSE # column b does not have 3 consecutive NAs
Now how do I apply this function to each group in the dataframe? Below is what I have tried:
d %>% group_by(c) %>% mutate(consecNA = is.na.contiguous(.,2)) %>% as.data.frame()
a b c consecNA
1 1 1 1 TRUE
2 2 2 1 TRUE
3 3 2 1 TRUE
4 4 NA 2 TRUE
5 5 NA 2 TRUE
6 6 2 2 TRUE
7 7 NA 3 TRUE
8 8 2 3 TRUE
What am I doing wrong?

Instead of passing the entire dataframe to is.na.contiguous, pass only the column value then it would be simple to apply it via group and also it would become flexible if you want to do the same for some different column.
is.na.contiguous <- function(x, consecutive) {
na.rle <- rle(is.na(x))
na.rle$values <- na.rle$values & na.rle$lengths >= consecutive
any(na.rle$values)
}
library(dplyr)
d %>%
group_by(c) %>%
filter(!is.na.contiguous(b, 2))
# a b c
# <dbl> <dbl> <dbl>
#1 1 1 1
#2 2 2 1
#3 3 2 1
#4 7 NA 3
#5 8 2 3

An option would be to use rleid from data.table on the logical vector (is.na(b)), and use that to subset the groups having number of rows greater than or equal to 2 and if all the elements are NA
library(data.table)
i1 <- setDT(d)[, .I[!(.N >=2 & all(is.na(b)))], rleid(is.na(b))]$V1
d[i1]
#. a b c
#1: 1 1 1
#2: 2 2 1
#3: 3 2 1
#4: 6 2 2
#5: 7 NA 3
#6: 8 2 3
Or if we need to also group by 'c'
setDT(d)[d[, .I[sum(is.na(b)) <2], .(grp = rleid(is.na(b)), c)]$V1]
or with tidyverse
library(dplyr)
d %>%
group_by(grp = rleid(is.na(b))) %>%
filter(!(n() >=2 & all(is.na(b))))
# A tibble: 6 x 4
# Groups: grp [4]
# a b c grp
# <dbl> <dbl> <dbl> <int>
#1 1 1 1 1
#2 2 2 1 1
#3 3 2 1 1
#4 6 2 2 3
#5 7 NA 3 4
#6 8 2 3 5
Or another option is to get the sum of logical vector and check if it is less than 2
d %>%
group_by(c, grp = rleid(is.na(b))) %>%
filter(sum(is.na(b))<2)
If we are using the function from OP
is.na.contiguous <- function(x, consecutive) {
na.rle <- rle(is.na(x))
with(na.rle, any(values & na.rle$lengths >= consecutive))
}
d %>%
group_by(c) %>%
mutate(consecNA = is.na.contiguous(b, 2))
# A tibble: 8 x 4
# Groups: c [3]
# a b c consecNA
# <dbl> <dbl> <dbl> <lgl>
#1 1 1 1 FALSE
#2 2 2 1 FALSE
#3 3 2 1 FALSE
#4 4 NA 2 TRUE
#5 5 NA 2 TRUE
#6 6 2 2 TRUE
#7 7 NA 3 FALSE
#8 8 2 3 FALSE

Related

Nested list to grouped rows in R

I have the following nested list called l (dput below):
> l
$A
$A$`1`
[1] 1 2 3
$A$`2`
[1] 3 2 1
$B
$B$`1`
[1] 2 2 2
$B$`2`
[1] 3 4 3
I would like to convert this to a grouped dataframe where A and B are the first group column and 1 and 2 are the subgroups with respective values. The desired output should look like this:
group subgroup values
1 A 1 1
2 A 1 2
3 A 1 3
4 A 2 3
5 A 2 2
6 A 2 1
7 B 1 2
8 B 1 2
9 B 1 2
10 B 2 3
11 B 2 4
12 B 2 3
As you can see A and B are the main group and 1 and 2 are the subgroups. Using purrr::flatten(l) or unnest doesn't work. So I was wondering if anyone knows how to convert a nested list to a grouped row dataframe?
dput of l:
l <- list(A = list(`1` = c(1, 2, 3), `2` = c(3, 2, 1)), B = list(`1` = c(2,
2, 2), `2` = c(3, 4, 3)))
Using stack and rowbind with id:
data.table::rbindlist(lapply(l, stack), idcol = "id")
# id values ind
# 1: A 1 1
# 2: A 2 1
# 3: A 3 1
# 4: A 3 2
# 5: A 2 2
# 6: A 1 2
# 7: B 2 1
# 8: B 2 1
# 9: B 2 1
# 10: B 3 2
# 11: B 4 2
# 12: B 3 2
You can use enframe() to convert the list into a data.frame, and unnest the value column twice.
library(tidyr)
tibble::enframe(l, name = "group") %>%
unnest_longer(value, indices_to = "subgroup") %>%
unnest(value)
# A tibble: 12 × 3
group value subgroup
<chr> <dbl> <chr>
1 A 1 1
2 A 2 1
3 A 3 1
4 A 3 2
5 A 2 2
6 A 1 2
7 B 2 1
8 B 2 1
9 B 2 1
10 B 3 2
11 B 4 2
12 B 3 2
Turn the list directly into a data frame, then pivot it into a long format and arrange to your desired order.
library(tidyverse)
lst %>%
as.data.frame() %>%
pivot_longer(everything(), names_to = c("group", "subgroup"),
values_to = "values",
names_pattern = "(.+?)\\.(.+?)") %>%
arrange(group, subgroup)
# A tibble: 12 × 3
group subgroup values
<chr> <chr> <dbl>
1 A 1 1
2 A 1 2
3 A 1 3
4 A 2 3
5 A 2 2
6 A 2 1
7 B 1 2
8 B 1 2
9 B 1 2
10 B 2 3
11 B 2 4
12 B 2 3
You can combine rrapply with unnest, which has the benefit to work in lists of arbitrary lengths:
library(rrapply)
library(tidyr)
rrapply(l, how = "melt") |>
unnest(value)
# A tibble: 12 × 3
L1 L2 value
<chr> <chr> <dbl>
1 A 1 1
2 A 1 2
3 A 1 3
4 A 2 3
5 A 2 2
6 A 2 1
7 B 1 2
8 B 1 2
9 B 1 2
10 B 2 3
11 B 2 4
12 B 2 3

How to conditionally update a R tibble using multiple conditions of another tibble

I have two tables. I would like to update the first table using a second table using multiple conditions. In base R I would use if...else type constructs to do this but would like to know how to achieve this using dplyr.
The table to be updated (have a field added) looks like this:
> Intvs
# A tibble: 12 x 3
Group From To
<chr> <dbl> <dbl>
1 A 0 1
2 A 1 2
3 A 2 3
4 A 3 4
5 A 4 5
6 A 5 6
7 B 0 1
8 B 1 2
9 B 2 3
10 B 3 4
11 B 4 5
12 B 5 6
The tibble that I would like to use to make the update looks like this:
>Zns
# A tibble: 2 x 4
Group From To Zone
<chr> <chr> <dbl> <dbl>
1 A X 1 5
2 B Y 3 4
I would like to update the Intvs tibble with the Zns tibble using the fields == Group, >= From, and <= To to control the update. The expected output should look like this
> Intvs
# A tibble: 12 x 4
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 NA
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 NA
12 B 5 6 NA
What is the most efficient way to do this using dplyr?
The code below should make the dummy tables Intv and Zns
# load packages
require(tidyverse)
# Intervals table
a <- c(rep("A", 6), rep("B", 6))
b <- c(seq(0,5,1), seq(0,5,1) )
c <- c(seq(1,6,1), seq(1,6,1))
Intvs <- bind_cols(a, b, c)
names(Intvs) <- c("Group", "From", "To")
# Zones table
a <- c("A", "B")
b <- c("X", "Y")
c <- c(1, 3)
d <- c(5, 4)
Zns <- bind_cols(a, b, c, d)
names(Zns) <- c("Group", "From", "To", "Zone")
Using non-equi join from data.table
library(data.table)
setDT(Intvs)[Zns, Zone := Zone, on = .(Group, From >= From, To <= To)]
-output
> Intvs
Group From To Zone
<char> <num> <num> <char>
1: A 0 1 <NA>
2: A 1 2 X
3: A 2 3 X
4: A 3 4 X
5: A 4 5 X
6: A 5 6 <NA>
7: B 0 1 <NA>
8: B 1 2 <NA>
9: B 2 3 <NA>
10: B 3 4 Y
11: B 4 5 <NA>
12: B 5 6 <NA>
This is the closest I get. It is not giving the expected output:
library(dplyr)
left_join(Intvs, Zns, by="Group") %>%
group_by(Group) %>%
mutate(Zone1 = case_when(From.x <= Zone & From.x >= To.y ~ From.y)) %>%
select(Group, From=From.x, To=To.x, Zone = Zone1)
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 X
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 Y
12 B 5 6 NA
Not sure why your first row does not give NA, since 0 - 1 is not in the range of 1 - 5.
First left_join the two dataframes using the Group column. Here I assign the suffix "_Zns" to values from the Zns dataframe. Then use a single case_when or (ifelse) statement to assign NA to rows that do not fit the range. Finally, drop the columns that end with Zns.
library(dplyr)
left_join(Intvs, Zns, by = "Group", suffix = c("", "_Zns")) %>%
mutate(Zone = case_when(From >= From_Zns & To <= To_Zns ~ Zone,
TRUE ~ NA_character_)) %>%
select(-ends_with("Zns"))
# A tibble: 12 × 4
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 NA
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 NA
12 B 5 6 NA
Data
Note that I have changed your column name order in the Zns dataframe.
a <- c(rep("A", 6), rep("B", 6))
b <- c(seq(0,5,1), seq(0,5,1) )
c <- c(seq(1,6,1), seq(1,6,1))
Intvs <- bind_cols(a, b, c)
names(Intvs) <- c("Group", "From", "To")
# Zones table
a <- c("A", "B")
b <- c("X", "Y")
c <- c(1, 3)
d <- c(5, 4)
Zns <- bind_cols(a, b, c, d)
colnames(Zns) <- c("Group", "Zone", "From", "To")

Roll max in R. From first row to current row

I would like to calculate max value from first row to current row
df <- data.frame(id = c(1,1,1,1,2,2,2), value = c(2,5,3,2,4,5,4), result = c(NA,2,5,5,NA,4,5))
I have tried grouping by id with dplyr and using rollmax function from zoo but did not success
1) rollmax is used with a fixed width but here we have a variable width so using rollapplyr, which seems close to the approach of the question, we have:
library(dplyr)
library(zoo)
df %>%
group_by(id) %>%
mutate(out = lag(rollapplyr(value, 1:n(), max))) %>%
ungroup
giving:
# A tibble: 7 x 4
# Groups: id [2]
id value result out
<dbl> <dbl> <dbl> <dbl>
1 1 2 NA NA
2 1 5 2 2
3 1 3 5 5
4 1 2 5 5
5 2 4 NA NA
6 2 5 4 4
7 2 4 5 5
2) It is also possible to perform the grouping via the width (second) argument of rollapplyr like this eliminating dplyr. In this case the widths are 1, 2, 3, 4, 1, 2, 3 and Max is like max except it does not use the last element of its argument x. (An alternate expression for the width would be seq_along(id) - match(id, id) + 1).
library(zoo)
Max <- function(x) if (length(x) == 1) NA else max(head(x, -1))
transform(df, out = rollapplyr(value, sequence(rle(id)$lengths), Max))
giving:
id value result out
1 1 2 NA NA
2 1 5 2 2
3 1 3 5 5
4 1 2 5 5
5 2 4 NA NA
6 2 5 4 4
7 2 4 5 5
A data.table option using shift + cummax
> setDT(df)[, result2 := shift(cummax(value)), id][]
id value result result2
1: 1 2 NA NA
2: 1 5 2 2
3: 1 3 5 5
4: 1 2 5 5
5: 2 4 NA NA
6: 2 5 4 4
7: 2 4 5 5
library(dplyr)
df |>
group_by(id) |>
mutate(result = lag(cummax(value)))
# # A tibble: 7 x 3
# # Groups: id [2]
# id value result
# <dbl> <dbl> <dbl>
# 1 1 2 NA
# 2 1 5 2
# 3 1 3 5
# 4 1 2 5
# 5 2 4 NA
# 6 2 5 4
# 7 2 4 5
Here is a base R solution. This would just get you the cumulative maximum:
df$result = ave(df$value, df$i, FUN=cummax)
To get the cumulative maximum with the lag you wanted:
df$result = ave(df$value, df$i, FUN=function(x) c(NA,cummax(x[-(length(x))])))

Group by cumulative sums with conditions

In this dataframe:
df <- data.frame(
ID = c("C", "B", "B", "B", NA, "C", "A", NA, "B", "B", "B")
)
I'd like to group the rows using cumsum with two conditions: (i) cumsum should not continue if is.na(ID) and (ii) it should not continue if the next ID value is the same as the prior. I do meet condition (i) with this:
df %>%
group_by(grp = cumsum(!is.na(ID)))
# A tibble: 11 x 2
# Groups: grp [9]
ID grp
<chr> <int>
1 C 1
2 B 2
3 B 3
4 B 4
5 NA 4
6 C 5
7 A 6
8 NA 6
9 B 7
10 B 8
11 B 9
but I don't know how to implement condition (ii) too, to obtain the desired result:
1 C 1
2 B 2
3 B 2
4 B 2
5 NA 2
6 C 3
7 A 4
8 NA 4
9 B 5
10 B 5
11 B 5
I tried it with this but I doesn't work:
df %>%
group_by(grp = cumsum(!is.na(ID) |!lag(ID,1) == ID))
Use na.locf0 from zoo to fill in the NAs and then apply rleid from data.table:
library(data.table)
library(zoo)
rleid(na.locf0(df$ID))
## [1] 1 2 2 2 2 3 4 4 5 5 5
Using tidyr and dplyr, you could do:
df %>%
mutate(grp = fill(., ID) %>% pull(),
grp = cumsum(grp != lag(grp, default = first(grp))))
ID grp
1 C 0
2 B 1
3 B 1
4 B 1
5 <NA> 1
6 C 2
7 A 3
8 <NA> 3
9 B 4
10 B 4
11 B 4
Using rle
library(zoo)
with(rle(na.locf0(df$ID)), rep(seq_along(values), lengths))
#[1] 1 2 2 2 2 3 4 4 5 5 5

is there a way in R to subtract two rows within a group by specifying another grouping var?

Say I have something like this:
ID = c("a","a","a","a","a", "b","b","b","b","b")
Group = c("1","2","3","4","5", "1","2","3","4","5")
Value = c(3, 4,2,4,3, 6, 1, 8, 9, 10)
df<-data.frame(ID,Group,Value)
I want to subtract group=5 from group=3 within the ID, with an output column which has this difference for each ID like so:
ID Group Value Want
1 a 1 3 1
2 a 2 4 1
3 a 3 2 1
4 a 4 4 1
5 a 5 3 1
6 b 1 6 2
7 b 2 1 2
8 b 3 8 2
9 b 4 9 2
10 b 5 10 2
Also, if that calculation cannot be done (i.e. group 5 is missing), NA values for the 'want' column would be ideal.
As there is only one unique 'Group' per 'ID', we can do subsetting
library(dplyr)
df %>%
group_by(ID) %>%
mutate(want = Value[Group == 5] - Value[Group == 3])
# A tibble: 10 x 4
# Groups: ID [2]
# ID Group Value want
# <fct> <fct> <dbl> <dbl>
# 1 a 1 3 1
# 2 a 2 4 1
# 3 a 3 2 1
# 4 a 4 4 1
# 5 a 5 3 1
# 6 b 1 6 2
# 7 b 2 1 2
# 8 b 3 8 2
# 9 b 4 9 2
#10 b 5 10 2
The above can be made more error-proof if we convert to numeric index and get the first element. When there are no TRUE, by using [1], it returns NA
df %>%
slice(-10) %>%
group_by(ID) %>%
mutate(want = Value[which(Group == 5)[1]] - Value[which(Group == 3)[1]])
Or use match which returns an index of NA if there are no matches, and anything with NA index returns NA which will subsequently return NA in subtraction (NA -3)
df %>%
slice(-10) %>% # removing the last row where Group is 10
group_by(ID) %>%
mutate(want = Value[match(5, Group)] - Value[match(3, Group)])
Here is a base R solution
dfout <- Reduce(rbind,
lapply(split(df,df$ID),
function(x) within(x, Want <-diff(subset(Value, Group %in% c("3","5"))))))
such that
> dfout
ID Group Value Want
1 a 1 3 1
2 a 2 4 1
3 a 3 2 1
4 a 4 4 1
5 a 5 3 1
6 b 1 6 2
7 b 2 1 2
8 b 3 8 2
9 b 4 9 2
10 b 5 10 2
A data.table method:
library(data.table)
setDT(df)[, want := (Value[Group == 5] - Value[Group == 3]), by = .(ID)]
df
# ID Group Value want
# 1: a 1 3 1
# 2: a 2 4 1
# 3: a 3 2 1
# 4: a 4 4 1
# 5: a 5 3 1
# 6: b 1 6 2
# 7: b 2 1 2
# 8: b 3 8 2
# 9: b 4 9 2
# 10: b 5 10 2
Here is a solution using base R.
unsplit(
lapply(
split(df, df$ID),
function(d) {
x5 = d$Value[d$Group == "5"]
x5 = ifelse(length(x5) == 1, x5, NA)
x3 = d$Value[d$Group == "3"]
x3 = ifelse(length(x3) == 1, x3, NA)
d$Want = x5 - x3
d
}),
df$ID)

Resources