How can I group_by NA's as well? - R

With this code:
datanew <- df_bsp %>%
group_by(id_mother) %>%
dplyr::mutate(Family = cur_group_id())
I got this output:
datanew <- data.frame(id_pers = c(1, 2, 3, 4, 5, 6),
  id_mother = c(11, 11, 11, 12, 12, 12),
  FAMILY = c(1, 1, 1, 2, 2, 2))
Now the problem: there are also some NA's in the id_mother variable. It looks like this:
datanew_1 <- data.frame(id_pers = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  id_mother = c(11, 11, 11, 12, 12, 12, NA, NA, NA, NA))
How can I get this result:
datanew <- data.frame(id_pers = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  id_mother = c(11, 11, 11, 12, 12, 12, NA, NA, NA, NA),
  FAMILY = c(1, 1, 1, 2, 2, 2, 3, 4, 5, 6))
Thanks!

If you want each NA value treated as its own group, give each one a unique value:
datanew_1 %>%
mutate(
id_mother_na = ifelse(
is.na(id_mother),
paste("g", "na", cumsum(is.na(id_mother))),
paste("g", id_mother)
)
) %>%
group_by(id_mother_na) %>%
mutate(Family = cur_group_id()) %>%
ungroup()
# # A tibble: 10 × 4
# id_pers id_mother id_mother_na Family
# <dbl> <dbl> <chr> <int>
# 1 1 11 g 11 1
# 2 2 11 g 11 1
# 3 3 11 g 11 1
# 4 4 12 g 12 2
# 5 5 12 g 12 2
# 6 6 12 g 12 2
# 7 7 NA g na 1 3
# 8 8 NA g na 2 4
# 9 9 NA g na 3 5
# 10 10 NA g na 4 6

Along the same lines as the other answer, you need to make a unique group for each NA:
library(tidyverse)
make_grp <- function(x){
  # non-NA ids are kept as-is; each NA gets a unique value above max(x)
  coalesce(x, cumsum(is.na(x))) + (max(x, na.rm = TRUE) * is.na(x))
}
datanew_1 |>
group_by(grp = make_grp(id_mother)) |>
mutate(Family = cur_group_id()) |>
ungroup() |>
select(-grp)
#> # A tibble: 10 x 3
#> id_pers id_mother Family
#> <dbl> <dbl> <int>
#> 1 1 11 1
#> 2 2 11 1
#> 3 3 11 1
#> 4 4 12 2
#> 5 5 12 2
#> 6 6 12 2
#> 7 7 NA 3
#> 8 8 NA 4
#> 9 9 NA 5
#> 10 10 NA 6
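A more compact variant of the same idea (a sketch, not from either answer): build the key inside group_by() itself, giving every NA a placeholder larger than any real id so the NA groups are numbered last:
datanew_1 %>%
  # key: the real id for non-NA rows, max(id) + row number for NA rows (unique per row)
  group_by(key = coalesce(id_mother, max(id_mother, na.rm = TRUE) + row_number())) %>%
  mutate(Family = cur_group_id()) %>%
  ungroup() %>%
  select(-key)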

Related

How to add rows so that each group has an equal number of rows?

I have a data frame with unequal numbers of rows per group, see df in the example below. I would like to add rows containing the group name and NAs in all other columns so that there is an equal number of rows per group like in df.desired. The rows should be added after the last row from the respective group.
Example:
df = data.frame(group = c("A","A","A","A","B","B","B","C","C"),
col1 = c(1, 1, 1, 1, 2, 2, 2, 3, 3),
col2 = c(12, 13, 14, 15, 21, 22, 23, 31, 32))
> df
group col1 col2
1 A 1 12
2 A 1 13
3 A 1 14
4 A 1 15
5 B 2 21
6 B 2 22
7 B 2 23
8 C 3 31
9 C 3 32
df.desired = data.frame(group = c("A","A","A","A","B","B","B","B","C","C","C","C"),
col1 = c(1, 1, 1, 1, 2, 2, 2, NA, 3, 3, NA, NA),
col2 = c(12, 13, 14, 15, 21, 22, 23, NA, 31, 32, NA, NA))
> df.desired
group col1 col2
1 A 1 12
2 A 1 13
3 A 1 14
4 A 1 15
5 B 2 21
6 B 2 22
7 B 2 23
8 B NA NA
9 C 3 31
10 C 3 32
11 C NA NA
12 C NA NA
I know how to do this with a loop but that would be super slow and I would prefer to use dplyr if possible. Does anyone have any ideas?
How about this:
library(dplyr)
df = data.frame(group = c("A","A","A","A","B","B","B","C","C"),
col1 = c(1, 1, 1, 1, 2, 2, 2, 3, 3),
col2 = c(12, 13, 14, 15, 21, 22, 23, 31, 32))
maxgp <- max(table(df$group))
df %>%
group_by(group) %>%
summarise(across(everything(), ~c(.x, rep(NA, maxgp-n()))))
#> `summarise()` has grouped output by 'group'. You can override using the
#> `.groups` argument.
#> # A tibble: 12 × 3
#> # Groups: group [3]
#> group col1 col2
#> <chr> <dbl> <dbl>
#> 1 A 1 12
#> 2 A 1 13
#> 3 A 1 14
#> 4 A 1 15
#> 5 B 2 21
#> 6 B 2 22
#> 7 B 2 23
#> 8 B NA NA
#> 9 C 3 31
#> 10 C 3 32
#> 11 C NA NA
#> 12 C NA NA
Created on 2023-02-01 by the reprex package (v2.0.1)
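Note that returning multiple rows from summarise() is superseded in dplyr 1.1.0; on a recent dplyr the same idea can be written with reframe() (a sketch, assuming dplyr >= 1.1.0):
library(dplyr)
maxgp <- max(table(df$group))
df %>%
  group_by(group) %>%
  # reframe() is the 1.1.0 replacement for summarise() calls that return
  # more than one row per group; the padding logic is unchanged
  reframe(across(everything(), ~ c(.x, rep(NA, maxgp - length(.x)))))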
You can create row numbers for each group and then tidyr::complete:
library(dplyr)
df %>%
group_by(group) %>%
mutate(id = row_number()) %>%
ungroup() %>%
tidyr::complete(group, id) %>%
select(-id)
# # A tibble: 12 × 3
# group col1 col2
# <chr> <dbl> <dbl>
# 1 A 1 12
# 2 A 1 13
# 3 A 1 14
# 4 A 1 15
# 5 B 2 21
# 6 B 2 22
# 7 B 2 23
# 8 B NA NA
# 9 C 3 31
# 10 C 3 32
# 11 C NA NA
# 12 C NA NA
Update (from @Maël's answer)
Since dplyr 1.1.0, per-operation grouping with .by/by is supported for mutate(), summarise(), filter(), and the slice() family, so the code can be simplified to:
df %>%
mutate(id = row_number(), .by = group) %>%
tidyr::complete(group, id) %>%
select(-id)

Modify variables in longitudinal data sets (keep first appearance of values on person-level)

I have a dataframe:
i <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3)
t <- c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4)
x <- c(0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1)
y <- c(5, 6, 7, 8, 4, 5, 6, 7, 6, 7, 8, 8)
j1 <- c(NA, NA, NA, NA, NA, 5, NA, 7, NA, NA, 8, 8)
dat <- data.frame(i, t, x, y, j1)
dat
i t x y j1
1 1 1 0 5 NA
2 1 2 0 6 NA
3 1 3 0 7 NA
4 1 4 0 8 NA
5 2 1 0 4 NA
6 2 2 1 5 5
7 2 3 0 6 NA
8 2 4 1 7 7
9 3 1 0 6 NA
10 3 2 0 7 NA
11 3 3 1 8 8
12 3 4 1 8 8
The dataframe refers to 3 persons "i" at 4 points in time "t". "j1" switches to "y" when "x" turns from 0 to 1 for a person "i". While "x" stays on 1 for a person, "j1" does not change within time (see person 3). When "x" is 0, "j1" is always NA.
Now I want to add a new variable "j2" to the dataframe which is a modification of "j1". The difference should be the following: For each person "i", there should be only one value for "j2". Namely, it should be the first value for "j1" for each person (the first change from 0 to 1 in "x").
Accordingly, the result should look like this:
dat
i t x y j1 j2
1 1 1 0 5 NA NA
2 1 2 0 6 NA NA
3 1 3 0 7 NA NA
4 1 4 0 8 NA NA
5 2 1 0 4 NA NA
6 2 2 1 5 5 5
7 2 3 0 6 NA NA
8 2 4 1 7 7 NA
9 3 1 0 6 NA NA
10 3 2 0 7 NA NA
11 3 3 1 8 8 8
12 3 4 1 8 8 NA
I'd appreciate suggestions on how to address this with dplyr.
Somewhat more concise than the others:
library(tidyverse)
dat <- structure(list(i = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3), t = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4), x = c(0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1), y = c(5, 6, 7, 8, 4, 5, 6, 7, 6, 7, 8, 8), j1 = c(NA, NA, NA, NA, NA, 5, NA, 7, NA, NA, 8, 8)), class = "data.frame", row.names = c(NA, -12L))
dat %>%
group_by(i) %>%
mutate(j2 = ifelse(1:n() == which(x == 1)[1], y, NA)) %>%
ungroup()
#> # A tibble: 12 × 6
#> i t x y j1 j2
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0 5 NA NA
#> 2 1 2 0 6 NA NA
#> 3 1 3 0 7 NA NA
#> 4 1 4 0 8 NA NA
#> 5 2 1 0 4 NA NA
#> 6 2 2 1 5 5 5
#> 7 2 3 0 6 NA NA
#> 8 2 4 1 7 7 NA
#> 9 3 1 0 6 NA NA
#> 10 3 2 0 7 NA NA
#> 11 3 3 1 8 8 8
#> 12 3 4 1 8 8 NA
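If you prefer to avoid which(), here is a sketch of the same idea using cumsum() to flag the first x == 1 row per person:
dat %>%
  group_by(i) %>%
  # keep y only on the first row where x == 1 within each person
  mutate(j2 = if_else(x == 1 & cumsum(x == 1) == 1, y, NA_real_)) %>%
  ungroup()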
A possible solution:
library(tidyverse)
i <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3)
t <- c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4)
x <- c(0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1)
y <- c(5, 6, 7, 8, 4, 5, 6, 7, 6, 7, 8, 8)
j1 <- c(NA, NA, NA, NA, NA, 5, NA, 7, NA, NA, 8, 8)
df <- data.frame(i, t, x, y, j1)
tmp <- df %>%
filter(x == 1) %>%
group_by(i) %>%
slice(1) %>%
ungroup() %>%
rename(j2 = j1)
left_join(df, tmp)
#> Joining, by = c("i", "t", "x", "y")
#> i t x y j1 j2
#> 1 1 1 0 5 NA NA
#> 2 1 2 0 6 NA NA
#> 3 1 3 0 7 NA NA
#> 4 1 4 0 8 NA NA
#> 5 2 1 0 4 NA NA
#> 6 2 2 1 5 5 5
#> 7 2 3 0 6 NA NA
#> 8 2 4 1 7 7 NA
#> 9 3 1 0 6 NA NA
#> 10 3 2 0 7 NA NA
#> 11 3 3 1 8 8 8
#> 12 3 4 1 8 8 NA
Created on 2021-09-08 by the reprex package (v2.0.1)
Function f sets everything after the first non-NA value of a vector x to NA. It is then applied to j1 within each group defined by i.
f <- function(x){
  ind <- which(!is.na(x))[1]                      # position of the first non-NA value
  if(is.na(ind) || ind == length(x)) return(x)    # all NA, or nothing after it to blank out
  x[(which.min(is.na(x)) + 1):length(x)] <- NA    # set everything after it to NA
  x
}
dat %>%
group_by(i) %>%
mutate(j2 = f(j1)) %>%
ungroup()
Option1
You can use dplyr with mutate(): take j1 and replace() the values for which both the current and the previous (lag()) value are non-NA with NA:
library(dplyr)
dat %>% group_by(i) %>%
mutate(j2=replace(j1, !is.na(j1) & !is.na(lag(j1)), NA))
Option2
You can use replace() and replace all values in j1 which are not the first non-NA value (which(!is.na(j1))[1]).
dat %>% group_by(i) %>%
mutate(j2=replace(j1, which(!is.na(j1))[-1], NA))
Option3
You can use purrr::accumulate() too. Call accumulate comparing consecutive (.x, .y) values from the j1 vector; if they are the same, the output is NA.
library(dplyr)
dat %>% group_by(i) %>%
mutate(j2=purrr::accumulate(j1, ~ifelse(.x %in% .y, NA, .y)))
Output of Options 1 and 3 (note that these blank out only repeats of the immediately preceding non-NA value, so row 8 keeps j2 = 7 rather than the NA requested in the question; Option 2 reproduces the requested result exactly):
# A tibble: 12 x 6
# Groups: i [3]
i t x y j1 j2
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 5 NA NA
2 1 2 0 6 NA NA
3 1 3 0 7 NA NA
4 1 4 0 8 NA NA
5 2 1 0 4 NA NA
6 2 2 1 5 5 5
7 2 3 0 6 NA NA
8 2 4 1 7 7 7
9 3 1 0 6 NA NA
10 3 2 0 7 NA NA
11 3 3 1 8 8 8
12 3 4 1 8 8 NA

How to reduce & melt dataframes inside a list together

UPDATED:
I have dataframes inside a list that looks like this:
v1 <- data.frame(time = c(1, 3, 5, 9, 33),
X = c(4, 3, 2, 3, 12),
SE = c(1, 2, 3, 2, 1))
v2 <- data.frame(time = c(1, 3, 5, 9, 33),
Y = c(12, 3, NA, 2, 4),
SE = c(1, 2, 1, 12, 3))
list <- list(v1, v2)
I want to melt/reduce it to look like this:
time variable value SE
1 1 X 4 1
2 3 X 3 2
3 5 X 2 3
4 9 X 3 2
5 33 X 12 1
6 1 Y 12 1
7 3 Y 3 2
8 5 Y NA 1
9 9 Y 2 12
10 33 Y 4 3
So far I've tried this code to no avail.
data <- list %>% reduce(full_join, by = "time")
data2 <- melt(data, id = c("time"))
Thank you!
We can use data.table's melt with patterns on the joined data frame (data from the question):
library(data.table)
melt(setDT(data), measure = patterns('X|Y', 'SE'),
value.name = c('value', 'SE'))[, variable := c("X", "Y")[variable]][]
# time variable value SE
# 1: 1 X 4 1
# 2: 3 X 3 2
# 3: 5 X 2 3
# 4: 9 X 3 2
# 5: 33 X 12 1
# 6: 1 Y 12 1
# 7: 3 Y 3 2
# 8: 5 Y NA 1
# 9: 9 Y 2 12
#10: 33 Y 4 3
Or using pivot_longer
library(tidyr)
library(stringr)
library(dplyr)
data %>%
rename_at(vars(X, Y), ~ str_c('value.', tolower(.))) %>%
pivot_longer(cols = -time, names_to = c('.value', 'variable'),
names_sep="\\.", values_drop_na = TRUE) %>%
arrange(variable)
# A tibble: 10 x 4
# time variable value SE
# <dbl> <chr> <dbl> <dbl>
# 1 1 x 4 1
# 2 3 x 3 2
# 3 5 x 2 3
# 4 9 x 3 2
# 5 33 x 12 1
# 6 1 y 12 1
# 7 3 y 3 2
# 8 5 y NA 1
# 9 9 y 2 12
#10 33 y 4 3
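If you would rather skip the join and work on the list directly, here is a sketch (it assumes the measurement column is always the second column of each data frame in the list):
library(purrr)
library(dplyr)
library(tidyr)
list %>%
  # reshape each data frame so its measurement column becomes variable/value
  map(~ pivot_longer(.x, cols = 2, names_to = "variable", values_to = "value")) %>%
  bind_rows() %>%
  select(time, variable, value, SE)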

eliminating categories with a certain number of non-NA values in R

I have a data frame df which looks like this
> g <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6)
> m <- c(1, NA, NA, NA, 3, NA, 2, 1, 3, NA, 3, NA, NA, 4, NA, NA, NA, 2, 1, NA, 7, 3, NA, 1)
> df <- data.frame(g, m)
where g is the category (1 to 6) and m are the values in that category.
I've managed to find the number of non-NA values per category with:
aggregate(m ~ g, data=df, function(x) {sum(!is.na(x))}, na.action = NULL)
g m
1 1 1
2 2 3
3 3 2
4 4 1
5 5 2
6 6 3
and would now like to eliminate the rows (categories) where the number of non-NA values is 1, and only keep those where the number of non-NA values is 2 or more.
The desired outcome would be:
g m
5 2 3
6 2 NA
7 2 2
8 2 1
9 3 3
10 3 NA
11 3 3
12 3 NA
17 5 NA
18 5 2
19 5 1
20 5 NA
21 6 7
22 6 3
23 6 NA
24 6 1
Every g=1 and g=4 row is eliminated because, as shown, there is only 1 non-NA value in each of those categories.
Any suggestions? :)
If you want base R, then I suggest you use your aggregation:
df2 <- aggregate(m ~ g, data=df, function(x) {sum(!is.na(x))}, na.action = NULL)
df[ ! df$g %in% df2$g[df2$m < 2], ]
# g m
# 5 2 3
# 6 2 NA
# 7 2 2
# 8 2 1
# 9 3 3
# 10 3 NA
# 11 3 3
# 12 3 NA
# 17 5 NA
# 18 5 2
# 19 5 1
# 20 5 NA
# 21 6 7
# 22 6 3
# 23 6 NA
# 24 6 1
If you want to use dplyr, perhaps
library(dplyr)
group_by(df, g) %>%
filter(sum(!is.na(m)) > 1) %>%
ungroup()
# # A tibble: 16 × 2
# g m
# <dbl> <dbl>
# 1 2 3
# 2 2 NA
# 3 2 2
# 4 2 1
# 5 3 3
# 6 3 NA
# 7 3 3
# 8 3 NA
# 9 5 NA
# 10 5 2
# 11 5 1
# 12 5 NA
# 13 6 7
# 14 6 3
# 15 6 NA
# 16 6 1
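With dplyr >= 1.1.0 the same filter can be written with per-operation grouping (a sketch, assuming a recent dplyr):
df %>%
  filter(sum(!is.na(m)) > 1, .by = g)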
One can try a dplyr-based solution; group_by() on g gives the count of non-NA values per remaining category (note that this returns the per-group counts, not the filtered rows):
library(dplyr)
df %>% group_by(g) %>%
filter(!is.na(m)) %>%
filter(n() >=2) %>%
summarise(count = n())
#Result
# # A tibble: 6 x 2
# g count
# <dbl> <int>
# 1 2.00 3
# 2 3.00 2
# 3 5.00 2
# 4 6.00 3
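If the filtered rows themselves are needed rather than the counts, one way (a sketch) is to keep the qualifying groups and semi_join back onto df:
keep <- df %>%
  group_by(g) %>%
  summarise(count = sum(!is.na(m))) %>%
  filter(count >= 2)
# keep only the rows of df whose group survived the count filter
df %>% semi_join(keep, by = "g")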

r group lag sum

I have some data with groups for which I want to compute a summary (sum or mean) over a fixed number of periods. I'm trying to do this with a group_by followed by mutate and then operating with the variable and its dplyr::lag. Here is an example:
library(tidyverse)
df <- data.frame(group = rep(c("A", "B"), 5),
x = c(1, 3, 4, 7, 9, 10, 17, 29, 30, 55))
df %>%
group_by(group) %>%
mutate(cs = x + lag(x, 1, 0) + lag(x, 2, 0) + lag(x, 3, 0)) %>%
ungroup()
Which yields the desired result:
# A tibble: 10 x 3
group x cs
<fctr> <dbl> <dbl>
1 A 1 1
2 B 3 3
3 A 4 5
4 B 7 10
5 A 9 14
6 B 10 20
7 A 17 31
8 B 29 49
9 A 30 60
10 B 55 101
Is there a shorter way to accomplish this? (Here I calculated four values but I actually need twelve or more).
Perhaps you could use the purrr functions reduce and map included with the tidyverse:
library(tidyverse)
df <- data.frame(group = rep(c("A", "B"), 5),
x = c(1, 3, 4, 7, 9, 10, 17, 29, 30, 55))
df %>%
group_by(group) %>%
mutate(cs = reduce(map(0:3, ~ lag(x, ., 0)), `+`)) %>%
ungroup()
#> # A tibble: 10 x 3
#> group x cs
#> <fctr> <dbl> <dbl>
#> 1 A 1 1
#> 2 B 3 3
#> 3 A 4 5
#> 4 B 7 10
#> 5 A 9 14
#> 6 B 10 20
#> 7 A 17 31
#> 8 B 29 49
#> 9 A 30 60
#> 10 B 55 101
To see what's happening, it's easier to look at a simpler example that doesn't require a group.
v <- 1:5
lagged_v <- map(0:3, ~ lag(v, ., 0))
lagged_v
#> [[1]]
#> [1] 1 2 3 4 5
#>
#> [[2]]
#> [1] 0 1 2 3 4
#>
#> [[3]]
#> [1] 0 0 1 2 3
#>
#> [[4]]
#> [1] 0 0 0 1 2
reduce(lagged_v, `+`)
#> [1] 1 3 6 10 14
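For longer windows, a rolling-window helper may be more direct. Here is a sketch using slider (assuming the slider package is installed; zoo::rollapplyr(x, 4, sum, partial = TRUE) is a similar alternative):
library(slider)
df %>%
  group_by(group) %>%
  # sum of the current value and the 3 preceding values within each group;
  # partial windows at the start are summed over whatever is available
  mutate(cs = slide_dbl(x, sum, .before = 3)) %>%
  ungroup()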
