Expand each group to the max n of rows - r

How can I expand a group to length of the max group:
df <- structure(list(ID = c(1L, 1L, 2L, 3L, 3L, 3L), col1 = c("A",
"B", "O", "U", "L", "R")), class = "data.frame", row.names = c(NA,
-6L))
ID col1
1 A
1 B
2 O
3 U
3 L
3 R
Desired Output:
1 A
1 B
NA NA
2 O
NA NA
NA NA
3 U
3 L
3 R

You can take advantage of the fact that df[n_bigger_than_nrow,] gives a row of NAs
dplyr
max_n <- max(count(df, ID)$n)
df %>%
group_by(ID) %>%
summarise(cur_data()[seq(max_n),])
#> `summarise()` has grouped output by 'ID'. You can override using the `.groups`
#> argument.
#> # A tibble: 9 × 2
#> # Groups: ID [3]
#> ID col1
#> <int> <chr>
#> 1 1 A
#> 2 1 B
#> 3 1 <NA>
#> 4 2 O
#> 5 2 <NA>
#> 6 2 <NA>
#> 7 3 U
#> 8 3 L
#> 9 3 R
base R
n <- tapply(df$ID, df$ID, length)
max_n <- max(n)
i <- lapply(n, \(x) c(seq(x), rep(Inf, max_n - x)))
i <- Map(`+`, i, c(0, cumsum(head(n, -1))))
df <- df[unlist(i),]
rownames(df) <- NULL
df$ID <- rep(as.numeric(names(i)), each = max_n)
df
#> ID col1
#> 1 1 A
#> 2 1 B
#> 3 1 <NA>
#> 4 2 O
#> 5 2 <NA>
#> 6 2 <NA>
#> 7 3 U
#> 8 3 L
#> 9 3 R

Here's a base R solution.
split the df by the ID column, then use lapply to iterate over the split df, and rbind with a data frame of NA if there's fewer row than 3 (max(table(df$ID))).
do.call(rbind,
lapply(split(df, df$ID),
\(x) rbind(x, data.frame(ID = NA, col1 = NA)[rep(1, max(table(df$ID)) - nrow(x)), ]))
)
ID col1
1.1 1 A
1.2 1 B
1.3 NA <NA>
2.3 2 O
2.1 NA <NA>
2.1.1 NA <NA>
3.4 3 U
3.5 3 L
3.6 3 R

Here is a possible tidyverse solution. We can use add_row inside of summarise to add n number of rows to each group. I use max(count(df, ID)$n) to get the max group length, then I subtract that from the number of rows in each group to get the total number of rows that need to be added for each group. I use rep to produce the correct number of values that we need to add for each group. Finally, I replace ID with NA when there is an NA in col1.
library(tidyverse)
df %>%
group_by(ID) %>%
summarise(add_row(cur_data(),
col1 = rep(NA_character_,
unique(max(count(df, ID)$n) - n()))),
.groups = "drop") %>%
mutate(ID = replace(ID, is.na(col1), NA))
Output
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
Or another option without using add_row:
library(dplyr)
# Get maximum number of rows for all groups
N = max(count(df,ID)$n)
df %>%
group_by(ID) %>%
summarise(col1 = c(col1, rep(NA, N-length(col1))), .groups = "drop") %>%
mutate(ID = replace(ID, is.na(col1), NA))

Another option could be:
df %>%
group_split(ID) %>%
map_dfr(~ rows_append(.x, tibble(col1 = rep(NA_character_, max(pull(count(df, ID), n)) - group_size(.x)))))
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R

A base R using merge + rle
merge(
transform(
data.frame(ID = with(rle(df$ID), rep(values, each = max(lengths)))),
q = ave(ID, ID, FUN = seq_along)
),
transform(
df,
q = ave(ID, ID, FUN = seq_along)
),
all = TRUE
)[-2]
gives
ID col1
1 1 A
2 1 B
3 1 <NA>
4 2 O
5 2 <NA>
6 2 <NA>
7 3 U
8 3 L
9 3 R
A data.table option may also work
> setDT(df)[, .(col1 = `length<-`(col1, max(df[, .N, ID][, N]))), ID]
ID col1
1: 1 A
2: 1 B
3: 1 <NA>
4: 2 O
5: 2 <NA>
6: 2 <NA>
7: 3 U
8: 3 L
9: 3 R

An option to tidyr::complete the ID and row_new, using row_old to replace ID with NA.
library (tidyverse)
df %>%
group_by(ID) %>%
mutate(
row_new = row_number(),
row_old = row_number()) %>%
ungroup() %>%
complete(ID, row_new) %>%
mutate(ID = if_else(is.na(row_old),
NA_integer_,
ID)) %>%
select(-matches("row_"))
# A tibble: 9 x 2
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA <NA>
4 2 O
5 NA <NA>
6 NA <NA>
7 3 U
8 3 L
9 3 R

n <- max(table(df$ID))
df %>%
group_by(ID) %>%
summarise(col1 =`length<-`(col1, n), .groups = 'drop') %>%
mutate(ID = `is.na<-`(ID, is.na(col1)))
# A tibble: 9 x 2
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R

Another base R solution using sequence.
print(
df[
sequence(
abs(rep(i <- rle(df$ID)$lengths, each = 2) - c(0L, max(i))),
rep(cumsum(c(1L, i))[-length(i) - 1L], each = 2) + c(0L, nrow(df)),
),
],
row.names = FALSE
)
#> ID col1
#> 1 A
#> 1 B
#> NA <NA>
#> 2 O
#> NA <NA>
#> NA <NA>
#> 3 U
#> 3 L
#> 3 R

Related

R replace last nth value with NA by group

I want to replace value(s) with NA by group.
have <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,3,4,5,6,7))
want1 <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,3,NA,5,6,NA))
want2 <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,NA,NA,5,NA,NA))
want1 corresponds to replacing the last obs of value with NA and want2 corresponds to replacing last obs of value & last 2nd value with NA. I'm currently trying to do with with dplyr package but can't seem to get any traction. Any help would be much appreciated. Thanks!
We can use row_number() to test the current row against n() the total rows in the group.
have |>
group_by(id) |>
mutate(
last1 = ifelse(row_number() == n(), NA, value),
last2 = ifelse(row_number() >= n() - 1, NA, value)
)
# # A tibble: 7 × 4
# # Groups: id [2]
# id value last1 last2
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 1
# 2 1 2 2 2
# 3 1 3 3 NA
# 4 1 4 NA NA
# 5 2 5 5 5
# 6 2 6 6 NA
# 7 2 7 NA NA
And a general way to provide variants as different data frames.
lapply(
1:2,
function(k) {
have %>%
group_by(id) %>%
mutate(value=ifelse(row_number() <= (n() - k), value, NA))
}
)
[[1]]
# A tibble: 7 × 2
# Groups: id [2]
id value
<dbl> <dbl>
1 1 1
2 1 2
3 1 3
4 1 NA
5 2 5
6 2 6
7 2 NA
[[2]]
# A tibble: 7 × 2
# Groups: id [2]
id value
<dbl> <dbl>
1 1 1
2 1 2
3 1 NA
4 1 NA
5 2 5
6 2 NA
7 2 NA
Here is a base R way.
have <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,3,4,5,6,7))
want1 <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,3,NA,5,6,NA))
want2 <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,NA,NA,5,NA,NA))
with(have, ave(value, id, FUN = \(x){
x[length(x)] <- NA
x
}))
#> [1] 1 2 3 NA 5 6 NA
with(have, ave(value, id, FUN = \(x){
x[length(x)] <- NA
if(length(x) > 1)
x[length(x) - 1L] <- NA
x
}))
#> [1] 1 2 NA NA 5 NA NA
Created on 2022-06-09 by the reprex package (v2.0.1)
Then reassign these results to column value.

If a column is NA, calculate row mean on other columns using dplyR

In the example below how can I calculate the row mean when column A is NA? The row mean would replace the NA in column A. Using base R, I can use this:
foo <- tibble(A = c(3,5,NA,6,NA,7,NA),
B = c(4,5,4,5,6,4,NA),
C = c(6,5,2,8,8,5,NA))
foo
tmp <- rowMeans(foo[,-1],na.rm = TRUE)
foo$A[is.na(foo$A)] <- tmp[is.na(foo$A)]
foo$A[is.nan(foo$A)] <- NA
Curious how I might do this with dplyR?
You can use ifelse :
library(dplyr)
foo %>%
mutate(A = ifelse(is.na(A), rowMeans(., na.rm = TRUE), A),
A = replace(A, is.nan(A), NA))
# A B C
# <dbl> <dbl> <dbl>
#1 3 4 6
#2 5 5 5
#3 3 4 2
#4 6 5 8
#5 7 6 8
#6 7 4 5
#7 NA NA NA
Here is a solution that not only replace NA in column A, but for all columns in the data frame.
library(dplyr)
foo2 <- foo %>%
mutate(RowMean = rowMeans(., na.rm = TRUE)) %>%
mutate(across(-RowMean, .fns =
function(x) ifelse(is.na(x) & !is.nan(RowMean), RowMean, x))) %>%
select(-RowMean)
Use coalesce:
foo %>%
mutate(m = rowMeans(across(), na.rm = T),
A = if_else(is.na(A) & !is.na(m), m, A)) %>%
select(-m)
# # A tibble: 7 x 3
# A B C
# <dbl> <dbl> <dbl>
# 1 3 4 6
# 2 5 5 5
# 3 3 4 2
# 4 6 5 8
# 5 7 6 8
# 6 7 4 5
# 7 NA NA NA

How to calculate row differences in r when it's not in sequence

I have a data frame like this:
name count
a 3
a 5
a 8
b 2
a 9
b 7
so I want to calculate the row differences group by name. so my code is:
data%>%group_by(Name)%>%mutate(last_count = lag(count),diff = count - last_count)
However, I get a result like the below table
name count last_count diff
a 3 NA NA
a 5 3 2
a 8 5 3
b 2 NA NA
a 9 8 1
b 7 2 5
But what I want should look like this:
name count last_count diff
a 3 NA NA
a 5 3 2
a 8 5 3
b 2 NA NA
a 9 NA NA
b 7 NA NA
Thanks in advance to whoever can help me fix it!
Does this work:
> library(dplyr)
> df %>% mutate(last_count = case_when(name == lag(name) ~ lag(count), TRUE ~ NA_real_),
diff = case_when(name == lag(name) ~ count - lag(count), TRUE ~ NA_real_))
# A tibble: 6 x 4
name count last_count diff
<chr> <dbl> <dbl> <dbl>
1 a 3 NA NA
2 a 5 3 2
3 a 8 5 3
4 b 2 NA NA
5 a 9 NA NA
6 b 7 NA NA
>
We could use rleid to create a grouping column based on the adjacent matching values in the 'name' column and then apply the diff
library(dplyr)
library(data.table)
data %>%
group_by(grp = rleid(name)) %>%
mutate(last_count = lag(count), diff = count - last_count) %>%
ungroup %>%
select(-grp)
-output
# A tibble: 6 x 4
# name count last_count diff
# <chr> <int> <int> <int>
#1 a 3 NA NA
#2 a 5 3 2
#3 a 8 5 3
#4 b 2 NA NA
#5 a 9 NA NA
#6 b 7 NA NA
Or using base R with ave and rle
data$diff <- with(data, ave(count, with(rle(name),
rep(seq_along(values), lengths)), FUN = function(x) c(NA, diff(x)))
data
data <- structure(list(name = c("a", "a", "a", "b", "a", "b"), count = c(3L,
5L, 8L, 2L, 9L, 7L)), class = "data.frame", row.names = c(NA,
-6L))

Add together 2 dataframes in R without losing columns

I have 2 dataframes in R (df1, df2).
A C D
1 1 1
2 2 2
df2 as
A B C
1 1 1
2 2 2
How can I merge these 2 dataframes to produce the following output?
A B C D
2 1 2 1
4 2 4 2
Columns are sorted and column values are added. Both DFs have same number of rows. Thank you in advance.
Code to create DF:
df1 <- data.frame("A" = 1:2, "C" = 1:2, "D" = 1:2)
df2 <- data.frame("A" = 1:2, "B" = 1:2, "C" = 1:2)
nm1 = names(df1)
nm2 = names(df2)
nm = intersect(nm1, nm2)
if (length(nm) == 0){ # if no column names in common
cbind(df1, df2)
} else { # if column names in common
cbind(df1[!nm1 %in% nm2], # columns only in df1
df1[nm] + df2[nm], # add columns common to both
df2[!nm2 %in% nm1]) # columns only in df2
}
# D A C B
#1 1 2 2 1
#2 2 4 4 2
You can try:
library(tidyverse)
list(df2, df1) %>%
map(rownames_to_column) %>%
bind_rows %>%
group_by(rowname) %>%
summarise_all(sum, na.rm = TRUE)
# A tibble: 2 x 5
rowname A B C D
<chr> <int> <int> <int> <int>
1 1 2 1 2 1
2 2 4 2 4 2
By using left_join() from dplyr you won't lose the column
library(tidyverse)
dat1 <- tibble(a = 1:10,
b = 1:10,
c = 1:10)
dat2 <- tibble(c = 1:10,
d = 1:10,
e = 1:10)
left_join(dat1, dat2, by = "c")
#> # A tibble: 10 x 5
#> a b c d e
#> <int> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 2 2 2 2
#> 3 3 3 3 3 3
#> 4 4 4 4 4 4
#> 5 5 5 5 5 5
#> 6 6 6 6 6 6
#> 7 7 7 7 7 7
#> 8 8 8 8 8 8
#> 9 9 9 9 9 9
#> 10 10 10 10 10 10
Created on 2019-01-16 by the reprex package (v0.2.1)
allnames <- sort(unique(c(names(df1), names(df2))))
df3 <- data.frame(matrix(0, nrow = nrow(df1), ncol = length(allnames)))
names(df3) <- allnames
df3[,allnames %in% names(df1)] <- df3[,allnames %in% names(df1)] + df1
df3[,allnames %in% names(df2)] <- df3[,allnames %in% names(df2)] + df2
df3
A B C D
1 2 1 2 1
2 4 2 4 2
Here is a fun base R method with Reduce.
Reduce(cbind,
list(Reduce("+", list(df1[intersect(names(df1), names(df2))],
df2[intersect(names(df1), names(df2))])), # sum results
df1[setdiff(names(df1), names(df2))], # in df1, not df2
df2[setdiff(names(df2), names(df1))])) # in df2, not df1
This returns
A C D B
1 2 2 1 1
2 4 4 2 2
This assumes that both df1 and df2 have columns that are not present in the other. If this is not true, you'd have to adjust the list.
Note also that you could replace Reduce with do.call in both places and you'd get the same result.

NA filling only if "sandwiched" by the same value using dplyr

Ok, here is yet another missing value filling question.
I am looking for a way to fill NAs based on both the previous and next existent values in a column. Standard filling in a single direction is not sufficient for this task.
If the previous and next valid values in a column are not the same, then the chunk remains as NA.
The code for the sample data frame is:
df_in <- tibble(id= 1:12,
var1 = letters[1:12],
var2 = c(NA,rep("A",2),rep(NA,2),rep("A",2),rep(NA,2),rep("B",2),NA))
Thanks,
Comparing na.locf() (last observation carried forward) and na.locf(fromLast = TRUE) (backward):
mutate(df_in,
var_new = if_else(
zoo::na.locf(var2, na.rm = FALSE) ==
zoo::na.locf(var2, na.rm = FALSE, fromLast = TRUE),
zoo::na.locf(var2, na.rm = FALSE),
NA_character_
))
# # A tibble: 12 x 4
# id var1 var2 var_new
# <int> <chr> <chr> <chr>
# 1 1 a NA NA
# 2 2 b A A
# 3 3 c A A
# 4 4 d NA A
# 5 5 e NA A
# 6 6 f A A
# 7 7 g A A
# 8 8 h NA NA
# 9 9 i NA NA
# 10 10 j B B
# 11 11 k B B
# 12 12 l NA NA
Something like this?
df_in %>% mutate(var_new = {
tmp <- var2
tmp[is.na(tmp)] <- "NA"
rl <- rle(tmp)
tibble(before = c(NA, head(rl$values, -1)),
value = rl$values,
after = c(tail(rl$values, -1), NA),
lengths = rl$lengths) %>%
mutate(value = ifelse(value == "NA" & before == after, before, value),
value = ifelse(value == "NA", NA, value)) %>%
select(value, lengths) %>%
unname() %>%
do.call(rep, .)})
# # A tibble: 12 x 4
# id var1 var2 var_new
# <int> <chr> <chr> <chr>
# 1 1 a NA <NA>
# 2 2 b A A
# 3 3 c A A
# 4 4 d NA A
# 5 5 e NA A
# 6 6 f A A
# 7 7 g A A
# 8 8 h NA <NA>
# 9 9 i NA <NA>
# 10 10 j B B
# 11 11 k B B
# 12 12 l NA <NA>
Explanation
Convert NA to "NA" (because rle does not count consecutive NA.)
Create a run length encoded representation of tmp
Now you cna have a look at values beofre and after the relevant blocks
Replace the values.

Resources