I have data frame where some of the values are missing
A 1
A NA
A NA
B NA
B 2
B NA
C NA
C NA
C NA
How can I fill in groups where I have data?
You can also use fill from tidyr:
library(dplyr)
library(tidyr)
df1 %>%
group_by(ID) %>%
fill(v1) %>%
fill(v1, .direction = "up")
Result:
# A tibble: 9 x 2
# Groups: ID [3]
ID v1
<chr> <int>
1 A 1
2 A 1
3 A 1
4 B 2
5 B 2
6 B 2
7 C NA
8 C NA
9 C NA
Credits to #akrun for dput
Alternative solution, though perhaps a bit flawed in how many assumptions it makes:
library(dplyr)
y %>%
group_by(V1) %>%
arrange(V2) %>%
mutate(V2 = V2[1])
# Source: local data frame [9 x 2]
# Groups: V1 [3]
# V1 V2
# (chr) (int)
# 1 A 1
# 2 A 1
# 3 A 1
# 4 B 2
# 5 B 2
# 6 B 2
# 7 C NA
# 8 C NA
# 9 C NA
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), grouped by 'ID', we assign (:=) the column 'v1' as the first non-NA value.
library(data.table)
setDT(df1)[, v1:= v1[!is.na(v1)][1L] , by = ID]
df1
# ID v1
#1: A 1
#2: A 1
#3: A 1
#4: B 2
#5: B 2
#6: B 2
#7: C NA
#8: C NA
#9: C NA
Or using only base R
with(df1, ave(v1, ID, FUN = function(x)
replace(x, is.na(x), x[!is.na(x)][1L])))
#[1] 1 1 1 2 2 2 NA NA NA
data
df1 <- structure(list(ID = c("A", "A", "A", "B", "B", "B", "C", "C",
"C"), v1 = c(1L, NA, NA, NA, 2L, NA, NA, NA, NA)), .Names = c("ID",
"v1"), class = "data.frame", row.names = c(NA, -9L))
Related
How can I expand a group to length of the max group:
df <- structure(list(ID = c(1L, 1L, 2L, 3L, 3L, 3L), col1 = c("A",
"B", "O", "U", "L", "R")), class = "data.frame", row.names = c(NA,
-6L))
ID col1
1 A
1 B
2 O
3 U
3 L
3 R
Desired Output:
1 A
1 B
NA NA
2 O
NA NA
NA NA
3 U
3 L
3 R
You can take advantage of the fact that df[n_bigger_than_nrow,] gives a row of NAs
dplyr
max_n <- max(count(df, ID)$n)
df %>%
group_by(ID) %>%
summarise(cur_data()[seq(max_n),])
#> `summarise()` has grouped output by 'ID'. You can override using the `.groups`
#> argument.
#> # A tibble: 9 × 2
#> # Groups: ID [3]
#> ID col1
#> <int> <chr>
#> 1 1 A
#> 2 1 B
#> 3 1 <NA>
#> 4 2 O
#> 5 2 <NA>
#> 6 2 <NA>
#> 7 3 U
#> 8 3 L
#> 9 3 R
base R
n <- tapply(df$ID, df$ID, length)
max_n <- max(n)
i <- lapply(n, \(x) c(seq(x), rep(Inf, max_n - x)))
i <- Map(`+`, i, c(0, cumsum(head(n, -1))))
df <- df[unlist(i),]
rownames(df) <- NULL
df$ID <- rep(as.numeric(names(i)), each = max_n)
df
#> ID col1
#> 1 1 A
#> 2 1 B
#> 3 1 <NA>
#> 4 2 O
#> 5 2 <NA>
#> 6 2 <NA>
#> 7 3 U
#> 8 3 L
#> 9 3 R
Here's a base R solution.
split the df by the ID column, then use lapply to iterate over the split df, and rbind with a data frame of NA if there's fewer row than 3 (max(table(df$ID))).
do.call(rbind,
lapply(split(df, df$ID),
\(x) rbind(x, data.frame(ID = NA, col1 = NA)[rep(1, max(table(df$ID)) - nrow(x)), ]))
)
ID col1
1.1 1 A
1.2 1 B
1.3 NA <NA>
2.3 2 O
2.1 NA <NA>
2.1.1 NA <NA>
3.4 3 U
3.5 3 L
3.6 3 R
Here is a possible tidyverse solution. We can use add_row inside of summarise to add n number of rows to each group. I use max(count(df, ID)$n) to get the max group length, then I subtract that from the number of rows in each group to get the total number of rows that need to be added for each group. I use rep to produce the correct number of values that we need to add for each group. Finally, I replace ID with NA when there is an NA in col1.
library(tidyverse)
df %>%
group_by(ID) %>%
summarise(add_row(cur_data(),
col1 = rep(NA_character_,
unique(max(count(df, ID)$n) - n()))),
.groups = "drop") %>%
mutate(ID = replace(ID, is.na(col1), NA))
Output
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
Or another option without using add_row:
library(dplyr)
# Get maximum number of rows for all groups
N = max(count(df,ID)$n)
df %>%
group_by(ID) %>%
summarise(col1 = c(col1, rep(NA, N-length(col1))), .groups = "drop") %>%
mutate(ID = replace(ID, is.na(col1), NA))
Another option could be:
df %>%
group_split(ID) %>%
map_dfr(~ rows_append(.x, tibble(col1 = rep(NA_character_, max(pull(count(df, ID), n)) - group_size(.x)))))
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
A base R using merge + rle
merge(
transform(
data.frame(ID = with(rle(df$ID), rep(values, each = max(lengths)))),
q = ave(ID, ID, FUN = seq_along)
),
transform(
df,
q = ave(ID, ID, FUN = seq_along)
),
all = TRUE
)[-2]
gives
ID col1
1 1 A
2 1 B
3 1 <NA>
4 2 O
5 2 <NA>
6 2 <NA>
7 3 U
8 3 L
9 3 R
A data.table option may also work
> setDT(df)[, .(col1 = `length<-`(col1, max(df[, .N, ID][, N]))), ID]
ID col1
1: 1 A
2: 1 B
3: 1 <NA>
4: 2 O
5: 2 <NA>
6: 2 <NA>
7: 3 U
8: 3 L
9: 3 R
An option to tidyr::complete the ID and row_new, using row_old to replace ID with NA.
library (tidyverse)
df %>%
group_by(ID) %>%
mutate(
row_new = row_number(),
row_old = row_number()) %>%
ungroup() %>%
complete(ID, row_new) %>%
mutate(ID = if_else(is.na(row_old),
NA_integer_,
ID)) %>%
select(-matches("row_"))
# A tibble: 9 x 2
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA <NA>
4 2 O
5 NA <NA>
6 NA <NA>
7 3 U
8 3 L
9 3 R
n <- max(table(df$ID))
df %>%
group_by(ID) %>%
summarise(col1 =`length<-`(col1, n), .groups = 'drop') %>%
mutate(ID = `is.na<-`(ID, is.na(col1)))
# A tibble: 9 x 2
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
Another base R solution using sequence.
print(
df[
sequence(
abs(rep(i <- rle(df$ID)$lengths, each = 2) - c(0L, max(i))),
rep(cumsum(c(1L, i))[-length(i) - 1L], each = 2) + c(0L, nrow(df)),
),
],
row.names = FALSE
)
#> ID col1
#> 1 A
#> 1 B
#> NA <NA>
#> 2 O
#> NA <NA>
#> NA <NA>
#> 3 U
#> 3 L
#> 3 R
I have a data frame like this:
name count
a 3
a 5
a 8
b 2
a 9
b 7
so I want to calculate the row differences group by name. so my code is:
data%>%group_by(Name)%>%mutate(last_count = lag(count),diff = count - last_count)
However, I get a result like the below table
name count last_count diff
a 3 NA NA
a 5 3 2
a 8 5 3
b 2 NA NA
a 9 8 1
b 7 2 5
But what I want should look like this:
name count last_count diff
a 3 NA NA
a 5 3 2
a 8 5 3
b 2 NA NA
a 9 NA NA
b 7 NA NA
Thanks in advance to whoever can help me fix it!
Does this work:
> library(dplyr)
> df %>% mutate(last_count = case_when(name == lag(name) ~ lag(count), TRUE ~ NA_real_),
diff = case_when(name == lag(name) ~ count - lag(count), TRUE ~ NA_real_))
# A tibble: 6 x 4
name count last_count diff
<chr> <dbl> <dbl> <dbl>
1 a 3 NA NA
2 a 5 3 2
3 a 8 5 3
4 b 2 NA NA
5 a 9 NA NA
6 b 7 NA NA
>
We could use rleid to create a grouping column based on the adjacent matching values in the 'name' column and then apply the diff
library(dplyr)
library(data.table)
data %>%
group_by(grp = rleid(name)) %>%
mutate(last_count = lag(count), diff = count - last_count) %>%
ungroup %>%
select(-grp)
-output
# A tibble: 6 x 4
# name count last_count diff
# <chr> <int> <int> <int>
#1 a 3 NA NA
#2 a 5 3 2
#3 a 8 5 3
#4 b 2 NA NA
#5 a 9 NA NA
#6 b 7 NA NA
Or using base R with ave and rle
data$diff <- with(data, ave(count, with(rle(name),
rep(seq_along(values), lengths)), FUN = function(x) c(NA, diff(x)))
data
data <- structure(list(name = c("a", "a", "a", "b", "a", "b"), count = c(3L,
5L, 8L, 2L, 9L, 7L)), class = "data.frame", row.names = c(NA,
-6L))
I have data frame where some of the values are missing
A 1
A NA
A NA
B NA
B 2
B NA
C NA
C NA
C NA
How can I fill in groups where I have data?
You can also use fill from tidyr:
library(dplyr)
library(tidyr)
df1 %>%
group_by(ID) %>%
fill(v1) %>%
fill(v1, .direction = "up")
Result:
# A tibble: 9 x 2
# Groups: ID [3]
ID v1
<chr> <int>
1 A 1
2 A 1
3 A 1
4 B 2
5 B 2
6 B 2
7 C NA
8 C NA
9 C NA
Credits to #akrun for dput
Alternative solution, though perhaps a bit flawed in how many assumptions it makes:
library(dplyr)
y %>%
group_by(V1) %>%
arrange(V2) %>%
mutate(V2 = V2[1])
# Source: local data frame [9 x 2]
# Groups: V1 [3]
# V1 V2
# (chr) (int)
# 1 A 1
# 2 A 1
# 3 A 1
# 4 B 2
# 5 B 2
# 6 B 2
# 7 C NA
# 8 C NA
# 9 C NA
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), grouped by 'ID', we assign (:=) the column 'v1' as the first non-NA value.
library(data.table)
setDT(df1)[, v1:= v1[!is.na(v1)][1L] , by = ID]
df1
# ID v1
#1: A 1
#2: A 1
#3: A 1
#4: B 2
#5: B 2
#6: B 2
#7: C NA
#8: C NA
#9: C NA
Or using only base R
with(df1, ave(v1, ID, FUN = function(x)
replace(x, is.na(x), x[!is.na(x)][1L])))
#[1] 1 1 1 2 2 2 NA NA NA
data
df1 <- structure(list(ID = c("A", "A", "A", "B", "B", "B", "C", "C",
"C"), v1 = c(1L, NA, NA, NA, 2L, NA, NA, NA, NA)), .Names = c("ID",
"v1"), class = "data.frame", row.names = c(NA, -9L))
I am trying to fill in blank cells with the value of rows above. Similar to na.locf function, but I have a pattern that needs to be matched. I don't necessarily know how many rows between new values (i.e betweem a,b and c,d).
I have used the na.locf and searched around for a solution to no avail.
df <- df <- data.frame(col1 = c("a","b", NA, NA, NA, NA, "c", "d", NA, NA))
df
# col1
# 1 a
# 2 b
# 3 <NA>
# 4 <NA>
# 5 <NA>
# 6 <NA>
# 7 c
# 8 d
# 9 <NA>
# 10 <NA>
Solution I would like:
df
col1
a
b
a
b
a
b
c
d
c
d
ave(df$col1,
with(rle(!is.na(df$col1)), rep(cumsum(values), lengths)),
FUN = function(x){
rep(x[!is.na(x)], length.out = length(x))
})
# [1] a b a b a b c d c d
Here's way with dplyr. You can drop the group column if needed. -
df %>%
group_by(group = cumsum(is.na(lag(col1)) & !is.na(col1))) %>%
mutate(
col1 = rep(col1[!is.na(col1)], length.out = n())
) %>%
ungroup()
# A tibble: 10 x 2
col1 group
<chr> <int>
1 a 1
2 b 1
3 a 1
4 b 1
5 a 1
6 b 1
7 c 2
8 d 2
9 c 2
10 d 2
I have the following data-set
ID COL1 COL2 COL3
1 22 12 NA
2 2 NA NA
3 1 2 4
4 NA NA NA
The above data needs to be converted into the following format
ID VALUE
1 22
1 12
2 2
3 1
3 2
3 4
Please note that NAs are present in the source data frame which should be ignored in the final table.
For speed with the larger datasets, use the data.table melt method:
library("data.table")
setDT(df)
melt(df, id.vars = "ID", na.rm = TRUE)
# ID variable value
# 1: 1 COL1 22
# 2: 2 COL1 2
# 3: 3 COL1 1
# 4: 1 COL2 12
# 5: 3 COL2 2
# 6: 3 COL3 4
library(dplyr)
library(tidyr)
gather(df, column, value, COL1:COL3, na.rm=TRUE) %>%
select(-column)
In base R, you could use lapply to go through columns and extract non NA elements and corresponding ID.
do.call(rbind, lapply(df[,-1], function(x)
data.frame(ID = df$ID[!is.na(x)], VALUE = x[!is.na(x)])))
# ID VALUE
#COL1.1 1 22
#COL1.2 2 2
#COL1.3 3 1
#COL2.1 1 12
#COL2.2 3 2
#COL3 3 4
If necessary, the order can be changed in one additional step
df2 = do.call(rbind, lapply(df[,-1], function(x)
data.frame(ID = df$ID[!is.na(x)], VALUE = x[!is.na(x)])))
do.call(rbind, split(df2, df2$ID))
# ID VALUE
#1.COL1.1 1 22
#1.COL2.1 1 12
#2 2 2
#3.COL1.3 3 1
#3.COL2.2 3 2
#3.COL3 3 4
DATA
df = structure(list(ID = 1:4, COL1 = c(22L, 2L, 1L, NA), COL2 = c(12L,
NA, 2L, NA), COL3 = c(NA, NA, 4L, NA)), .Names = c("ID", "COL1",
"COL2", "COL3"), class = "data.frame", row.names = c(NA, -4L))
Here is a base R option
d1 <- na.omit(data.frame(ID = rep(df1$ID, each = ncol(df1)-1), VALUE = c(t(df1[-1]))))
d1
# ID VALUE
#1 1 22
#2 1 12
#4 2 2
#7 3 1
#8 3 2
#9 3 4
Or we can use a compact option with data.table
library(data.table)
setDT(df1)[, unlist(.SD), .(ID)][!is.na(V1)]