Last observation carried forward conditional on multiple columns in R

I have a dataset with this structure:
ID = c(1,1,1,1,2,2,2,3,3,3,3)
L40 = c(1, NA, NA, NA, 1, NA, NA, NA, 1, NA, NA)
K50 = c(NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 1)
df = data.frame(ID, L40, K50)
# ID L40 K50
# 1 1 1 NA
# 2 1 NA NA
# 3 1 NA NA
# 4 1 NA NA
# 5 2 1 NA
# 6 2 NA 1
# 7 2 NA NA
# 8 3 NA NA
# 9 3 1 NA
# 10 3 NA NA
# 11 3 NA 1
When a value is missing in L40 or K50, I want to carry forward the last non-missing value in that column, but only when the ID is the same as in the previous row and both L40 and K50 are empty in the current row. I applied the following code:
library(dplyr)
library(tidyr)
df2 <- df %>% group_by(ID) %>% fill(L40:K50)
This does not achieve what I am looking for. I want the previous non-missing value to be carried forward into the next row only when the other columns (except ID) in that row are empty. This is what I want:
ID = c(1,1,1,1,2,2,2,3,3,3,3)
L40 = c(1, 1, 1, 1, 1, NA, NA, NA, 1, 1, NA)
K50 = c(NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, 1)
df3 = data.frame(ID, L40, K50)
df3
# ID L40 K50
# 1 1 1 NA
# 2 1 1 NA
# 3 1 1 NA
# 4 1 1 NA
# 5 2 1 NA
# 6 2 NA 1
# 7 2 NA 1
# 8 3 NA NA
# 9 3 1 NA
# 10 3 1 NA
# 11 3 NA 1

We can use na.locf
library(data.table)
library(zoo)
setDT(df)[, if(any(is.na(K50[-1]))) lapply(.SD, na.locf) else .SD , by = ID]
# ID L40 K50
#1: 1 1 NA
#2: 1 1 NA
#3: 1 1 NA
#4: 1 1 NA
#5: 2 1 NA
#6: 2 NA 1
#7: 3 NA 1
#8: 3 NA 1
#9: 3 NA 1
An option using dplyr would be
library(dplyr)
df %>%
  mutate(ind = rowSums(is.na(.))) %>%
  group_by(ID) %>%
  mutate_each(funs(if(any(ind > 1)) na.locf(., na.rm = FALSE) else .), L40:K50) %>%
  select(-ind)
# ID L40 K50
# <dbl> <dbl> <dbl>
#1 1 1 NA
#2 1 1 NA
#3 1 1 NA
#4 1 1 NA
#5 2 1 NA
#6 2 NA 1
#7 3 NA 1
#8 3 NA 1
#9 3 NA 1

I played around with this question for a while, and with my limited knowledge of R I came up with the following work-around. I have added a date column to the original data frame for the purpose of illustration:
ID = c(1,1,1,1,2,2,2,3,3,3,3)
date = c(1,2,3,4,1,2,3,1,2,3,4)
L40 = c(1, 1, NA, NA, 1, NA, NA, NA, 1, NA, NA)
K50 = c(NA, 1, 1, NA, NA, 1, NA, NA, NA, NA, 1)
df = data.frame(ID, date, L40, K50)
Here is what I did:
# gather the diagnosis columns into rows and keep only those rows where the patient has the associated diagnosis
df1 <- df %>% gather(diagnos, dummy, L40:K50) %>% filter(dummy == 1) %>% arrange(ID, date)
# concatenate across rows by ID and date to collect all diagnoses of an ID at a particular date
df2 <- df1 %>% group_by(ID, date) %>% mutate(diag = paste(diagnos, collapse = " ")) %>% select(-diagnos, -dummy)
# convert into data tables in preparation for the join
Dt1 <- data.table(df)
Dt2 <- data.table(df2)
setkey(Dt1, ID, date)
setkey(Dt2, ID, date)
# Each observation in Dt1 is matched with the observation in Dt2 with the same date or,
# if that particular date is not present, with the nearest previous date:
final <- Dt2[Dt1, roll = TRUE] %>% distinct()
This carries forward the name(s) of the diagnosis until the next observed diagnosis.
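For reference, here is a more direct sketch of the conditional fill itself, applied to the original three-column df from the question and assuming L40 and K50 are the only diagnosis columns: a new run starts at every row that already carries a diagnosis, and filling only inside a run stops the carry-forward as soon as a different diagnosis appears (this reproduces df3 above).
library(dplyr)
library(tidyr)
df %>%
  group_by(ID) %>%
  # a completely empty row stays in the current run; a row that already
  # has a diagnosis starts a new run
  mutate(run = cumsum(!(is.na(L40) & is.na(K50)))) %>%
  group_by(ID, run) %>%
  fill(L40, K50, .direction = "down") %>%
  ungroup() %>%
  select(-run)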

Forming a new column from whichever of two columns isn’t NA [duplicate]

I have a simplified dataframe:
test <- data.frame(
  x = c(1, 2, 3, NA, NA, NA),
  y = c(NA, NA, NA, 3, 2, NA),
  a = c(NA, NA, NA, NA, NA, TRUE)
)
I want to create a new column rating that takes the value of the number in either column x or column y. The dataset is structured in such a way that whenever there is a numeric value in x, there is an NA in y. If both columns are NA, then the value in rating should be NA.
In this case, the expected output is: 1,2,3,3,2,NA
With coalesce:
library(dplyr)
test %>%
  mutate(rating = coalesce(x, y))
x y a rating
1 1 NA NA 1
2 2 NA NA 2
3 3 NA NA 3
4 NA 3 NA 3
5 NA 2 NA 2
6 NA NA TRUE NA
library(dplyr)
test %>%
  mutate(rating = if_else(is.na(x), y, x))
x y a rating
1 1 NA NA 1
2 2 NA NA 2
3 3 NA NA 3
4 NA 3 NA 3
5 NA 2 NA 2
6 NA NA TRUE NA
Here are several solutions.
# Input
test <- data.frame(
  x = c(1, 2, 3, NA, NA, NA),
  y = c(NA, NA, NA, 3, 2, NA),
  a = c(NA, NA, NA, NA, NA, TRUE)
)
# Base R solution
test$rating <- ifelse(!is.na(test$x), test$x,
                      ifelse(!is.na(test$y), test$y, NA))
# dplyr solution
library(dplyr)
test <- test %>%
  mutate(rating = case_when(!is.na(x) ~ x,
                            !is.na(y) ~ y,
                            TRUE ~ NA_real_))
# data.table solution
library(data.table)
setDT(test)
test[, rating := ifelse(!is.na(x), x, ifelse(!is.na(y), y, NA))]
Created on 2022-12-23 with reprex v2.0.2
test <- data.frame(
  x = c(1, 2, 3, NA, NA, NA),
  y = c(NA, NA, NA, 3, 2, NA),
  a = c(NA, NA, NA, NA, NA, TRUE)
)
test$rating <- dplyr::coalesce(test$x, test$y)
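As a side note, coalesce() accepts any number of vectors of the same type, so the same one-liner extends to more candidate columns; z below is a hypothetical extra column added only for illustration.
test$z <- c(NA, NA, NA, NA, 5, NA)                      # hypothetical extra column
test$rating <- dplyr::coalesce(test$x, test$y, test$z)  # first non-NA of x, y, z per row
# rating: 1 2 3 3 2 NA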

R - How to fill in NA values, but only when the ending value is the same as the beginning value?

I have the following example data:
Example <- data.frame(col1 =c(1, NA, NA, 4, NA, NA, 6, NA, NA, NA, 6, 8, NA, 2, NA))
col1
1
NA
NA
4
NA
NA
6
NA
NA
NA
6
8
NA
2
NA
I want to fill the NAs with the value from above, but only if the NAs lie between two identical values. In this example, the first NA gap (between 1 and 4) should not be filled with 1s, but the gap between the first 6 and the second 6 should be filled with 6s. All other values should stay NA.
Therefore, afterwards it should look like:
col1
1
NA
NA
4
NA
NA
6
6
6
6
6
8
NA
2
NA
But in reality I do not have only 15 observations, but over 50,000. Therefore I need an efficient solution, which is more difficult than I thought. I tried to use the fill function but was not able to come up with a solution.
One dplyr and zoo option could be:
library(dplyr)
library(zoo)
Example %>%
  mutate(cond = na.locf0(col1) == na.locf0(col1, fromLast = TRUE),
         col1 = ifelse(cond, na.locf0(col1), col1)) %>%
  select(-cond)
col1
1 1
2 NA
3 NA
4 4
5 NA
6 NA
7 6
8 6
9 6
10 6
11 6
12 8
13 NA
14 2
15 NA
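The same locf/nocb comparison can be written without dplyr, using only zoo; a minimal sketch on the Example data frame from the question:
library(zoo)
fwd <- na.locf0(Example$col1)                   # last observation carried forward
bwd <- na.locf0(Example$col1, fromLast = TRUE)  # next observation carried backward
# fill an NA only when the non-NA values before and after it agree
fill_these <- is.na(Example$col1) & !is.na(fwd) & !is.na(bwd) & fwd == bwd
Example$col1[fill_these] <- fwd[fill_these]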
Here is a dplyr solution:
First I create the data in tibble format:
library(dplyr)
df <- tibble(
  x = c(1, NA_real_, NA_real_,
        4, NA_real_, NA_real_,
        6, NA_real_, NA_real_, NA_real_,
        6, 8, NA_real_, 2, NA_real_)
)
Next, I create two grouping variables which will be helpful in identifying the first and the last non-NA value.
I then save these reference values to ref_start and ref_end.
In the end I overwrite the values of x:
df %>%
  mutate(gr1 = cumsum(!is.na(x))) %>%
  group_by(gr1) %>%
  mutate(ref_start = first(x)) %>%
  ungroup() %>%
  mutate(gr2 = lag(gr1, default = 1)) %>%
  group_by(gr2) %>%
  mutate(ref_end = last(x)) %>%
  ungroup() %>%
  mutate(x = if_else(is.na(x) & ref_start == ref_end, ref_start, x)) %>%
  select(x)   # drop the helper columns so only x remains, as in the output below
# A tibble: 15 x 1
x
<dbl>
1 1
2 NA
3 NA
4 4
5 NA
6 NA
7 6
8 6
9 6
10 6
11 6
12 8
13 NA
14 2
15 NA
df <- data.frame(col1 =c(1, NA, NA, 4, NA, NA, 6, NA, NA, NA, 6, 8, NA, 2, NA))
library(data.table)
library(magrittr)
setDT(df)[!is.na(col1), n := .N, by = col1] %>%        # n: how often each non-NA value occurs in col1
  .[, n := nafill(n, type = "locf")] %>%               # carry that count forward over the NAs
  .[n == 2, col1 := nafill(col1, type = "locf")] %>%   # fill only where the preceding value occurs twice
  .[, n := NULL] %>%
  .[]
#> col1
#> 1: 1
#> 2: NA
#> 3: NA
#> 4: 4
#> 5: NA
#> 6: NA
#> 7: 6
#> 8: 6
#> 9: 6
#> 10: 6
#> 11: 6
#> 12: 8
#> 13: NA
#> 14: 2
#> 15: NA
Created on 2021-10-11 by the reprex package (v2.0.1)
Here is a tidyverse approach using dplyr and tidyr:
Logic:
Create an id column
Remove all NA rows
Flag if the next value is the same
right_join with the original Example df
Fill down flag and the corresponding col1.y
mutate with an ifelse
library(dplyr)
library(tidyr)
Example <- Example %>%
  mutate(id = row_number())
Example %>%
  na.omit() %>%
  mutate(flag = ifelse(col1 == lead(col1), TRUE, FALSE)) %>%
  right_join(Example, by = "id") %>%
  arrange(id) %>%
  fill(col1.y, .direction = "down") %>%
  fill(flag, .direction = "down") %>%
  mutate(col1.x = ifelse(flag == TRUE, col1.y, col1.x), .keep = "unused") %>%
  select(col1 = col1.x)
Output:
col1
1 1
2 NA
3 NA
4 4
5 NA
6 NA
7 6
8 6
9 6
10 6
11 6
12 8
13 NA
14 2
15 NA
The data.table solution above (from Yuriy Saraykin) works only for this particular example. As Daniel Hendrick comments: the NAs keep getting filled after the ending value, where the fill should really stop. For instance, if the data were (6, NA, NA, 6, NA, 8), that solution would give (6, 6, 6, 6, 6, 8).
Here is another proposition with data.table:
library(data.table)
df <- data.table(col1 =c(1, NA, NA, 4, NA, NA, 6, NA, NA, NA, 6, NA, NA, 8, NA, 2, NA))
cond = nafill(df$col1, type = "locf") == nafill(df$col1, type = "nocb")
df[which(cond==T), col1 := nafill(df$col1, type = "locf")[which(cond==T)]]
df$col1
[1] 1 NA NA 4 NA NA 6 6 6 6 6 NA NA 8 NA 2 NA

Create multiple sequences dependent on data frame column

Starting with data with the start of the desired sequences filled in with 1, I need to fill in the NA rows with sequences. Below is the starting data (first two columns) and the desired third column:
I can make this happen with a loop, below, but what is the better R programming way to do it?
for (i in 1:length(df2$col2)) {
  df2$col3[i] <- ifelse(df2$col2[i] == 1, 1, df2$col3[i - 1] + 1)
  if (is.na(df2$col2[i])) df2$col3[i] <- df2$col3[i - 1] + 1
}
Here is a 20-row data set of the first two columns:
structure(list(col1 = c(478.69, 320.45, 503.7, 609.3, 478.19,
478.69, 320.45, 503.7, 609.3, 478.19, 419.633683050051, 552.939975773916,
785.119385505095, 18.2542654918507, 98.6469651805237, 132.587260054424,
697.119552921504, 512.560374778695, 916.425200179219, 14.3385051051155
), col2 = c(1, NA, 1, NA, NA, 1, NA, 1, NA, NA, NA, NA, 1, NA,
NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-20L))
Try:
library(data.table)
df2 <- data.table(df2)
df2[, col3 := col2[1] + 1 * (1:.N - 1), by = .(cumsum(!is.na(col2)))]
You can use ave with seq_along, grouping by cumsum(!is.na(col2)).
df2$col3 <- ave(integer(nrow(df2)), cumsum(!is.na(df2$col2)), FUN=seq_along)
df2
# col1 col2 col3
#1 478.69000 1 1
#2 320.45000 NA 2
#3 503.70000 1 1
#4 609.30000 NA 2
#5 478.19000 NA 3
#6 478.69000 1 1
#7 320.45000 NA 2
#8 503.70000 1 1
#9 609.30000 NA 2
#10 478.19000 NA 3
#11 419.63368 NA 4
#12 552.93998 NA 5
#13 785.11939 1 1
#14 18.25427 NA 2
#15 98.64697 NA 3
#16 132.58726 NA 4
#17 697.11955 NA 5
#18 512.56037 NA 6
#19 916.42520 NA 7
#20 14.33851 NA 8
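A dplyr sketch of the same idea, for a tidyverse pipeline; it assumes col2 starts with a 1 (as in the example data), so the first group is well defined:
library(dplyr)
df2 %>%
  group_by(grp = cumsum(!is.na(col2))) %>%  # a new group starts at every 1 in col2
  mutate(col3 = row_number()) %>%           # the sequence restarts within each group
  ungroup() %>%
  select(-grp)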

Find max value within a data frame interval

I have a dataframe that has x/y values every 5 seconds, with a depth value every second (time column). There is no depth where there is an x/y value.
x <- c("1430934", NA, NA, NA, NA, "1430939")
y <- c("4943206", NA, NA, NA, NA, "4943210")
time <- c(1:6)
depth <- c(NA, 10, 19, 84, 65, NA)
data <- data.frame(x, y, time, depth)
data
x y time depth
1 1430934 4943206 1 NA
2 NA NA 2 10
3 NA NA 3 19
4 NA NA 4 84
5 NA NA 5 65
6 1430939 4943210 6 NA
I would like to calculate the maximum depth between the x/y values that are not NA and add it to a new column in the row of the starting x/y values, so the max depth of rows 2-5. An example of the desired output:
x y time depth newvar
1 1430934 4943206 1 NA 84
2 NA NA 2 10 NA
3 NA NA 3 19 NA
4 NA NA 4 84 NA
5 NA NA 5 65 NA
6 1430939 4943210 6 NA NA
This should repeat whenever a new x/y value is present.
You can use ave and cumsum with !is.na to get the groups for ave like:
data$newvar <- ave(data$depth, cumsum(!is.na(data$x)),
                   FUN = function(x) if (all(is.na(x))) NA else
                     c(max(x, na.rm = TRUE), rep(NA, length(x) - 1)))
data
# x y time depth newvar
#1 1430934 4943206 1 NA 84
#2 <NA> <NA> 2 10 NA
#3 <NA> <NA> 3 19 NA
#4 <NA> <NA> 4 84 NA
#5 <NA> <NA> 5 65 NA
#6 1430939 4943210 6 NA NA
Using dplyr, we can create groups of every 5 rows and update the first row in each group with the max value of the group, ignoring NA values.
library(dplyr)
df %>%
  group_by(grp = ceiling(time/5)) %>%
  mutate(depth = ifelse(row_number() == 1, max(depth, na.rm = TRUE), NA))
In base R, we can use tapply :
inds <- seq(1, nrow(df), 5)
df$depth[inds] <- tapply(df$depth, ceiling(df$time/5), max, na.rm = TRUE)
df$depth[-inds] <- NA
Maybe you can try ave like below
df <- within(df,
             newvar <- ave(depth,
                           ceiling(time/5),
                           FUN = function(x) ifelse(length(x) > 1 & is.na(x), max(na.omit(x)), NA)))
such that
> df
x y time depth newvar
1 1430934 4943206 1 NA 84
2 NA NA 2 10 NA
3 NA NA 3 19 NA
4 NA NA 4 84 NA
5 NA NA 5 65 NA
6 1430939 4943210 6 NA NA
DATA
df <- structure(list(x = c(1430934L, NA, NA, NA, NA, 1430939L), y = c(4943206L,
NA, NA, NA, NA, 4943210L), time = 1:6, depth = c(NA, 10L, 19L,
84L, 65L, NA)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6"))
Here is another option using data.table:
library(data.table)
setDT(data)[, newvar := replace(frollapply(depth, 5L, max, na.rm=TRUE, align="left"),
seq(.N) %% 5L != 1L, NA_integer_)]
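For completeness, a dplyr sketch that groups on the non-NA x values directly instead of fixed blocks of five rows; it leaves depth untouched and writes the group maximum into newvar only on the first row of each group, with NA for groups that contain no depth readings:
library(dplyr)
data %>%
  group_by(grp = cumsum(!is.na(x))) %>%  # a new group starts at every non-NA x
  mutate(newvar = if (all(is.na(depth))) NA_real_ else
           replace(rep(NA_real_, n()), 1L, max(depth, na.rm = TRUE))) %>%
  ungroup() %>%
  select(-grp)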

Replace the values of NA with a sum of previous value and a current value in different column

I have a dataset where I have to fill NA values using the previous value plus the current value in another column. Basically, my data look like this:
library(lubridate)
library(tidyverse)
library(zoo)
df <- tibble(
  Id = c(1, 1, 1, 1, 2, 2, 2, 2),
  Time = ymd(c("2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04",
               "2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04")),
  av = c(18, NA, NA, NA, 21, NA, NA, NA),
  Value = c(121, NA, NA, NA, 146, NA, NA, NA)
)
# A tibble: 8 x 4
Id Time av Value
<dbl> <date> <dbl> <dbl>
1 2012-09-01 18 121
1 2012-09-02 NA NA
1 2012-09-03 NA NA
1 2012-09-04 NA NA
2 2012-09-01 21 146
2 2012-09-02 NA NA
2 2012-09-03 NA NA
2 2012-09-04 NA NA
What I want to do is: where Value is NA, I want to replace it with the sum of the previous Value and the current value of av. If av is NA, it can be replaced with the previous value. I use the na.locf function from the zoo package:
df1 <- df %>% arrange(Id, Time) %>% group_by(Id) %>%
mutate(av = zoo::na.locf(av))
However, filling in Value seems to be more difficult. I can do it using a for loop:
# Back up the Value column for testing
df1$Value_backup <- df1$Value
for (i in 2:nrow(df1)) {
  df1$Value[i] <- ifelse(is.na(df1$Value[i]), df1$av[i] + df1$Value[i - 1], df1$Value[i])
}
This produces the result I want, but for a large dataset I believe there are better ways to do it in R. I tried the complete function from tidyr, but it adds two additional rows:
df1 <- df %>% arrange(Id, Time) %>% group_by(Id) %>%
  mutate(av = zoo::na.locf(av)) %>%
  mutate(num_rows = n()) %>%
  complete(nesting(Id), Value = seq(min(Value, na.rm = TRUE),
                                    (min(Value, na.rm = TRUE) + max(num_rows) * min(na.omit(av))),
                                    min(na.omit(av))))
The output has two extra rows; 10 instead of 8
# A tibble: 10 x 5
# Groups: Id [2]
Id Value Time av num_rows
<dbl> <dbl> <date> <dbl> <int>
1 121 2012-09-01 18 4
1 139 NA NA NA
1 157 NA NA NA
1 175 NA NA NA
1 193 NA NA NA
2 146 2012-09-01 21 4
2 167 NA NA NA
2 188 NA NA NA
2 209 NA NA NA
2 230 NA NA NA
Any help to do it faster without loops would be greatly appreciated.
In the question, av starts with a non-NA value in each group and is followed by NAs, so if this is the general pattern then the following will work. Note that it is good form to close any group_by with ungroup; however, we did not do that below so that we could compare df2 with df1.
df2 <- df %>%
  group_by(Id) %>%
  mutate(Value_backup = Value,
         av = first(av),
         Value = first(Value) + cumsum(av) - av)
identical(df1, df2)
## [1] TRUE
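If av did not follow that pattern (say it changed within a group, or Value had intermittent non-NA entries), a sketch that mirrors the original loop row by row with purrr::accumulate2() could look like this; it only assumes av can be filled forward within each Id:
library(dplyr)
library(tidyr)
library(purrr)
df %>%
  arrange(Id, Time) %>%
  group_by(Id) %>%
  fill(av) %>%
  mutate(Value = unlist(accumulate2(
    Value[-1], av[-1],
    # keep an observed Value; otherwise add the current av to the previous Value
    ~ if (is.na(..2)) ..1 + ..3 else ..2,
    .init = Value[1]
  ))) %>%
  ungroup()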
Note
For reproducibility, first run this (taken from the question, except that we only load the needed packages):
library(dplyr)
library(tibble)
library(lubridate)
df <- tibble(
  Id = c(1, 1, 1, 1, 2, 2, 2, 2),
  Time = ymd(c("2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04",
               "2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04")),
  av = c(18, NA, NA, NA, 21, NA, NA, NA),
  Value = c(121, NA, NA, NA, 146, NA, NA, NA)
)
df1 <- df %>% arrange(Id, Time) %>% group_by(Id) %>%
  mutate(av = zoo::na.locf(av))
df1$Value_backup <- df1$Value
for (i in 2:nrow(df1)) {
  df1$Value[i] <- ifelse(is.na(df1$Value[i]), df1$av[i] + df1$Value[i - 1], df1$Value[i])
}
