Last observation carried forward conditional on value and colums - r

I have a longhitudinal dataframe with a lot of missing values that looks like this.
ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, NA , 2, 0,NA, NA, 3, NA,0, NA, 2, NA, 1,NA,NA)
df = data.frame(ID, date, cond,var)
I would like to carry forward the last observation based on two conditions:
1) when cond=0 it should carry on the observation the higher value of the variable of interest.
2) when cond=1 it should carry forward the lower value of the variable of interest.
Does anyone have an idea on how I could do this in an elegant way?
The final dataset should look like this
ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, 1 , 2, 0, 0, NA, 3, 3, 0, 0,2,2,2,2,2)
final = data.frame(ID, date, cond,var)
So far I was able to carry forward the last observation, but I was unable to impose the conditions
library(zoo)
df <- df %>%
group_by(ID) %>%
mutate(var =
na.locf(var, na.rm = F))
any suggestion is welcomed

This is the use of accumulate2 ie
df%>%
group_by(ID)%>%
mutate(d = unlist(accumulate2(var,cond[-1],function(z,x,y) if(y) min(z,x,na.rm=TRUE) else max(z,x,na.rm=TRUE))))
# A tibble: 15 x 5
# Groups: ID [3]
ID date cond var d
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 1 1
2 1 2 0 NA 1
3 1 3 0 2 2
4 1 4 1 0 0
5 1 5 0 NA 0
6 2 1 0 NA NA
7 2 2 0 3 3
8 2 3 0 NA 3
9 2 4 1 0 0
10 2 5 0 NA 0
11 3 1 0 2 2
12 3 2 0 NA 2
13 3 3 0 1 2
14 3 4 0 NA 2
15 3 5 0 NA 2

I think, if I understand what you are after is this?
ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, NA , 2, 0,NA, NA, 3, NA,0, NA, 2, NA, 1,NA,NA)
df = data.frame(ID, date, cond,var)
Using case_when you can do some conditional checks. I'm unsure if you mean to return the minimum for all of the "ID" field, but this will look at the condition and then lag or lead to find a non missing value
library(dplyr)
df %>%
mutate(var_imput = case_when(
cond == 0 & is.na(var)~lag(x = var, n = 1, default = NA),
cond == 1 & is.na(var)~lead(x = var, n = 1, default = NA),
TRUE~var
))
Which yields:
ID date cond var var_imput
1 1 1 0 1 1
2 1 2 0 NA 1
3 1 3 0 2 2
4 1 4 1 0 0
5 1 5 0 NA 0
6 2 1 0 NA NA
7 2 2 0 3 3
8 2 3 0 NA 3
9 2 4 1 0 0
10 2 5 0 NA 0
11 3 1 0 2 2
12 3 2 0 NA 2
13 3 3 0 1 1
14 3 4 0 NA 1
15 3 5 0 NA NA
If you want to group by ID then you could generate an impute table by ID, then join it with the original table like this:
# enerate input table
input_table <- df %>%
group_by(ID) %>%
summarise(min = min(var, na.rm = T),
max = max(var, na.rm = T)) %>%
gather(cond, value, -ID) %>%
mutate(cond = ifelse(cond == "min", 0, 1))
# Join and impute missing
df %>%
left_join(input_table,by = c("ID", "cond")) %>%
mutate(var_imput = ifelse(is.na(var), value, var))

Related

Get the rowwise minimum of certain columns excluding 0 and NA

I have made a very complex solution to something I feel should have a much simpler solution.
In short what I want:
I want to compute a new column containing the minimum value across 3 columns
I want to ignore zeros and NAs
If I only have zeros and NAs I want a zero
If I have only NAs I want a NA
Here is my solution, it works, but it is very complex and produces a warning.
> library(dplyr)
> df <- data.frame(
+ id = c(1, 2, 3, 4),
+ test1 = c( NA, NA, 2 , 3),
+ test2 = c( NA, 0, 1 , 1),
+ test3 = c(NA, NA, 0 , 2)
+ )
> df2 <- df %>%
+ mutate(nieuw = apply(across(test1:test3), 1, function(x) min(x[x>0]))) %>%
+ rowwise() %>%
+ mutate(nieuw = if_else(is.na(nieuw), max(across(test1:test3), na.rm = TRUE), nieuw)) %>%
+ mutate(nieuw = ifelse(is.infinite(nieuw), NA, nieuw))
> df
id test1 test2 test3
1 1 NA NA NA
2 2 NA 0 NA
3 3 2 1 0
4 4 3 1 2
> df2
# A tibble: 4 x 5
# Rowwise:
id test1 test2 test3 nieuw
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA NA NA NA
2 2 NA 0 NA 0
3 3 2 1 0 1
4 4 3 1 2 1
Warning message:
Problem while computing `nieuw = if_else(...)`.
i no non-missing arguments to max; returning -Inf
i The warning occurred in row 1.
You can create a helper function and then apply it rowwise:
library(dplyr)
safe <- function(x, f, ...) ifelse(all(is.na(x)), NA,
ifelse(all(is.na(x) | x == 0),
0, f(x[x > 0], na.rm = TRUE, ...)))
df %>%
rowwise() %>%
mutate(a = safe(c_across(test1:test3), min))
# A tibble: 4 × 5
# Rowwise:
id test1 test2 test3 a
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA NA NA NA
2 2 NA 0 NA 0
3 3 2 1 0 1
4 4 3 1 2 1
Here is another option. It leverages making zeros and NA's very large and then recodes them at the end:
library(tidyverse)
get_min <- function(data, cols){
data[is.na(data)] <- 1e6
data[data == 0] <- 1e5
nums <- do.call(pmin, select(data, all_of(cols)))
recode(nums, `1e+06` = NA_real_, `1e+05` = 0.)
}
df %>%
mutate(nieuw = get_min(., c("test1", "test2", "test3")))
#> id test1 test2 test3 nieuw
#> 1 1 NA NA NA NA
#> 2 2 NA 0 NA 0
#> 3 3 2 1 0 1
#> 4 4 3 1 2 1

Counting observations and considering condition

I have a database like this:
id <- c(rep(1,3), rep(2, 3), rep(3, 3))
condition <- c(0, 0, 1, 0, 0, 1, 1, 1, 0)
time_point1 <- c(1, 1, NA)
time_point2 <- c(NA, 1, NA)
time_point3 <- c(NA, NA, NA)
time_point4 <- c(1, NA, NA, 1, NA, NA, NA, NA, 1)
data <- data.frame(id, condition, time_point1, time_point2, time_point3, time_point4)
data
id condition time_point1 time_point2 time_point3 time_point4
1 1 0 1 NA NA 1
2 1 0 1 1 NA NA
3 1 1 NA NA NA NA
4 2 0 1 NA NA 1
5 2 0 1 1 NA NA
6 2 1 NA NA NA NA
7 3 1 1 NA NA NA
8 3 1 1 1 NA NA
9 3 0 NA NA NA 1
I want to make a table with how many have the condition == 1 (n_x) and also how many are in each time point (n_t). In case there is none also I want a 0. I tried this:
data %>%
pivot_longer(cols = contains("time_point")) %>%
filter (!is.na(value)) %>%
group_by(name) %>%
mutate(n_t = n_distinct(id)) %>%
ungroup() %>%
filter(condition == 1) %>%
group_by(name) %>%
summarise(n_x = n_distinct(id), n_t = first(n_t))
Obtaining this:
name n_x n_t
<chr> <int> <int>
1 time_point1 1 3
2 time_point2 1 3
Desired Outcome: I want this type of table that considers the cases with condition and without it:
name n_x n_t
1 time_point1 2 6
2 time_point2 1 3
3 time_point3 0 0
4 time_point4 0 3
Thank you!
You can pivot_longer() to be able to group_by() time_points and then summarise just adding up the values. For conditions only sum values where the column values != NA.
data %>%
pivot_longer(cols=c(3:6),names_to = 'point', values_to='values') %>%
group_by(point) %>%
summarise(n_x = sum(condition[!is.na(values)]), n_t = sum(values, na.rm = TRUE))
Output:
# A tibble: 4 x 3
point n_x n_t
<chr> <dbl> <dbl>
1 time_point1 2 6
2 time_point2 1 3
3 time_point3 0 0
4 time_point4 0 3

Calculate sum of n previous rows

I have a quite big dataframe and I'm trying to add a new variable which is the sum of the three previous rows on a running basis, also it should be grouped by ID. The first three rows per ID should be 0. Here's what it should look like.
ID Var1 VarNew
1 2 0
1 2 0
1 3 0
1 0 7
1 4 5
1 1 7
Here's an example dataframe
ID <- c(1, 1, 1, 1, 1, 1)
Var1 <- c(2, 2, 3, 0, 4, 1)
df <- data.frame(ID, Var1)
You can use any of the package that has rolling calculation function with a window size of 3 and lag the result. For example with zoo::rollsumr.
library(dplyr)
df %>%
group_by(ID) %>%
mutate(VarNew = lag(zoo::rollsumr(Var1, 3, fill = 0), default = 0)) %>%
ungroup
# ID Var1 VarNew
# <dbl> <dbl> <dbl>
#1 1 2 0
#2 1 2 0
33 1 3 0
#4 1 0 7
#5 1 4 5
#6 1 1 7
You can use filter in ave.
df$VarNew <- ave(df$Var1, df$ID, FUN=function(x) c(0, 0, 0,
filter(head(df$Var1, -1), c(1,1,1), side=1)[-1:-2]))
df
# ID Var1 VarNew
#1 1 2 0
#2 1 2 0
#3 1 3 0
#4 1 0 7
#5 1 4 5
#6 1 1 7
or using cumsum in combination with head and tail.
df$VarNew <- ave(df$Var1, df$ID, FUN=function(x) {y <- cumsum(x)
c(0, 0, 0, tail(y, -3) - head(y, -3))})
Library runner also helps
library(runner)
df %>% mutate(var_new = sum_run(Var1, k =3, na_pad = T, lag = 1))
ID Var1 var_new
1 1 2 NA
2 1 2 NA
3 1 3 NA
4 1 0 7
5 1 4 5
6 1 1 7
NAs can be mutated to 0 if desired so, easily.

Replace dates of many columns with 1 and NA with 0

There are many columns here, and I need to replace the dates with 1 and NA with 0. I would like a dplyr solution. thank you.
df <- data.frame(
id = c(1,2,3),
diabetes = c("12-12-2007",NA,"2-12-2018"),
lipids = c(NA,NA,"12-12-2015"),
stringsAsFactors = FALSE
)
df %>% mutate(across(-id, ~ifelse(is.na(.), 0, 1)))
id diabetes lipids
1 1 1 0
2 2 0 0
3 3 1 1
You can do :
df[-1] <- +(!is.na(df[-1]))
df
# id diabetes lipids
#1 1 1 0
#2 2 0 0
#3 3 1 1

Replacing all values to 1 after a condition

My current data is like below,
df<-data.frame(id=c(1:5),t1=c(NA,1,0,0,0),t2=c(0,1,0,1,0),
t3=c(NA,0,0,0,1),t4=c(NA,NA,NA,0,0))
And the way I'm trying to restructure this is,
for each id, if there's a "1" in that row, all the 0s in the subsequent columns would change to 1. (but leaving the NA as an NA).
So for id#1, nothing would change since there's no 1 in that row, but for id#2, after 1 in the column t2, any 0s afterwards would be replaced by 1.
i.e., this is what I'm trying to get at the end:
final<-data.frame(id=c(1:5),t1=c(0,1,0,0,0),t2=c(0,1,0,1,0),
t3=c(NA,1,0,1,1),t4=c(NA,NA,NA,1,1))
I've been trying different ways but nothing seems to work... I'd really appreciate any help!!!
In base R we can apply the cummax by row after changing the NA to a lower value and then replace the value back to NA
df[-1] <- t(apply(replace(df[-1], is.na(df[-1]), -999), 1, cummax)) *
NA^(is.na(df[-1]))
df
# id t1 t2 t3 t4
#1 1 NA 0 NA NA
#2 2 1 1 1 NA
#3 3 0 0 0 NA
#4 4 0 1 1 1
#5 5 0 0 1 1
Or use rowCummaxs from matrixStats
library(matrixStats)
df[-1] <- rowCummaxs(as.matrix(replace(df[-1], is.na(df[-1]), -999))) *
NA^(is.na(df[-1]))
With tidyverse you can try:
library(tidyverse)
df %>%
pivot_longer(cols = starts_with("t"), names_to = "Time", values_to = "Value") %>%
group_by(id) %>%
mutate(Cummax = cummax(Value)) %>%
mutate(Value = replace(Value, Value == 0 & Cummax == 1, 1)) %>%
pivot_wider(id_cols = id, names_from = "Time", values_from = "Value")
Output
# A tibble: 5 x 5
# Groups: id [5]
id t1 t2 t3 t4
<int> <dbl> <dbl> <dbl> <dbl>
1 1 NA 0 NA NA
2 2 1 1 1 NA
3 3 0 0 0 NA
4 4 0 1 1 1
5 5 0 0 1 1
Another approach in base R using apply row-wise could be to find out column number where first 1 occurs and replace all the 0 values after it with 1.
df[-1] <- t(apply(df[-1], 1, function(x) {
a_id <- which(x == 1)[1]
if(length(a_id) > 0)
replace(x, x == 0 & seq_along(x) > a_id, 1)
else x
}))
df
# id t1 t2 t3 t4
#1 1 NA 0 NA NA
#2 2 1 1 1 NA
#3 3 0 0 0 NA
#4 4 0 1 1 1
#5 5 0 0 1 1

Resources