modify first entry of grouped data based on subsequent entries - r

Using the dataset below:
df <- structure(list(test = c("1st", "2nd", "3rd", "1st", "2nd", "3rd", "1st", "2nd", "3rd"),
id = c("PID1", "PID1", "PID1", "PID2", "PID2", "PID2", "PID3", "PID3", "PID3"),
date = c("2020-01-01", "2020-01-13", "2020-01-17", "2020-01-01", "2020-01-13", "2020-01-20", "2020-01-01", "2020-01-14", "2020-01-18"),
status_1 = c("Symp", "Symp", "uninfected", "Asymp", "Symp", "uninfected", "Asymp", "Asymp", "uninfected"),
status_2 = c("Symp", "Symp", "uninfected", "pre-Symp", "Symp", "uninfected", "Asymp", "Asymp", "uninfected")),
class = "data.frame",
row.names = c(NA, -9L)
)
I'd like to identify individuals whose status_1 is Asymp at the first test and check whether within 14 days they are Symp.
Such individuals should have their status_2 change to pre-Symp, otherwise, their status should remain the same such e.g. individual with id PID3.
This individual is Asymp after the first test and does not transition to Symp within 14 days so his status remains Asymp
Here's the code I tried to put together but I keep running into errors:
df <- df %>%
mutate(status_2 = case_when(test == "1st" &
status_1 == "Asymp" &
status_1[date + 14] != "Symp" ~ "pre-Symp",
TRUE ~ status_1))

You can create your own custom function which checks for the status within that time interval.
library(dplyr)
check_status <- function(x, date) {
if(first(x) == 'Asymp' & any(x == 'Symp')) {
if (date[which.max(x == 'Symp')] - first(date) < 14)
x[1] <- 'pre-Symp'
}
return(x)
}
Now apply this function by group :
df %>%
mutate(date = as.Date(date)) %>%
group_by(id) %>%
mutate(status_2 = check_status(status_1, date))
# test id date status_1 status_2
# <chr> <chr> <date> <chr> <chr>
#1 1st PID1 2020-01-01 Symp Symp
#2 2nd PID1 2020-01-13 Symp Symp
#3 3rd PID1 2020-01-17 uninfected uninfected
#4 1st PID2 2020-01-01 Asymp pre-Symp
#5 2nd PID2 2020-01-13 Symp Symp
#6 3rd PID2 2020-01-20 uninfected uninfected
#7 1st PID3 2020-01-01 Asymp Asymp
#8 2nd PID3 2020-01-14 Asymp Asymp
#9 3rd PID3 2020-01-18 uninfected uninfected

Related

mutate variable based on other columns with similar names

I have a df here (the desired output, my starting df does not have the Flag variable):
df <- data.frame(
Person = c('1','2','3'),
Date = as.Date(c('2010-09-30', '2012-11-20', '2015-03-11')),
Treatment_1 = as.Date(c('2010-09-30', '2012-11-21', '2015-03-22')),
Treatment_2 = as.Date(c('2011-09-30', 'NA', '2011-03-22')),
Treatment_3 = as.Date(c('2012-09-30', '2015-11-21', '2015-06-22')),
Surgery_1 = as.Date(c(NA, '2016-11-21', '2015-03-12')),
Surgery_2 = as.Date(c(NA, '2017-11-21', '2019-03-12')),
Surgery_3 = as.Date(c(NA, '2018-11-21', '2013-03-12')),
Flag = c('', 'Y', '')
)
and I want to derive the Flag variable based on these conditions:
For any column that starts with Treatment, set Flag to "" if Date = Treatment
For any column that starts with Surgery, set Flag to "" if Date = Surgery OR Date = Surgery +1 OR Date = Surgery - 1 (basically if the Surgery date is on the day, one day before, or one day after the Date variable, set Flag to "").
else set Flag = "Y"
I've looked into mutate_at but that rewrites the variables and assigns values of True/False.
This is wrong but this is my attempt:
df2 <- df %>%
mutate(Flag = case_when(
vars(starts_with("Treatment"), Date == . ) ~ '',
vars(starts_with("Surgery"), Date == . | Date == . - 1 | Date == . + 1) ~ '',
TRUE ~ 'Y')
)
UPDATE 2022-Aug-22
When I change a cell with the same date as the one in row 2:
df <- data.frame(
Person = c('1','2','3'),
Date = as.Date(c('2010-09-30', '2012-11-20', '2015-03-11')),
Treatment_1 = as.Date(c('2010-09-30', '2012-11-21', '2015-03-22')),
Treatment_2 = as.Date(c('2011-09-30', 'NA', '2011-03-22')),
Treatment_3 = as.Date(c('2012-09-30', '2015-11-21', '2015-06-22')),
Surgery_1 = as.Date(c(NA, '2016-11-21', '2015-03-12')),
Surgery_2 = as.Date(c(NA, '2017-11-21', '2019-03-12')),
Surgery_3 = as.Date(c(NA, '2018-11-21', '2012-11-20')),
Flag = c('', 'Y', '')
)
and then re-run the base R solution, the Flag in the second row is no longer "Y" but it should be as in that row, it doesn't meet any of the above conditions.
We can use rowwise and c_across along with any for each condition in case_when. Then, we can make a list for the Date (and +1, -1 days) for Surgery to match.
library(tidyverse)
df %>%
rowwise() %>%
mutate(Flag = case_when(
any(c_across(starts_with("Treatment")) == Date) ~ "",
any(c_across(starts_with("Surgery")) %in% c(Date, (Date +1), (Date-1))) ~ "",
TRUE ~ "Y"
))
Output
Person Date Treatment_1 Treatment_2 Treatment_3 Surgery_1 Surgery_2 Surgery_3 Flag
<chr> <date> <date> <date> <date> <date> <date> <date> <chr>
1 1 2010-09-30 2010-09-30 2011-09-30 2012-09-30 NA NA NA ""
2 2 2012-11-20 2012-11-21 NA 2015-11-21 2016-11-21 2017-11-21 2018-11-21 "Y"
3 3 2015-03-11 2015-03-22 2011-03-22 2015-06-22 2015-03-12 2019-03-12 2013-03-12 ""
Update
Here is a possible base R solution that is a lot quicker than tidyverse. This could be done in one line of code, but I decided that readability is better. First, I duplicate the Surgery columns so that we have +1 day and -1 day, and then convert these columns to character. Then, I subset the Treatment columns and convert to character. I convert to character as you cannot compare Date with %in% or ==. Then, I bind the date, treatment, and surgery columns together (a). Then, I use an ifelse for if the Date is in any of the columns but doing it row by row with apply, then we return "" and if not then return Y. Then, I bind the result back to the original dataframe (minus Flag from your original dataframe).
dup_names <- colnames(df)[startsWith(colnames(df), "Surgery")]
surgery <-
cbind(df[dup_names], setNames(df[dup_names] + 1, paste0(dup_names, "_range1")))
surgery <-
sapply(cbind(surgery, setNames(df[dup_names] - 1, paste0(
dup_names, "_range2"
))), as.character)
treatment <-
sapply(df[startsWith(colnames(df), "Treatment")], as.character)
a <- cbind(Date = as.character(df$Date), treatment, surgery)
cbind(subset(df, select = -Flag),
Flag = ifelse(apply(a[,1]==a[,2:ncol(a)], 1, any, na.rm = TRUE), "", "Y"))
Benchmark
Here is an alternative using across approach:
library(tidyverse)
df %>%
mutate(across(starts_with("Treatment"), ~as.numeric(. %in% Date), .names ="new_{.col}"),
across(starts_with("Surgery"), ~as.numeric(. %in% c(Date, Date+1, Date-1)), .names ="new_{.col}")) %>%
mutate(Flag = ifelse(rowSums(select(., contains('new')))==1, "", "Y"), .keep="used") %>%
bind_cols(df)
Flag Person Date Treatment_1 Treatment_2 Treatment_3 Surgery_1 Surgery_2 Surgery_3
1 1 2010-09-30 2010-09-30 2011-09-30 2012-09-30 <NA> <NA> <NA>
2 Y 2 2012-11-20 2012-11-21 <NA> 2015-11-21 2016-11-21 2017-11-21 2018-11-21
3 3 2015-03-11 2015-03-22 2011-03-22 2015-06-22 2015-03-12 2019-03-12 2013-03-12
Updated to add data.table approach
If you want a data.table approach, here it is:
df[melt(df, id=c(1,2))[,flag:=fifelse(
(str_starts(variable,"T") & value==Date) |
(str_starts(variable,"S") & abs(value-Date)<=1),"", "Y")][
, .(flag=min(flag,na.rm=T)), Person], on=.(Person)]
Output
Person Date Treatment_1 Treatment_2 Treatment_3 Surgery_1 Surgery_2 Surgery_3 flag
1: 1 2010-09-30 2010-09-30 2011-09-30 2012-09-30 <NA> <NA> <NA>
2: 2 2012-11-20 2012-11-21 <NA> 2015-11-21 2016-11-21 2017-11-21 2018-11-21 Y
3: 3 2015-03-11 2015-03-22 2011-03-22 2015-06-22 2015-03-12 2019-03-12 2013-03-12
I like Andrew's approach, but I was working on this when his answer came in, so here it is in case you are interested
df %>% inner_join(
pivot_longer(df, cols=Treatment_1:Surgery_3) %>%
mutate(flag=case_when(
(str_starts(name,"T") & value==Date) | (str_starts(name,"S") & abs(value-Date)<=1) ~ "",
TRUE ~"Y")) %>%
group_by(Person) %>%
summarize(flag = min(flag))
)
Output:
Person Date Treatment_1 Treatment_2 Treatment_3 Surgery_1 Surgery_2 Surgery_3 flag
1 1 2010-09-30 2010-09-30 2011-09-30 2012-09-30 <NA> <NA> <NA>
2 2 2012-11-20 2012-11-21 <NA> 2015-11-21 2016-11-21 2017-11-21 2018-11-21 Y
3 3 2015-03-11 2015-03-22 2011-03-22 2015-06-22 2015-03-12 2019-03-12 2013-03-12

How can I create new columns, based on the numeric value of another column

I have a df (chpt4) with 1000+ participants, and the dates when tests where taken. I would like to accomodate the dates according to how many months have passed between the follow up (t1:t4) and the baseline (t0). For this purpose I created 4 additional columns (difft0t2:difft0t4) that show exaclty the months elapsed between the tests. The image is what I have now.
I am grouping the months in 5 different categories: (also I thought this vectors would help me as a counter)
FU6 <- 1:9
FU12 <- 10:18
FU24 <- 19:30
FU36 <- 31:42
FU48 <- 43:54
My original idea was to start indexing the values of the difft0t1 column, that belong to the above ranges using which()
which(chpt4$difft0t1 %in% c(FU6)) #this works
which(chpt4$difft0t1 %in% c(FU14)) #this doesn't work at all
...and use that outcome number, as an index of which element to paste into another column. Its just not working.
keeping with the image example from lines 243 and 244, I would like to outcome columns to look like this:
baseline
FU6
FU12
FU24
FU36
FU48
2012-02-24
NA
2013-09-06
2014-02-21
2015-06-23
NA
2012-05-24
NA
2013-05-16
NA
2015-04-20
2016-05-12
I think you need this
library (tidyverse)
df %>% pivot_longer(cols = -id, names_to = "Test", values_to = "Dates") %>%
group_by(id) %>% mutate(new_col = as.numeric(round((Dates - first(Dates))/30,0))) %>%
mutate(new_col = case_when(new_col == 0 ~ "Baseline",
new_col %in% 1:9 ~ "FU6",
new_col %in% 10:18 ~ "FU12",
new_col %in% 19:30 ~ "FU24",
new_col %in% 31:42 ~ "FU36",
new_col %in% 43:54 ~ "FU48")) %>% filter(!is.na(new_col)) %>%
select(-Test) %>% pivot_wider(id_cols = "id", names_from = "new_col", values_from = "Dates", values_fn = min)
# A tibble: 4 x 6
# Groups: id [4]
id Baseline FU12 FU24 FU36 FU48
<chr> <date> <date> <date> <date> <date>
1 waa000 2012-10-04 2013-09-05 NA NA NA
2 waf84 2012-02-24 NA 2013-09-06 2015-06-23 NA
3 waq593 2012-05-24 2013-05-16 NA 2015-04-20 2016-05-12
4 wcu776 2013-01-24 2014-01-23 NA NA NA
NOTE whenever there will be two dates in one group, minimum/first of those will be displayed. FU6 category will automatically in picture once the appropriate data is used.
sample data used
dput(df)
> dput(df)
structure(list(id = c("waa000", "waf84", "waq593", "wcu776"),
t0 = structure(c(15617, 15394, 15484, 15729), class = "Date"),
t1 = structure(c(15953, 15954, 15841, 16093), class = "Date"),
t2 = structure(c(NA, 16122, 16545, NA), class = "Date"),
t3 = structure(c(NA, 16609, 16933, NA), class = "Date"),
t4 = structure(c(NA_real_, NA_real_, NA_real_, NA_real_), class = "Date")), row.names = c(NA,
-4L), class = "data.frame")
> df
id t0 t1 t2 t3 t4
1 waa000 2012-10-04 2013-09-05 <NA> <NA> <NA>
2 waf84 2012-02-24 2013-09-06 2014-02-21 2015-06-23 <NA>
3 waq593 2012-05-24 2013-05-16 2015-04-20 2016-05-12 <NA>
4 wcu776 2013-01-24 2014-01-23 <NA> <NA> <NA>

How to repeate the value of the last day of February for a leap year in R?

I have a data.frame that doesn't account for leap year (ie all years are 365 days). I would like to repeat the last day value in February during the leap year. The DF in my code below has fake data set, I intentionally remove the leap day value in DF_NoLeapday. I would like to add a leap day value in DF_NoLeapday by repeating the value of the last day of February in a leap year (in our example it would Feb 28, 2004 value). I would rather like to have a general solution to apply this to many years data.
set.seed(55)
DF <- data.frame(date = seq(as.Date("2003-01-01"), to= as.Date("2005-12-31"), by="day"),
A = runif(1096, 0,10),
Z = runif(1096,5,15))
DF_NoLeapday <- DF[!(format(DF$date,"%m") == "02" & format(DF$date, "%d") == "29"), ,drop = FALSE]
We can use complete on the 'date' column which is already a Date class to expand the rows to fill in the missing dates
library(dplyr)
library(tidyr)
out <- DF_NoLeapday %>%
complete(date = seq(min(date), max(date), by = '1 day'))
dim(out)
#[1] 1096 3
out %>%
filter(date >= '2004-02-28', date <= '2004-03-01')
# A tibble: 3 x 3
# date A Z
# <date> <dbl> <dbl>
#1 2004-02-28 9.06 9.70
#2 2004-02-29 NA NA
#3 2004-03-01 5.30 7.35
By default, the other columns values are filled with NA, if we need to change it to a different value, it can be done within complete with fill
If we need the previous values, then use fill
out <- out %>%
fill(A, Z)
out %>%
filter(date >= '2004-02-28', date <= '2004-03-01')
# A tibble: 3 x 3
# date A Z
# <date> <dbl> <dbl>
#1 2004-02-28 9.06 9.70
#2 2004-02-29 9.06 9.70
#3 2004-03-01 5.30 7.35

search every nth consecutive or frozen character in R

date val cal_val
1/12/2017 0:15 (0_04),(1_08),(0_12),(1_14) (0_04),(1_08),(0_12),(1_14)
1/12/2017 0:30 (0_22),(0_25),(1_29) (0_22),(1_29)
1/12/2017 0:45 (1_34),(1_38),(0_40),(1_44) (1_38),(0_40),(1_44)
1/12/2017 1:00 (1_47),(1_49),(1_53),(1_57),(0_59) (1_57),(0_59)
1/12/2017 1:15 (0_07),(0_09),(0_10),(0_13),(1_14) (0_7),(1_14)
How to search every single char after special char "(" and if they are consecutive or frozen
value with "0" then then consider min of value just after "_" else if it is "1" consider from max position , if there is no consecutive values the it remains
same.
i.e in row_1 : there is no consecutive values.
row_2 : (0_22),(0_25) are consecutive then consider min i.e (0_22) and later
row_3 : (1_34),(1_38) are consecutive then consider max i.e (1_38) and later
row_4 : (0_07),(0_09),(0_10),(0_13),(1_14) are consecutive then consider min i.e (0_7) and later
Thanks in advance.
Here's a tidyverse solution:
You can use stringr functions to pull out the 0-matching and 1-matching cases separately, and then combine them after applying min/max as specified:
df %>%
rowwise() %>%
mutate(
zero = min(
as.numeric(
str_extract_all(
str_extract(val, "(\\(0_\\d+\\),){2,}"), # find 0-consecutives
"\\d{2}")[[1]])), # pull out the 2-digit values
one = max(
as.numeric(
str_extract_all(
str_extract(val, "(\\(1_\\d+\\),){2,}"), # find 1-consecutives
"\\d{2}")[[1]])),
final = sum(zero, one, na.rm=TRUE))
# A tibble: 5 x 5
date val zero one final
<chr> <chr> <dbl> <dbl> <dbl>
1 1/12/2017 0:15 (0_04),(1_08),(0_12),(1_14) NA NA 0.
2 1/12/2017 0:30 (0_22),(0_25),(1_29) 22. NA 22.
3 1/12/2017 0:45 (1_34),(1_38),(0_40),(1_44) NA 38. 38.
4 1/12/2017 1:00 (1_47),(1_49),(1_53),(1_57… NA 57. 57.
5 1/12/2017 1:15 (0_07),(0_09),(0_10),(0_13… 7. NA 7.
Another approach could be
library(tidyverse)
library(data.table)
#prepare data to count consecutive 0 or 1
df1 <- df %>%
mutate(val = gsub("[()]", "", val)) %>%
separate_rows(val, sep = ",") %>%
separate("val", c("val_pre", "val_post"))
#identify consecutive 0 or 1 - TRUE in 'flag' column indicates consecutive 0 or 1
setDT(df1)[, seq_ind := seq(.N), by = .(date_col, rleid(val_pre))
][, flag := shift(seq_ind, type="lead",) > 1 | seq_ind > 1, by = date_col]
#filter consecutive rows. In there zero's repetition is replaced with min value & 1's repetition with max value
df2 <- setDF(df1) %>%
filter(flag == T) %>%
group_by(date_col, val_pre) %>%
mutate(val_post = ifelse(val_pre == 0, min(val_post), max(val_post))) %>%
#row-bind non-consecutive rows as is
bind_rows(setDF(df1) %>% filter(flag == F | is.na(flag))) %>%
select(-seq_ind, -flag) %>%
distinct() %>%
mutate(cal_val = paste0("(", val_pre, "_", val_post, ")")) %>%
group_by(date_col) %>%
summarise(cal_val = paste(cal_val, collapse = ","))
which gives
df2
date_col cal_val
1 1/12/2017 0:15 (0_04),(1_08),(0_12),(1_14)
2 1/12/2017 0:30 (0_22),(1_29)
3 1/12/2017 0:45 (1_38),(0_40),(1_44)
4 1/12/2017 1:00 (1_57),(0_59)
5 1/12/2017 1:15 (0_07),(1_14)
Sample data:
df <- structure(list(date_col = c("1/12/2017 0:15", "1/12/2017 0:30",
"1/12/2017 0:45", "1/12/2017 1:00", "1/12/2017 1:15"), val = c("(0_04),(1_08),(0_12),(1_14)",
"(0_22),(0_25),(1_29)", "(1_34),(1_38),(0_40),(1_44)", "(1_47),(1_49),(1_53),(1_57),(0_59)",
"(0_07),(0_09),(0_10),(0_13),(1_14)")), .Names = c("date_col",
"val"), class = "data.frame", row.names = c(NA, -5L))

Creating conditional sums (based on dates) as new column of data frame in R

I'm trying to do some feature engineering in R. Let's assume I have the following dataframe:
events = data.frame(patient = c("A","A","A","A","B","B","B"),
date = as.Date(c("2017-12-15", "2018-01-09", "2018-01-31", "2018-02-05",
"2017-12-12", "2017-12-12", "2018-02-01")),
type = c("AnE","Inpatient","Inpatient","Inpatient","AnE","AnE",
"Inpatient"))`
I now want to add a column with the sum of "Inpatient" events from the same patient in the previous 30 days.
Is there a straight-forward way of doing this (that doesn't involve for loops)?
Given your data set I would create some handle variables and run a data.table approach.
First I add the date of last period by patient. Then, I sum how many times "Inpatient" appears in the data set by patient and dates of last period that are sooner than 30 days from the current date.
library(data.table)
events = data.table(patient = c("A","A","A","A","B","B","B"),
date = as.Date(c("2017-12-15", "2018-01-09", "2018-01-31", "2018-02-05",
"2017-12-12", "2017-12-12", "2018-02-01")),
type = c("AnE","Inpatient","Inpatient","Inpatient","AnE","AnE",
"Inpatient"))
events = events[order(date), .SD, by = patient]
events[, date_t1 := lag(date), by = patient]
events[, timesInpatient := cumsum(type=="Inpatient"), by = .(patient, date_t1 > date - 30)]
The result looks like this
patient date type date1 timesInpatient
1: B 2017-12-12 AnE <NA> 0
2: B 2017-12-12 AnE 2017-12-12 0
3: B 2018-02-01 Inpatient 2017-12-12 1
4: A 2017-12-15 AnE <NA> 0
5: A 2018-01-09 Inpatient 2017-12-15 1
6: A 2018-01-31 Inpatient 2018-01-09 2
7: A 2018-02-05 Inpatient 2018-01-31 3
This might be a little less terse than the data.table approach, but you could potentially use span and %within% from the lubridate package.
Here's an example of how they work:
# creating a span object and a vector of dates
span <- lubridate::interval("2018-01-01", "2018-01-30")
dates <- as.Date(c("2018-01-01", "2018-01-30", "2018-01-03", "2018-02-01"))
dates %within% span
[1] TRUE TRUE TRUE FALSE
# adding a vector indicating inpatient visits
inpatient_visit <- c(TRUE, FALSE, TRUE, FALSE)
# counting dates are both fall within the span and are inpatient visits
sum(dates %within% span & visit)
[1] 2
You could then use a split-apply-combine approach (using split and purrr:map_df) and repeat this counting process for each patient in your dataset:
library(dplyr)
library(lubridate)
events = data.frame(patient = c("A","A","A","A","B","B","B"),
date = as.Date(c("2017-12-15", "2018-01-09", "2018-01-31", "2018-02-05",
"2017-12-12", "2017-12-12", "2018-02-01")),
type = c("AnE","Inpatient","Inpatient","Inpatient","AnE","AnE",
"Inpatient"))
count_visits <- function(df) {
res <- map(df$span, ~ sum(df$date %within% .x & df$inpatient))
df$count <- res
return(df)
}
events <- events %>%
mutate(inpatient = type == "Inpatient",
span = interval(date - days(30), date)) %>%
split(.$patient) %>%
map_df(count_visits) %>%
select(-inpatient, -span) %>%
arrange(date)
events
patient date type count
1 B 2017-12-12 AnE 0
2 B 2017-12-12 AnE 0
3 A 2017-12-15 AnE 0
4 A 2018-01-09 Inpatient 1
5 A 2018-01-31 Inpatient 2
6 B 2018-02-01 Inpatient 1
7 A 2018-02-05 Inpatient 3

Resources