Calculating sums based on custom rules in a data frame - R

Preferably using data.table in R: I want to calculate the sum of DIAM by ID and CYCLE number based on the following rules:
if any DIAM for a particular subject and cycle is "NE", the SUM cannot be calculated (must return NA)
if any DIAM is NA, calculate the sum ignoring the NA (i.e. treat it as 0)
if none is NA, calculate the sum as normal
I would also like to convert CYCLE to a numeric value, with BASELINE representing 0.
dfin <-
ID CYCLE NUM DIAM
1 BASELINE 1 8
1 BASELINE 2 4
1 CYCLE 1 1 6
1 CYCLE 1 2 2
1 CYCLE 2 1 6
1 CYCLE 2 2 NE
1 CYCLE 3 1 6
1 CYCLE 3 2 NA
dfout <-
ID CYCLE SUM
1 0 12
1 1 8
1 2 NA
1 3 6
This needs to be applied for every subject. There are many cycles, but this is just an example.

Here is one option. Grouped by 'ID' and the matched index of 'CYCLE' (as shown in the expected output), change the 'DIAM' values to NA if any of the 'DIAM' values is "NE", then summarise by taking the sum of 'DIAM', making sure that NA is returned when all of the values are NA.
library(tidyverse)
dfin %>%
  group_by(ID, CYCLE = match(CYCLE, unique(CYCLE)) - 1) %>%
  mutate(DIAM = as.numeric(replace(DIAM, any(DIAM == "NE"), NA))) %>%
  summarise(SUM = NA^all(is.na(DIAM)) * sum(DIAM, na.rm = TRUE))
# A tibble: 4 x 3
# Groups: ID [?]
# ID CYCLE SUM
# <int> <dbl> <dbl>
#1 1 0 12
#2 1 1 8
#3 1 2 NA
#4 1 3 6
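The NA^all(is.na(DIAM)) term is just a compact trick: NA^TRUE is NA and NA^FALSE is 1, so multiplying the sum by it returns NA only when every value in the group is NA. A quick standalone illustration (not tied to the data above):
x <- c(NA_real_, NA_real_)
NA^all(is.na(x)) * sum(x, na.rm = TRUE)   # NA^TRUE  -> NA, so the result is NA
y <- c(6, NA)
NA^all(is.na(y)) * sum(y, na.rm = TRUE)   # NA^FALSE -> 1,  so the result is 6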
Or use an if/else condition after the group_by step
dfin %>%
  group_by(ID, CYCLE = match(CYCLE, unique(CYCLE)) - 1) %>%
  summarise(SUM = if("NE" %in% DIAM) NA else sum(as.numeric(DIAM), na.rm = TRUE))
Or using the same logic with data.table
library(data.table)
setDT(dfin)[, .(SUM = if("NE" %in% DIAM) NA_real_ else
                  sum(as.numeric(DIAM), na.rm = TRUE)),
            by = .(ID, CYCLE = rleid(CYCLE) - 1)]
# ID CYCLE SUM
#1: 1 0 12
#2: 1 1 8
#3: 1 2 NA
#4: 1 3 6
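Here rleid(CYCLE) - 1 works because rleid() gives consecutive identical values the same run id (BASELINE -> 1, CYCLE 1 -> 2, ...), so subtracting 1 yields the numeric cycle. A small illustration, assuming the cycles appear in order as in the question:
library(data.table)
cyc <- c("BASELINE", "BASELINE", "CYCLE 1", "CYCLE 1", "CYCLE 2", "CYCLE 2")
rleid(cyc) - 1
#[1] 0 0 1 1 2 2
Note that rleid() keeps counting across the whole column, so with several subjects the CYCLE label would not reset to 0 at each subject's BASELINE; the match(CYCLE, unique(CYCLE)) approach above does not have that issue as long as the labels are consistent.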
data
dfin <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
CYCLE = c("BASELINE",
"BASELINE", "CYCLE 1", "CYCLE 1", "CYCLE 2", "CYCLE 2", "CYCLE 3",
"CYCLE 3"), NUM = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), DIAM = c("8",
"4", "6", "2", "6", "NE", "6", NA)), row.names = c(NA, -8L),
class = "data.frame")

# Data created
dfin<-data.table("ID" = rep(x = 1,times = 8),"CYCLE" = c("BASELINE","BASELINE","CYCLE 1","CYCLE 1","CYCLE 2","CYCLE 2","CYCLE 3","CYCLE 3"),
"NUM" = rep(x = c(1,2),times = 4),"DIAM" = c(8,4,6,2,6,"NE",6,NA))
# CYCLE transformed: BASELINE -> 0, "CYCLE n" -> n (also works for multi-digit cycle numbers)
dfin[, CYCLE := as.numeric(ifelse(CYCLE == "BASELINE", "0",
                                  sub("CYCLE ", "", CYCLE)))]
# SUM computed: NA if any DIAM is "NE", otherwise the sum ignoring NA
dfin2 <- dfin[, .(SUM = if ("NE" %in% DIAM) {
  NA_real_
} else {
  sum(as.numeric(DIAM), na.rm = TRUE)
}), by = c("ID", "CYCLE")]
Hope this helps!

Creating a dataframe based on a condition (with a probability dependent on age)?

I'm trying to create a synthetic dataset, but I'm struggling a bit.
Is there a way to create a column based on the values in another column?
It is a between-subjects design and my participants are divided into two conditions
(condition 1 = 0, condition 2 = 1).
I want to make a column "Trial_1" (0 = Absence, 1 = Presence), but only for the participants in one of the conditions.
df <- data.frame(
  Id = seq(1, 10, by = 1),
  Age = sample(1:5, 10, replace = TRUE),
  Condition = sample(0:1, 10, replace = TRUE),
  Trial_1 = sample(0:1, 10, replace = TRUE, prob = c(0.3, 0.7)))
## BUT, I want Trial_1 to be filled in only for the participants with Condition = 1
And if there is an easy way to make the probability based on age, that would be amazing!
Thanks in advance!
You can create df with Id, Age, Condition columns first, and then use rowwise() and mutate() (both from dplyr package) to create Trial_1.
library(dplyr)
df %>%
  rowwise() %>%
  mutate(Trial_1 = sample(0:1, 1, prob = c(1 - Age/10, Condition * Age/10)))
Here, note that the probability of 0 and 1 is 1-Age/10 and Age/10, respectively, to make it age-dependent; you would want to change this to whatever dependence on age you would like.
Also, note that I multiply the probability corresponding to 1 by Condition, ensuring that Condition=0 rows always get 0.
Output:
Id Age Condition Trial_1
<dbl> <int> <int> <int>
1 1 1 0 0
2 2 3 1 1
3 3 1 0 0
4 4 4 1 0
5 5 3 1 0
6 6 5 1 1
7 7 4 1 0
8 8 5 1 0
9 9 3 0 0
10 10 2 1 0
If you prefer those rows to be NA, then do something like this instead:
df %>%
  rowwise() %>%
  mutate(Trial_1 = if_else(Condition == 1,
                           sample(0:1, 1, prob = c(1 - Age/10, Age/10)),
                           NA_integer_))
Output:
Id Age Condition Trial_1
<dbl> <int> <int> <int>
1 1 1 0 NA
2 2 3 1 1
3 3 1 0 NA
4 4 4 1 1
5 5 3 1 0
6 6 5 1 1
7 7 4 1 0
8 8 5 1 0
9 9 3 0 NA
10 10 2 1 0
Input:
structure(list(Id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), Age = c(1L,
3L, 1L, 4L, 3L, 5L, 4L, 5L, 3L, 2L), Condition = c(0L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 0L, 1L)), class = "data.frame", row.names = c(NA,
-10L))
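As a side note, rowwise() is fine here, but for larger data a vectorized equivalent of the first variant (assuming the same Age/10 probability, and 1 meaning presence) can draw the whole column in one call; Condition * Age/10 is 0 whenever Condition is 0, so those rows always get 0:
library(dplyr)
df %>%
  mutate(Trial_1 = rbinom(n(), size = 1, prob = Condition * Age / 10))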
I'd do it in two steps - first create the dataframe and then the Trial column. My solution isn't super elegant, but it's straightforward and doesn't require anything but base R. I hope it helps.
df <- data.frame(
Id = seq(1, 10, by = 1),
Age = sample(1:5, 10, replace = TRUE),
Condition = sample(0:1, 10, replace = TRUE)
)
df$Trial[df$Condition == 1] <- sample(0:1, sum(df$Condition), prob = c(0.3, 0.7), replace = TRUE)
# more generally, if you want to assign to Trial only when Condition is x
# df$Trial[df$Condition == x] <- sample(0:1, sum(df$Condition == x), prob = c(0.3, 0.7), replace = TRUE)
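One caveat with assigning into a column that does not exist yet (my reading of base R's subset-assignment behaviour): if the last row happens to have Condition == 0, the newly created vector is shorter than the data frame and the final assignment can error or recycle silently. Creating the column first avoids this and leaves the Condition == 0 rows as NA:
df$Trial <- NA_integer_   # initialise so every row has a value
df$Trial[df$Condition == 1] <- sample(0:1, sum(df$Condition == 1), prob = c(0.3, 0.7), replace = TRUE)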

How to group by a variable and see if they have another observation within a given time frame, R

I have something like the following:
person_ID visit date
1 2/25/2001
1 2/30/2001
1 4/2/2001
2 3/18/2004
3 9/22/2004
3 10/27/2004
3 5/15/2008
I want to add another column to see if the person has a reoccurring observation within 90 days, like:
person_ID visit date reoccurrence
1 2/25/2001 1
1 2/30/2001 1
1 4/2/2001 0
2 3/18/2004 0
3 9/22/2004 1
3 10/27/2004 0
3 5/15/2008 0
any help is appreciated, thank you!
Assuming the second 'date' is not really 2/30/2001 (not a valid date): convert 'visit_date' to Date class, then, grouped by 'person_ID', get the difference in days between the current and the next 'visit_date', check whether it is less than 90, and replace the resulting NA with 0.
library(dplyr)
library(lubridate)
library(tidyr)
df1 <- df1 %>%
  mutate(visit_date = mdy(visit_date)) %>%
  group_by(person_ID) %>%
  mutate(reoccurrence = replace_na(+(difftime(lead(visit_date),
                                              visit_date, units = 'days') < 90), 0)) %>%
  ungroup
-output
# A tibble: 7 x 3
# person_ID visit_date reoccurrence
# <int> <date> <dbl>
#1 1 2001-02-25 1
#2 1 2001-02-28 1
#3 1 2001-04-02 0
#4 2 2004-03-18 0
#5 3 2004-09-22 1
#6 3 2004-10-27 0
#7 3 2008-05-15 0
Or using data.table
library(data.table)
setDT(df1)[, visit_date := as.IDate(visit_date, '%m/%d/%Y')
   ][, reoccurrence := +(difftime(shift(visit_date, type = 'lead'),
                                  visit_date, units = 'days') < 90), by = person_ID
   ][is.na(reoccurrence), reoccurrence := 0]
Or with base R
df1$visit_date <- as.Date(df1$visit_date, '%m/%d/%Y')
with(df1, ave(as.integer(visit_date), person_ID,
              FUN = function(x) c(+(diff(x) < 90), 0)))
#[1] 1 1 0 0 1 0 0
data
df1 <- structure(list(person_ID = c(1L, 1L, 1L, 2L, 3L, 3L, 3L), visit_date = c("2/25/2001",
"2/28/2001", "4/2/2001", "3/18/2004", "9/22/2004", "10/27/2004",
"5/15/2008")), row.names = c(NA, -7L), class = "data.frame")
Base R variant:
reoccur <- function(x, lim=90) {
m <- outer(x, x, `-`)
m[upper.tri(m, diag=TRUE)] <- NA
colSums(!is.na(m) & m >= 0 & m <= lim) > 0
}
### make your dates *dates*
dat$visit <- as.Date(dat$visit, format="%m/%d/%Y")
### calculate if you have reoccurrences
ave(as.numeric(dat$visit), dat$person_ID, FUN=reoccur)
# [1] 1 1 0 0 1 0 0
Data:
dat <- structure(list(person_ID = c(1L, 1L, 1L, 2L, 3L, 3L, 3L), visit = c("2/25/2001", "2/27/2001", "4/2/2001", "3/18/2004", "9/22/2004", "10/27/2004", "5/15/2008")), class = "data.frame", row.names = c(NA, -7L))
(I changed "2/30/2001" to "2/27/2001" to get a real Date out of it.)
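To see what the outer() call is doing, here is person 1 worked through (the dates arrive in reoccur() as day numbers, because ave() is given as.numeric(dat$visit)):
x <- as.numeric(as.Date(c("2001-02-25", "2001-02-27", "2001-04-02")))
m <- outer(x, x, `-`)   # m[i, j] = visit i minus visit j, in days
m                       # after masking the upper triangle, column j asks:
                        # "does any later visit fall within 0-90 days of visit j?"
reoccur(x)
#[1]  TRUE  TRUE FALSE   # ave() coerces this to 1 1 0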

Find multiple "switching points" by comparing the answers in columns

I have a data set in which subjects have made choices between A and B for 13 different B's. Below is a simplified example of what the data looks like with 54 subjects and 5 choices. (1 is A, 2 is B).
subject choice1 choice2 choice3 choice4 choice5
1 1 1 1 1 2 2
2 2 1 1 2 2 2
3 3 1 2 1 2 2
4 4 1 2 2 2 2
I would like to find the questions at which subjects switch from option A to B, i.e. for subject 1 this would be choice4.
In a previous study we did this by computing the number of times the subject chose option A and then selecting the corresponding option B from a separate matrix. See the code below.
However, the difference now is that instead of choosing 1 switching point, subjects were asked the questions in a randomized order, and thus there is the possibility of having multiple switching points. For example in the table above, subject 3 switches to B at choice2 and again at choice4.
I would like to find both the first time the subject switches to option B, and the last time (before sticking with B for the rest of the choices).
sure_amounts <- matrix(nrow = 4, ncol = 13) # 4 treatments, 13 questions
sure_amounts[1, ] <- c(0, 2, 3, 3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 8, 10) # Option B's
sure_amounts[2, ] <- seq(2, 14, 1)
sure_amounts[3, ] <- seq(2, 14, 1)
sure_amounts[4, ] <- seq(2, 14, 1)
b_choice <- matrix(nrow = 201, ncol = 4)
switch_choice <- matrix(nrow = 201, ncol = 4) # switching point from A to B
for(j in 1:4){       # number of treatments
  for(i in 1:201){   # number of subjects
    choice = NULL
    fl = data$ID == i
    k = 1 + 36*(j-1) # 36 columns before the next treatment (due to other questions)
    choice = c(data[fl, k:(k+12)])
    b_choice[i, j] = length(choice[choice == 1])
    temp = b_choice[i, j]
    switch_choice[i, j] <- ifelse(temp == 0, 0, sure_amounts[j, temp])
  }
}
Does anyone have any tips on how to approach this? Thanks in advance!
I am not sure what you want your expected output to look like, but you can try to get the data into long format and, for each subject, select the rows where they switch from 1 -> 2.
library(dplyr)
df %>%
  tidyr::pivot_longer(cols = -subject) %>%
  group_by(subject) %>%
  filter(value == 2 & lag(value) == 1 |
         value == 1 & lead(value) == 2)
# subject name value
# <int> <chr> <int>
# 1 1 choice3 1
# 2 1 choice4 2
# 3 2 choice2 1
# 4 2 choice3 2
# 5 3 choice1 1
# 6 3 choice2 2
# 7 3 choice3 1
# 8 3 choice4 2
# 9 4 choice1 1
#10 4 choice2 2
Here we can see that subject 1 moves from 1 -> 2 from choice3 -> choice4 and so on.
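If you then want just the first and the last switch to B per subject (as asked), the same long format can be collapsed with summarise(); a possible follow-up, assuming every subject switches at least once:
df %>%
  tidyr::pivot_longer(cols = -subject) %>%
  group_by(subject) %>%
  summarise(first_switch = name[which(value == 2 & lag(value) == 1)[1]],
            last_switch  = name[max(which(value == 2 & lag(value) == 1))])
# e.g. subject 3 -> first_switch = "choice2", last_switch = "choice4"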
data
df <- structure(list(subject = 1:4, choice1 = c(1L, 1L, 1L, 1L), choice2 = c(1L,
1L, 2L, 2L), choice3 = c(1L, 2L, 1L, 2L), choice4 = c(2L, 2L,
2L, 2L), choice5 = c(2L, 2L, 2L, 2L)), class = "data.frame",
row.names = c(NA, -4L))
A Base R solution:
Essentially this code subtracts the next decision from the current one and flags the positions where the difference is not zero.
Code:
lapply(as.data.frame(t(df_1)[-1, ]), function(x){
  t <- x - c(x[-1], 0)            # current choice minus the next one
  z <- which(t[-length(t)] != 0)  # positions where the value changes (drop last element)
  z + 1                           # shift index to the choice where the switch appears
})
# $`1`
# [1] 4
# $`2`
# [1] 3
# $`3`
# [1] 2 3 4
# $`4`
# [1] 2
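That list holds every change point (including 2 -> 1 reversals, e.g. choice3 for subject 3). If only the first and last switch to B are needed, and each subject starts on 1 and ends on 2 as in the example, the range of each element gives them:
res <- lapply(as.data.frame(t(df_1)[-1, ]), function(x){
  t <- x - c(x[-1], 0)
  which(t[-length(t)] != 0) + 1
})
t(sapply(res, range))   # column 1 = first switch, column 2 = last switch
#   [,1] [,2]
# 1    4    4
# 2    3    3
# 3    2    4
# 4    2    2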
Data:
df_1 <- read.table(text = " subject choice1 choice2 choice3 choice4 choice5
1 1 1 1 1 2 2
2 2 1 1 2 2 2
3 3 1 2 1 2 2
4 4 1 2 2 2 2 ", header = T)
An alternative approach:
library(dplyr)
library(stringr)
library(purrr)
df %>%
  mutate(g = paste0(choice1, choice2, choice3, choice4, choice5),
         switches = as.character(map(g, ~ pluck(str_locate_all(.x, "12"), 1)))) %>%
  select(-g)
#> subject choice1 choice2 choice3 choice4 choice5 switches
#> 1 1 1 1 1 2 2 3:4
#> 2 2 1 1 2 2 2 2:3
#> 3 3 1 2 1 2 2 c(1, 3, 2, 4)
#> 4 4 1 2 2 2 2 1:2
data
df <- structure(list(subject = 1:4, choice1 = c(1L, 1L, 1L, 1L), choice2 = c(1L,
1L, 2L, 2L), choice3 = c(1L, 2L, 1L, 2L), choice4 = c(2L, 2L,
2L, 2L), choice5 = c(2L, 2L, 2L, 2L)), class = "data.frame", row.names = c("1",
"2", "3", "4"))
Created on 2020-07-10 by the reprex package (v0.3.0)

R find consecutive months

I'd like to find consecutive months by client. I thought this would be easy but still can't find a solution.
My goal is to count consecutive monthly purchases for each client. Any help is appreciated.
My data:
Client Month consecutive
A 1 1
A 1 2
A 2 3
A 5 1
A 6 2
A 8 1
B 8 1
In base R, we can use ave
df$consecutive <- with(df, ave(Month, Client, cumsum(c(TRUE, diff(Month) > 1)),
                               FUN = seq_along))
df
# Client Month consecutive
#1 A 1 1
#2 A 1 2
#3 A 2 3
#4 A 5 1
#5 A 6 2
#6 A 8 1
#7 B 8 1
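The cumsum(c(TRUE, diff(Month) > 1)) part is the run id that does the real work: it starts at 1 and goes up by one every time the gap to the previous month is more than 1, and combining it with Client in the grouping keeps clients separate. With the example data (assuming df is the data shown above):
with(df, cumsum(c(TRUE, diff(Month) > 1)))
#[1] 1 1 1 2 2 3 3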
In dplyr, we can create a new group with lag to compare the current month with the previous month and assign row_number() in each group.
library(dplyr)
df %>%
  group_by(Client, group = cumsum(Month - lag(Month, default = first(Month)) > 1)) %>%
  mutate(consecutive = row_number()) %>%
  ungroup %>%
  select(-group)
We can create a grouping variable based on the difference in adjacent 'Month' for each 'Client' and use that to create the sequence
library(dplyr)
df1 %>%
  group_by(Client) %>%
  group_by(grp = cumsum(c(TRUE, diff(Month) > 1)), add = TRUE) %>%
  mutate(consec = row_number()) %>%
  ungroup %>%
  select(-grp)
# A tibble: 7 x 4
# Client Month consecutive consec
# <chr> <int> <int> <int>
#1 A 1 1 1
#2 A 1 2 2
#3 A 2 3 3
#4 A 5 1 1
#5 A 6 2 2
#6 A 8 1 1
#7 B 8 1 1
Or using data.table
library(data.table)
setDT(df1)[, grp := cumsum(c(TRUE, diff(Month) > 1)), Client
][, consec := seq_len(.N), .(Client, grp)
][, grp := NULL][]
data
df1 <- structure(list(Client = c("A", "A", "A", "A", "A", "A", "B"),
Month = c(1L, 1L, 2L, 5L, 6L, 8L, 8L), consecutive = c(1L,
2L, 3L, 1L, 2L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-7L))

subsetting duplicates per individual

dfin <-
STUDY ID CYCLE TIME VALUE
1 1 0 10 50
1 1 0 20 20
1 2 1 20 20
Per study and ID, for those who have duplicate CYCLE == 0 values, remove the row that had the higher TIME.
dfout <-
STUDY ID CYCLE TIME VALUE
1 1 0 10 50
1 2 1 20 20
Using RStudio.
An option is to do a group by 'STUDY', 'ID' and filter out the duplicated 0 values in 'CYCLE'
library(dplyr)
dfin %>%
  arrange(STUDY, ID, TIME) %>%
  group_by(STUDY, ID) %>%
  filter(!(duplicated(CYCLE) & CYCLE == 0))
# A tibble: 2 x 5
# Groups: STUDY, ID [2]
# STUDY ID CYCLE TIME VALUE
# <int> <int> <int> <int> <int>
#1 1 1 0 10 50
#2 1 2 1 20 20
Also, if there are many duplicates for 0 and you want to remove only the row where 'TIME' is the max:
dfin %>%
  group_by(STUDY, ID) %>%
  filter(!(TIME == max(TIME) & CYCLE == 0))
Or using base R
dfin1 <- do.call(order, dfin[c("STUDY", "ID", "TIME")])
dfin1[!(duplicated(dfin1[1:3]) & duplicated(dfin1$CYCLE)),]
# STUDY ID CYCLE TIME VALUE
#1 1 1 0 10 50
#3 1 2 1 20 20
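A data.table variant of the same logic (a sketch: order by TIME, then drop the duplicated CYCLE == 0 rows within each STUDY/ID group) would be:
library(data.table)
setDT(dfin)[order(TIME), .SD[!(duplicated(CYCLE) & CYCLE == 0)], by = .(STUDY, ID)]
#   STUDY ID CYCLE TIME VALUE
#1:     1  1     0   10    50
#2:     1  2     1   20    20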
data
dfin <- structure(list(STUDY = c(1L, 1L, 1L), ID = c(1L, 1L, 2L), CYCLE = c(0L,
0L, 1L), TIME = c(10L, 20L, 20L), VALUE = c(50L, 20L, 20L)),
class = "data.frame", row.names = c(NA,
-3L))
