case_when when there are factors - r

I am trying to combine treatment allocations for patients who completed two different randomisation forms. I can simulate some example data here:
data <- data.frame(id = 1:100,
trt_a = factor(c(sample(0:1, 50, TRUE), rep(NA, 50))),
trt_b = factor(c(sample(0:1, 50, TRUE), rep(NA, 50))),
trt_ab = factor(c(rep(NA, 50), sample(c("a", "b", "ab", "neither"), 50, TRUE))))
Is there any way of creating a new column with the same factor levels as trt_ab? Half the patients had choice of either trt_a or trt_b, and the other half had choice trt_ab. I want to use some sort of case_when statement to generate a new column with the actual treatment choices:
data %>%
mutate(trt = case_when(trt_a == 0 & trt_b == 0 ~ "neither",
trt_a == 1 & trt_b == 0 ~ "a",
trt_a == 0 & trt_b == 1 ~ "b",
trt_a == 1 & trt_b == 1 ~ "ab",
!is.na(trt_ab) ~ trt_ab))
However, when any of the columns are factors, I get the following error:
Error in `mutate()`:
! Problem while computing `trt = case_when(...)`.
Caused by error in `` names(message) <- `*vtmp*` ``:
! 'names' attribute [1] must be the same length as the vector [0]

data %>%
mutate(trt = case_when(trt_a == 0 & trt_b == 0 ~ "neither",
trt_a == 1 & trt_b == 0 ~ "a",
trt_a == 0 & trt_b == 1 ~ "b",
trt_a == 1 & trt_b == 1 ~ "ab",
!is.na(trt_ab) ~ trt_ab)) %>% head
-output
id trt_a trt_b trt_ab trt
1 1 0 0 <NA> neither
2 2 0 0 <NA> neither
3 3 1 1 <NA> ab
4 4 1 1 <NA> ab
5 5 0 1 <NA> b
6 6 1 1 <NA> ab

Related

Combine Two Variables Into One Based on their Values in R

I am trying to create a new variable out of two variables in the same dataframe (df), like those below. The categories are mutually exclusive.
VAR1 VAR2
1 1
2 2
6 6
1 = yes
2 = no
6 = did not answer
The script I tried in order to get the combined variable is below, but it is not working:
if (df$VAR1 == 1) {
df$combo = 1
} else if (df$VAR2 == 1) {
df$combo = 2
} else if ((df$VAR1 == 2) & (df$VAR2 == 2)) {
df$combo = 3
} else if ((df$VAR1 == 6) & (df$VAR2 == 6)) {
df$combo = 6
}
Any pointers will be appreciated.
You may try
# Build `combo` row by row from VAR1 / VAR2.
# Pre-fill with NA first: if the column does not exist yet, the first
# `df$combo[i] <- value` either recycles that single value into every row
# (so rows matching no rule silently inherit it) or fails with a
# "replacement has ... rows" error when the first match is not in row 1.
df$combo <- NA_real_
# seq_len() is safe for a zero-row data frame, where 1:nrow(df) would be c(1, 0).
for (i in seq_len(nrow(df))) {
  if (df$VAR1[i] == 1) {
    df$combo[i] <- 1
  } else if (df$VAR2[i] == 1) {
    df$combo[i] <- 2
  } else if (df$VAR1[i] == 2 && df$VAR2[i] == 2) {
    # `&&` is the scalar operator meant for `if` conditions.
    df$combo[i] <- 3
  } else if (df$VAR1[i] == 6 && df$VAR2[i] == 6) {
    df$combo[i] <- 6
  }
}
VAR1 VAR2 combo
1 1 1 1
2 2 2 3
3 6 6 6
Or use dplyr
library(dplyr)
df %>%
mutate(combo = case_when(
VAR1 == 1 ~ 1,
VAR2 == 1 ~ 2,
(VAR1 == 2 & VAR2 == 2) ~ 3,
(VAR1 == 6 & VAR2 == 6) ~ 6,
TRUE ~ NA_real_
))
VAR1 VAR2 combo
1 1 1 1
2 2 2 3
3 6 6 6

Summarize output of table to simple columns

I currently have this table and I want to sum the total number of purchases per each ID.
Input:
id
purchases
time
a
need
1:00
a
want
1:30
a
none
2:00
b
need
1:15
b
want
1:30
c
none
1:10
c
none
1:30
d
none
2:00
d
need
2:10
d
want
2:15
d
none
2:35
e
none
3:10
e
none
3:50
f
need
2:55
f
want
3:15
f
need
3:20
The purchases column did not exist originally; instead there were item names. So I created this column first and then tried to reach the output below.
Desired first output: total items bought, number of needs and wants separately, the output column is yes if first purchase is a need, no if it isn't, none if there were no purchases
id
total
need
want
output
a
2
1
1
yes
b
2
1
1
yes
c
0
0
0
none
d
2
1
1
no
e
0
0
0
none
f
3
2
1
yes
I am using dplyr so I would appreciate the suggested code to be feasible for adding in a dplyr pipeline.
What I tried to do
actions %>% group_by (id) %>% arrange(id) %>%
mutate(purchases = ifelse(type == "Buy" & obj_category == "Books" | type == "Buy" & obj_category == "Car" | type=="Buy" & obj_category == "Business" | type == "Buy", "need",
ifelse(type == "Buy" & obj_category == "Sweets" | type == "Buy" & obj_category == "Electronics" | type == "Buy" & obj_category == "Business" | type == "Buy" & obj_category == "House", "want", "none"))) %>%
summarise(need = ifelse(purchases == "need", 1, 0),
want = ifelse(purchases == "want", 1, 0))
thank you in advance
You could try
library(dplyr)
df %>%
group_by(id) %>%
summarise(need = sum(purchases == "need"),
want = sum(purchases == "want"),
total = need + want,
output = case_when(first(purchases) == "need" ~ "yes",
total == 0 ~ "none",
TRUE ~ "no"))
# # A tibble: 6 × 5
# id need want total output
# <chr> <int> <int> <int> <chr>
# 1 a 1 1 2 yes
# 2 b 1 1 2 yes
# 3 c 0 0 0 none
# 4 d 1 1 2 no
# 5 e 0 0 0 none
# 6 f 2 1 3 yes
A general version if there are more categories in purchases:
library(dplyr)
library(janitor)
# Cross-tabulate purchases per id, drop the "none" column, add a row total,
# then attach the per-id `output` label.
df %>%
  tabyl(id, purchases) %>%
  select(-none) %>%
  adorn_totals("col") %>%
  left_join(
    df %>%
      group_by(id) %>%
      # "yes" if the first purchase is a need, "none" if nothing was bought,
      # otherwise "no".
      summarise(output = case_when(purchases[1] == "need" ~ "yes",
                                   all(purchases == "none") ~ "none",
                                   TRUE ~ "no")),
    # Name the key explicitly instead of relying on the implicit natural join
    # (which also prints a "Joining with ..." message).
    by = "id")
Data
df <- structure(list(id = c("a", "a", "a", "b", "b", "c", "c", "d",
"d", "d", "d", "e", "e", "f", "f", "f"), purchases = c("need",
"want", "none", "need", "want", "none", "none", "none", "need",
"want", "none", "none", "none", "need", "want", "need"), time = c("1:00",
"1:30", "2:00", "1:15", "1:30", "1:10", "1:30", "2:00", "2:10",
"2:15", "2:35", "3:10", "3:50", "2:55", "3:15", "3:20")), class = "data.frame", row.names = c(NA, -16L))
Here's a solution with dplyr and janitor:
library(dplyr)
library(janitor)
df %>%
  janitor::tabyl(id, purchases) %>%
  # Attach the first row per id so `purchases` holds the first purchase.
  left_join(df %>% group_by(id) %>% slice(1), by = "id") %>%
  rowwise() %>%
  # Count only real purchases. The range `need:want` would also include the
  # `none` column that tabyl() places between them, so `total` could never be
  # 0 and ids without purchases would be labelled "no" instead of "none".
  mutate(total = sum(c_across(c(need, want)))) %>%
  ungroup() %>%
  mutate(purchases = ifelse(purchases == "need", "yes", "no"),
         purchases = ifelse(total == 0, "none", purchases)) %>%
  select(-c(time, total))
Which gives:
# A tibble: 6 × 5
id need none want purchases
<chr> <dbl> <dbl> <dbl> <chr>
1 a 1 1 1 yes
2 b 1 0 1 yes
3 c 0 2 0 none
4 d 1 2 1 no
5 e 0 2 0 none
6 f 2 0 1 yes

How do I see how many combinations of specific values there are in columns

I have a dataset that looks like this:
Group ID
UP 1
UP 1
UP 2
UP 2
UP 2
UP 1
UP 1
UP 2
UP 2
UP 1
UP 1
Is there any way to see how many times a 1 is under a 1 in the ID column?
Does this work:
library(dplyr)
df %>% mutate(flag = case_when(ID == 1 & lag(ID) == 1 ~ 1, TRUE ~ 0)) %>% pull(flag) %>% sum
[1] 3
Base R :
sum(df$ID == 1 & c(tail(df$ID, -1), NA) == 1, na.rm = TRUE)
#[1] 3
You can also use dplyr::lag and data.table::shift
sum(df$ID == 1 & dplyr::lag(df$ID) == 1, na.rm = TRUE)
sum(df$ID == 1 & data.table::shift(df$ID) == 1, na.rm = TRUE)

How to create new columns conditional of columns in a df and sum them together to one in R

I am quite new to R and have a df, in which I am creating some criteria (a1, b1, c1, d1.. and so on) by using sqldf (In this example I am only showing a1 to c1)
df <- data.frame('var1' = c('x','1', 'X', '', 'X'), "var2" = c('y','3', '', 'X', ''), "var3" = c('y','7', '', 'X', 'X'))
library(sqldf)
testcases_sql <-
("
CASE WHEN var1 = 1 THEN 1 ELSE 0 END AS a1,
CASE WHEN var1 = 1 AND var2 = 'y' THEN 1 ELSE 0 END AS b1,
CASE WHEN var1= 1 AND var2= 3 THEN 1 ELSE 0 END AS b1,
CASE WHEN var1= 1 AND var2= 3 THEN 1 ELSE 0 END AS b1,
CASE WHEN var1= 1 AND var2= 'X' THEN 1 ELSE 0 END AS b1,
CASE WHEN var1= 1 AND var2= 'X' AND var3=7 THEN 1 ELSE 0 END AS c1,
CASE WHEN var1= 'X' AND var3='X' THEN 1 ELSE 0 END AS c1")
sql_string = paste("SELECT *" , ",", testcases_sql, " FROM ", "df", sep=" ")
#create criteria
data = sqldf(sql_string)
head(data)
sqldf creates a new column for each criterion:
head(data)
# var1 var2 var3 a1 b1 b1 b1 b1 c1 c1
# 1 x y y 0 0 0 0 0 0 0
# 2 1 3 7 1 0 1 1 0 0 0
# 3 X 0 0 0 0 0 0 0
# 4 X X 0 0 0 0 0 0 0
# 5 X X 0 0 0 0 0 0 1
but I need all the criteria in one variable, so that all the b1's are in one column, all the c1's are in one and so on. It does not matter how many times each row meets the criterion, I only need a '1' in each column. In my original df, there is no system in how many times a criterion can be repeated; it is totally random.
My expected results are:
wished_df <- data.frame('var1' = c('x','1', 'X', '', 'X'), "var2" = c('y','3', '', 'X', ''), "var3" = c('y','7', '', 'X', 'X'), "a1" = c('0','1', '0', '0', '0'), "b1=" =c('0','1', '0', '0','0'), "c1=" =c('0','0', '0', '0','1') )
head(wished_df)
# var1 var2 var3 a1 b1 c1
#1 x y y 0 0 0
#2 1 3 7 1 1 0
#3 X 0 0 0
#4 X X 0 0 0
#5 X X 0 0 1
It might be that sqldf is not the best function for this. My best approach would be to change the df afterwards by summing together the variables
#sum the variable
data$newb1 <- data$b1 + data$b1 + data$b1 + data$b1
#error in `$<-.data.frame`(`*tmp*`, newb1, value = numeric(0)) : replacement has 0 rows, data has 5
#delete the old variable
data$b1 <- data$b1 <-data$b1 <- data$b1 <- NULL
#rename the variable
data$b1 <- data$newb1
#delete old variable
data$newb1 <- NULL
#repeat for c1, d1, e1 and so on...
data$newc1 <- data$c1 + data$c1
data$c1 <- data$c1 <- NULL
data$c1 <- data$newc1
data$newc1 <- NULL
Which is not working, and is quite ugly and will require a lot of code ( I have 80 testcases).
Is there an easier way to do this?
Thanks a lot in advance
In SQL we can OR the conditions to simplify the code. Each true condition will evaluate to 1 and each false condition to 0. We have changed the name of the SQL string to testcasesSQL because $ string interpolation requires word characters for the variable name -- non-word characters terminate the variable name and are not regarded as part of the variable name.
If there were some pattern to the test cases then we could generate the testcasesSQL string using R code but it is unclear if there is and we just fix the code in the question and translate it to more compact SQL.
Note that the logical condition (var1 = 1) or (var1 = 1 AND var2 = 'y') can be simplified to just (var1 = 1) . Below we have NOT applied this or other potential logical simplifications to make it clear how the code in the question translates directly to simpler SQL. Also if these are generated automatically it may not be in the simplest form anyways and from the viewpoint of the answer it makes no difference.
library(sqldf)
testcasesSQL <- "(var1 = 1) or (var1 = 1 AND var2 = 'y') as a1,
(var1 = 1 AND var2 = 'y') or (var1 = 1 AND var2 = 3) or (var1 = 1 AND var2 = 'X') AS b1,
(var1 = 1 AND var2 = 'X' AND var3 = 7) or (var1 = 'X' AND var3 ='X') AS c1"
dfname <- "df"
fn$sqldf("select *, $testcasesSQL from $dfname")
giving:
var1 var2 var3 a1 b1 c1
1 x y y 0 0 0
2 1 3 7 1 1 0
3 X 0 0 0
4 X X 0 0 0
5 X X 0 0 1
Generating the condition
We can define a matrix that has the condition name as column 1 with a column for var1, var2 and var3 such that the conditions on one row are AND'd and the conditions on multiple rows having the same condition name OR'd. From the example in the question it seems that var1 is always present and we have used that fact in the gsub line.
# Condition table, filled row by row into 4 columns: the condition name
# followed by the required values of var1, var2 and var3 (NA = no constraint
# on that variable). Rows sharing a condition name are OR'd together later.
condmat <- matrix(c('c1', 1, NA, NA,
'c1', 1, 'y', NA,
'c2', 1, 'y', NA,
'c2', 1, 3, NA,
'c2', 1, 'X', NA,
'c3', 1, 'X', 7,
'c3', 'X', NA, 'X'),, 4, byrow = TRUE)
colnames(condmat) <- c("cond", "var1", "var2", "var3")
# One parenthesised AND-clause per row, comparing every variable as a quoted
# string; NA cells come out as the literal text 'NA' at this stage.
s <- sprintf("(%s = '%s' AND %s = '%s' AND %s = '%s')",
colnames(condmat)[2], condmat[, 2],
colnames(condmat)[3], condmat[, 3],
colnames(condmat)[4], condmat[, 4])
# Drop the "AND <var> = 'NA'" pieces. The pattern needs a leading AND, so it
# only works because var1 (the first term) is never NA in this table.
s2 <- gsub("AND \\w+ = 'NA'", "", s)
# Join the clauses that belong to the same condition name with OR.
s3 <- tapply(s2, condmat[, 1], paste, collapse = " OR ")
# Turn each group into "<clauses> as <name>" and separate the select-list
# entries with a comma and newline.
cond <- paste(paste(s3, 'as', names(s3)), collapse = ",\n")
dfname <- "df"
# fn$ substitutes $cond and $dfname into the SQL string before sqldf runs it.
fn$sqldf("select *, $cond from $dfname")
Note that the cond variable that is generated by the above is:
cat(cond)
## (var1 = '1' ) OR (var1 = '1' AND var2 = 'y' ) as c1,
## (var1 = '1' AND var2 = 'y' ) OR (var1 = '1' AND var2 = '3' ) OR (var1 = '1' AND var2 = 'X' ) as c2,
## (var1 = '1' AND var2 = 'X' AND var3 = '7') OR (var1 = 'X' AND var3 = 'X') as c3
I would just use R's built-in boolean operators for this task. Note I have removed some logical redundancy from your SQL selections:
df <- data.frame('var1' = c('x','1', 'X', '', 'X'),
"var2" = c('y','3', '', 'X', ''),
"var3" = c('y','7', '', 'X', 'X'))
df$a1 <- 1 * (df$var1 == "1")
df$b1 <- 1 * ((df$var1 == "1") & (df$var2 == "y" | df$var2 == "3" | df$var2 == "X"))
df$c1 <- 1 * ((df$var1 == "1" & df$var2 == "X" & df$var3 == "7") |
(df$var1 == "X" & df$var3 == "X"))
df
#> var1 var2 var3 a1 b1 c1
#> 1 x y y 0 0 0
#> 2 1 3 7 1 1 0
#> 3 X 0 0 0
#> 4 X X 0 0 0
#> 5 X X 0 0 1
Created on 2020-05-14 by the reprex package (v0.3.0)

group_by and apply code to each element in group using conditions

I have data like so:
ID membership AdultChild
1 1 A
2 1 A
3 2 A
4 2 C
5 2 C
6 3 A
7 3 A
: : :
I want to group by membership and apply a 'code' after counting the AdultChild variable i.e.
ID membership AdultChild code
1 1 A x1
2 1 A x1
3 2 A x2
4 2 C x2
5 2 C x2
6 3 A x1
7 3 A x1
: : : :
I will have conditions similar to:
count <- function(x){
if(sum(x == "A") == 2 && sum(x == "C") == 0){
code <<- x1
}else if (sum(x == "A") == 1 & sum(x == "C") >= 1){
code <<- x2
}else {
code <<- X3
}
I have tried using dplyr to group and mutate, using the function above to add a new variable called code. I also thought about using the aggregate function but didn't have much luck.
df.2 <- df %>% group_by(membership)
%>% mutate(n = count(AdultChild)) %>%
ungroup()
df.2 <- aggregate.data.frame(df, by = membership, FUN =
count(df$AdultChild))
Basically, I want a new variable which will be decided using certain conditions and applied to each ID when grouped by membership.
Thanks in advance.
library(dplyr)
# Label every row of a membership group from that group's A / C counts.
# Each condition is a single value per group, which mutate() recycles to all
# rows of the group.
df %>% group_by(membership) %>%
  mutate(code = case_when(
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    sum(AdultChild == "A", na.rm = TRUE) == 2 & sum(AdultChild == "C", na.rm = TRUE) == 0 ~ "X1",
    sum(AdultChild == "A", na.rm = TRUE) == 1 & sum(AdultChild == "C", na.rm = TRUE) >= 1 ~ "X2",
    TRUE ~ "X3"
  ))
# A tibble: 7 x 4
# Groups: membership [3]
ID membership AdultChild code
<int> <int> <fct> <chr>
1 1 1 A X1
2 2 1 A X1
3 3 2 A X2
4 4 2 C X2
5 5 2 C X2
6 6 3 A X1
7 7 3 A X1
# Classify one membership group from its "A" / "C" labels.
#
# x: vector of labels for a single group (the AdultChild column); NA entries
#    are ignored when counting.
# Returns a length-one character code:
#   "4"  exactly two "A" and no "C"
#   "5"  more than two "A" and no "C"
#   "6"  exactly one "A" and at least one "C"
#   "7"  exactly two "A" and one to three "C"
#   "8"  anything else
#
# NOTE: once defined, this name masks dplyr::count() for the session.
count <- function(x) {
  # Tally each label once instead of re-summing in every branch.
  n_a <- sum(x == "A", na.rm = TRUE)
  n_c <- sum(x == "C", na.rm = TRUE)

  # `&&` is the scalar operator intended for `if` conditions. The if/else
  # chain is the last expression, so the code is returned visibly rather than
  # as the invisible value of an assignment.
  if (n_a == 2 && n_c == 0) {
    "4"
  } else if (n_a > 2 && n_c == 0) {
    "5"
  } else if (n_a == 1 && n_c >= 1) {
    "6"
  } else if (n_a == 2 && n_c >= 1 && n_c <= 3) {
    "7"
  } else {
    "8"
  }
}
df.2 <- df %>% group_by(membership) %>% mutate(code = count(AdultChild)) %>% ungroup()

Resources