Replace NA with 0 depending on group (rows) and variable names (column) - r

I have a large data set and want to replace many NAs, but not all.
In one group I want to replace all NAs with 0.
In the other group I want to replace NAs with 0, but only in variables whose names do not include a certain substring, e.g. 'b'.
Here is an example:
group <- c(1,1,2,2,2)
abc <- c(1,NA,NA,NA,NA)
bcd <- c(2,1,NA,NA,NA)
cde <- c(5,NA,NA,1,2)
df <- data.frame(group,abc,bcd,cde)
group abc bcd cde
1 1 1 2 5
2 1 NA 1 NA
3 2 NA NA NA
4 2 NA NA 1
5 2 NA NA 2
This is what I want:
group abc bcd cde
1 1 1 2 5
2 1 0 1 0
3 2 NA NA 0
4 2 NA NA 1
5 2 NA NA 2
This is what I tried:
#set 0 in first group: this works fine
df[is.na(df) & df$group==1] <- 0
#set 0 in second group but only if the variable name does not include b: does not work
df[is.na(df) & df$group==2 & !grepl('b',colnames(df))] <- 0
dplyr solutions are welcome, as well as base R ones.

For the second group, create a logical column index with grepl and use that to subset the data while assigning
j1 <- !grepl('b',colnames(df))
df[j1][df$group == 2 & is.na(df[j1])] <- 0
df
# group abc bcd cde
#1 1 1 2 5
#2 1 0 1 0
#3 2 NA NA 0
#4 2 NA NA 1
#5 2 NA NA 2

Using dplyr::mutate_at you can also do:
library(dplyr)
vars_mutate_1 <- names(df)[-1]
vars_mutate_2 <- grep(x = names(df)[-1], pattern = '^(?!.*b).*$', perl = TRUE, value = TRUE)
df %>%
mutate_at(.vars = vars_mutate_1, .funs = funs(if_else(group == 1 & is.na(.), 0, .))) %>%
mutate_at(.vars = vars_mutate_2, .funs = funs(if_else(group == 2 & is.na(.), 0, .)))
group abc bcd cde
1 1 1 2 5
2 1 0 1 0
3 2 NA NA 0
4 2 NA NA 1
5 2 NA NA 2

Alternatively, you can use:
library(dplyr)
df2 <- df %>% mutate_at(vars(names(df)[-1]),
function(x) case_when((group==1 & is.na(x) ) ~ 0,
(group==2 & is.na(x) & !grepl("b",deparse(substitute(x)))) ~ 0,
TRUE ~ x))
> df2
group abc bcd cde
1 1 1 2 5
2 1 0 1 0
3 2 NA NA 0
4 2 NA NA 1
5 2 NA NA 2

Related

Recode and sum to NA when all values are NA in R

I need to assign NA when all the columns are empty in summation for each id.
Here is how my sample dataset looks like;
df <- data.frame(id = c(1,2,3),
i1 = c(1,NA,0),
i2 = c(1,NA,1),
i3 = c(1,NA,0),
total = c(3,0,1))
> df
id i1 i2 i3 total
1 1 1 1 1 3
2 2 NA NA NA 0
3 3 0 1 0 1
For the second id the total should be NA instead of 0 because all the values are NA for the second id. How can I change the dataset to below?
> df1
id i1 i2 i3 total
1 1 1 1 1 3
2 2 NA NA NA NA
3 3 0 1 0 1
We could create a condition with if_all in case_when to return NA when all the column values are NA for a row or else do the rowSums with na.rm = TRUE
library(dplyr)
df %>%
mutate(total = case_when(if_all(i1:i3, is.na) ~ NA_real_,
TRUE ~ rowSums(across(i1:i3), na.rm = TRUE)))
-output
id i1 i2 i3 total
1 1 1 1 1 3
2 2 NA NA NA NA
3 3 0 1 0 1

How to recode multiple choice answers in R using dplyr?

I have a dataset with answers to a large number of multiple choice questions. I now want to recode these answers as either true (1) or false (0). My data looks like this:
`#ID q1 q2 q3 cq1 cq2 cq3
#1 1 2 1 NA NA NA
#2 1 2 2 NA NA NA
#3 2 2 2 NA NA NA
#4 1 2 1 NA NA NA`
what I want is this:
`#ID q1 q2 q3 cq1 cq2 cq3
#1 1 2 1 0 0 0
#2 1 2 2 0 0 1
#3 2 2 2 1 0 1
#4 1 2 1 0 0 0`
I know that I could write out all answers like this:
`data_re <- data %>%
mutate(cq1 = if_else(q1==2, 1, 0),
cq2 = if_else(q2==1, 1, 0),
cq3 = if_else(q3==2, 1, 0))`
But is there any way to do this automatically (similar to this approach: How to mutate_at multiple columns on a condition on each value?)?
However, I would have to generate the variablename of the conditional variable automatically. I tried this:
`names_answer_two_correct <- c("q1", "q3")
cnames_answer_two_correct <- paste0("c", names_answer_two_correct)
for (i in 1:length(names_answer_two_correct)) {
data_re <- data %>%
mutate(names_answer_two_correct[i] = if_else(cnames_answer_two_correct[i]== 2, 1, 0))
}`
But I get "Error: unexpected '=' in:"
Does anyone know a solution?
You can use across to apply the function to multiple columns.
library(dplyr)
names_answer_two_correct <- c("q1", "q3")
data %>%
mutate(across(all_of(names_answer_two_correct),
~as.integer(.==2), .names = 'c{col}'),
cq2 = as.integer(q2==1)) -> data_re
data_re
# ID q1 q2 q3 cq1 cq2 cq3
#1 1 1 2 1 0 0 0
#2 2 1 2 2 0 0 1
#3 3 2 2 2 1 0 1
#4 4 1 2 1 0 0 0

mutate_at (or across) and ifelse statement

Similar to this question, given tmpp:
library(data.table)
library(tidyverse)
tmpp <- data.table(
"ID" = c(1,1,1,2,2),
"Date" = c(1,2,3,1,2),
"total_neg" = c(1,1,0,0,2),
"total_pos" = c(4,5,2,4,5),
"H1" = c(5,4,0,5,-5),
"H2" = c(5,-10,5,5,-5),
"H3" = c(-10,6,5,0,10)
)
tmpp
# ID Date total_neg total_pos H1 H2 H3
# 1: 1 1 1 4 5 5 -10
# 2: 1 2 1 5 4 -10 6
# 3: 1 3 0 2 0 5 5
# 4: 2 1 0 4 5 5 0
# 5: 2 2 2 5 -5 -5 10
I want to replace all variables starting with H, with NA where total_neg == 1 :
# ID Date total_neg total_pos H1 H2 H3
# 1: 1 1 1 4 NA NA NA
# 2: 1 2 1 5 NA NA NA
# 3: 1 3 0 2 0 5 5
# 4: 2 1 0 4 5 5 0
# 5: 2 2 2 5 -5 -5 10
Why don't these work?
tmpp %>%
mutate_at(vars(matches("H")), ~ifelse( .$total_neg == 1, NA, .))
tmpp %>%
mutate_at(vars(matches("H"),
.funs = list(~ ifelse(.$total_neg == 1, NA, .))))
#im guessing the first dot in the ifelse statements above is referring to the H columns so I tried:
tmpp %>%
mutate_at(vars(matches("H"),
.funs = list(~ ifelse(tmpp$total_neg == 1, NA, .))))
Happy to see across version too, thanks
A simple data.table solution that updates all the columns at once & in-place only for the subset
tmpp[total_neg == 1, grep("^H", names(tmpp)) := NA]
tmpp
# ID Date total_neg total_pos H1 H2 H3
# 1: 1 1 1 4 NA NA NA
# 2: 1 2 1 5 NA NA NA
# 3: 1 3 0 2 0 5 5
# 4: 2 1 0 4 5 5 0
# 5: 2 2 2 5 -5 -5 10
You don't need to use $ in a dplyr pipe. In mutate_at/across, . refers to the values of the current column. Try:
library(dplyr)
tmpp %>% mutate(across(starts_with('H'), ~replace(., total_neg == 1, NA)))
# ID Date total_neg total_pos H1 H2 H3
#1: 1 1 1 4 NA NA NA
#2: 1 2 1 5 NA NA NA
#3: 1 3 0 2 0 5 5
#4: 2 1 0 4 5 5 0
#5: 2 2 2 5 -5 -5 10
Your guess is correct: inside the purrr-style anonymous function (after your ~), . refers to the function argument, which is a single column, not the data frame you piped in. The solution is to simplify by removing the .$.
tmpp %>%
mutate_at(vars(matches("H")), ~ifelse(total_neg == 1, NA, .))
# ID Date total_neg total_pos H1 H2 H3
# 1: 1 1 1 4 NA NA NA
# 2: 1 2 1 5 NA NA NA
# 3: 1 3 0 2 0 5 5
# 4: 2 1 0 4 5 5 0
# 5: 2 2 2 5 -5 -5 10
If you want to modify "all variables starting with H", I'd strongly suggest using starts_with("H") rather than matches("H").
Maybe you can use starts_with() inside across(). Here is the code:
library(data.table)
library(tidyverse)
tmpp <- data.table(
"ID" = c(1,1,1,2,2),
"Date" = c(1,2,3,1,2),
"total_neg" = c(1,1,0,0,2),
"total_pos" = c(4,5,2,4,5),
"H1" = c(5,4,0,5,-5),
"H2" = c(5,-10,5,5,-5),
"H3" = c(-10,6,5,0,10)
)
#Code
tmpp %>%
mutate(across(starts_with('H'),~ifelse(total_neg==1,NA,.)))
Output:
ID Date total_neg total_pos H1 H2 H3
1: 1 1 1 4 NA NA NA
2: 1 2 1 5 NA NA NA
3: 1 3 0 2 0 5 5
4: 2 1 0 4 5 5 0
5: 2 2 2 5 -5 -5 10

Assigning value to multiple columns based on if column name is in a string vector [duplicate]

This question already has answers here:
Split character column into several binary (0/1) columns
(7 answers)
Closed 2 years ago.
I'm trying to flag if a column's name appears in string vector in the same data frame.
For example, I have a dataframe that looks like the following:
df1 <- data.frame(ID = c('123', '234', '345', '456', '567')
, Types = c('A|B|C|D', 'A|B', 'D|B', 'B|D|C', 'D')
, A = NA
, B = NA
, C = NA
, D = NA)
df1
ID Types A B C D
1 123 A|B|C|D NA NA NA NA
2 234 A|B NA NA NA NA
3 345 D|B NA NA NA NA
4 456 B|D|C NA NA NA NA
5 567 D NA NA NA NA
I'm trying to put a 1 in each column where its name is in the string 'Types' so that the output dataframe looks like
df2 <- data.frame(ID = c('123', '234', '345', '456', '567')
, Types = c('A|B|C|D', 'A|B', 'D|B', 'B|D|C', 'D')
, A = c(1,1,0,0,0)
, B = c(1,1,1,1,0)
, C = c(1,0,0,1,0)
, D = c(1,0,1,1,1))
df2
ID Types A B C D
1 123 A|B|C|D 1 1 1 1
2 234 A|B 1 1 0 0
3 345 D|B 0 1 0 1
4 456 B|D|C 0 1 1 1
5 567 D 0 0 0 1
I was able to do this using this loop
for(j in 3:6)
{
for(i in 1:5)
{
df1[i,j] <- case_when(colnames(df1)[j] %like% df1[i,2] ~ 1, T ~ 0)
}
}
But the actual dataframe I'm using is significantly larger so this loop is very slow. I'm looking for help coming up with a more efficient way of doing this!
Thank you!
We can split the column and use mtabulate
library(qdapTools)
df1[-(1:2)] <- mtabulate(strsplit(df1$Types, "|", fixed = TRUE))
df1
# ID Types A B C D
#1 123 A|B|C|D 1 1 1 1
#2 234 A|B 1 1 0 0
#3 345 D|B 0 1 0 1
#4 456 B|D|C 0 1 1 1
#5 567 D 0 0 0 1
Or using cSplit_e
library(splitstackshape)
cSplit_e(df1[1:2], "Types", "|", type = 'character', fill = 0)
Here is a base R option using strsplit + table + factor
df1[-(1:2)] <- t(sapply(
strsplit(df1$Types, "\\|"),
function(x) table(factor(x, levels = names(df1)[-(1:2)]))
))
which gives
ID Types A B C D
1 123 A|B|C|D 1 1 1 1
2 234 A|B 1 1 0 0
3 345 D|B 0 1 0 1
4 456 B|D|C 0 1 1 1
5 567 D 0 0 0 1

sapply function(x) where x is subsetted argument

So, I want to generate a new vector from the information in two existing (numerical) ones: one which sets the id for the participant, the other indicating the observation number. Each participant has been observed a different number of times.
Now, the new vector should state: 0 when obs_no=1; 1 when obs_no=last observation for that id; NA for cases in between.
id obs_no new_vector
1 1 0
1 2 NA
1 3 NA
1 4 NA
1 5 1
2 1 0
2 2 1
3 1 0
3 2 NA
3 3 1
I figure I could do this separately for every id using code like this
new_vector <- c(0, rep(NA, times=length(obs_no[id==1])-2), 1)
Or I guess just using max() but it wouldn't make any difference.
But adding each participant manually is really inconvenient since I have a lot of cases. I can't figure out how to make a generic function. I tried to define a function(x) using sapply but can't get it to work since x is positioned within subsetting brackets.
Any advice would be helpful. Thanks.
ave to the rescue:
dat$newvar <- NA
dat$newvar <- with(dat,
ave(newvar, id, FUN=function(x) replace(x, c(length(x),1), c(1,0)) )
)
Or use a bit of duplicated() fun:
dat$newvar <- NA
dat$newvar[!duplicated(dat$id, fromLast=TRUE)] <- 1
dat$newvar[!duplicated(dat$id)] <- 0
Both giving:
# id obs_no new_vector newvar
#1 1 1 0 0
#2 1 2 NA NA
#3 1 3 NA NA
#4 1 4 NA NA
#5 1 5 1 1
#6 2 1 0 0
#7 2 2 1 1
#8 3 1 0 0
#9 3 2 NA NA
#10 3 3 1 1
You can also do this with dplyr
str <- "
id obs_no new_vector
1 1 0
1 2 NA
1 3 NA
1 4 NA
1 5 1
2 1 0
2 2 1
3 1 0
3 2 NA
3 3 1
"
dt <- read.table(textConnection(str), header = T)
library(dplyr)
dt %>% group_by(id) %>%
mutate(newvar = if_else(obs_no==1,0L,if_else(obs_no==max(obs_no),1L,as.integer(NA))))
We can use data.table
library(data.table)
i1 <- setDT(df1)[, .I[seq_len(.N) %in% c(1, .N)], id]$V1
df1[i1, newvar := c(0, 1)]
df1
# id obs_no new_vector newvar
# 1: 1 1 0 0
# 2: 1 2 NA NA
# 3: 1 3 NA NA
# 4: 1 4 NA NA
# 5: 1 5 1 1
# 6: 2 1 0 0
# 7: 2 2 1 1
# 8: 3 1 0 0
# 9: 3 2 NA NA
#10: 3 3 1 1
Use split:
result = lapply(split(obs_no, id), function (x) c(0, rep(NA, length(x) - 2), 1))
This gives you a list of vectors. You can paste them back together like this:
do.call(c, result)

Resources