Nested if statements inside case_when - r

I'm attempting to mutate a large data set, and am wondering if there is a better way to run this code:
library(dplyr)
df <- data.frame(id = c('person 1', 'person 2'), a1 = c(0, 1), a2 = c(1, 0), a3 = c(0, 0), b1 = c(0, 1), b3 = c(1, 0))
new_function = function(name){
df %>% mutate(
!!name := case_when(
if(any(names(df) == paste0(name,'_1'))){ !!sym(paste0(name,'1')) == 1 ~ 1 },
if(any(names(df) == paste0(name,'_2'))){ !!sym(paste0(name,'2')) == 1 ~ 2 },
if(any(names(df) == paste0(name,'_3'))){ !!sym(paste0(name,'3')) == 1 ~ 3 },
TRUE ~ 0)
)
}
col_names = c('a', 'b')
col_names %>%
map(new_function)
To add two new columns:
a = c(2,1)
b = c(3,1)
The problem is the vector col_names has over a hundred entries and I'm not sure which columns are missing. It seems very inefficient to scan the entire data frame for each column name in each statement of case_when.
I tried to use possibly() or safely() (without the if statements) to ignore the error and keep running through the code, but it gave the same error as before.
*Note each person can only have one '1' for each letter, so only one statement of case_when can evaluate to TRUE.

Since, we know that each person can have only one 1 in each col_names, we can use that fact to simplify our logic. We can select column which starts with col_names get the column number which has 1 in it using max.col and return the value from the column name by removing all the non-numeric data from it.
library(dplyr)
new_function <- function(x) {
temp <- df %>% select(starts_with(x))
as.integer(gsub("\\D", "", names(temp)[max.col(temp)]))
}
bind_cols(df, purrr::map_dfc(col_names, new_function))
# id a1 a2 a3 b1 b3 V1 V2
#1 person 1 0 1 0 0 1 2 3
#2 person 2 1 0 0 1 0 1 1

Related

How to check for the existence of a certain value in a set of variables only when there is no NA?

I have a dataframe with over hundreds of variables, grouped in different factors ("Happy_","Sad_", etc) and I want to create a set new variables indicating whether a participant put a rating of 4 in any of the variables in one factor. However, if any of the variable in that factor is NA, then the new variable will also output NA.
I have tried the following, but it didn't work:
library(tidyverse)
df <- data.frame(Subj = c("A", "B", "C", "D"),
Happy_1_Num = c(4,2,2,NA),
Happy_2_Num = c(4,2,2,1),
Happy_3_Num = c(1,NA,2,4),
Sad_1_Num = c(2,1,4,3),
Sad_2_Num = c(NA,1,2,4),
Sad_3_Num = c(4,2,2,1))
# Don't work
df <- df %>% mutate(Happy_Any4 = ifelse(if_any(matches("^Happy_") & matches("_Num$"), ~ is.na(.)), NA,
ifelse(if_any(matches("^Happy_") & matches("_Num$"), ~ . == 4),1,0)),
Sad_Any4 = ifelse(if_any(matches("^Sad_") & matches("_Num$"), ~ is.na(.)), NA,
ifelse(if_any(matches("^Sad_") & matches("_Num$"), ~ . == 4),1,0)))
I tried a workaround by first generating a set of variables to indicate if that factor has any NA, and after that check if participant put any rating of "4". it works; but since I have many factors, I was wondering if there is a more elegant way of doing it.
# workaround
df <- df %>% mutate(
NA_Happy = ifelse(if_any(matches("^Happy_") & matches("_Num$"), ~ is.na(.)), 1,0),
NA_Sad = ifelse(if_any(matches("^Sad_") & matches("_Num$"), ~ is.na(.)), 1,0))
df <- df %>% mutate(
Happy_Any4 = ifelse(NA_Happy == 1, NA,
ifelse(if_any(matches("^Happy_") & matches("_Num$"), ~ . == 4),1,0)),
Sad_Any4 = ifelse(NA_Sad == 1, NA,
ifelse(if_any(matches("^Sad_") & matches("_Num$"), ~ . == 4),1,0)))
Here is a base R option using split.default -
tmp <- df[-1]
cbind(df, sapply(split.default(tmp, sub('_.*', '', names(tmp))),
function(x) as.integer(rowSums(x== 4) > 0)))
# Subj Happy_1_Num Happy_2_Num Happy_3_Num Sad_1_Num Sad_2_Num Sad_3_Num Happy Sad
#1 A 4 4 1 2 NA 4 1 NA
#2 B 2 2 NA 1 1 2 NA 0
#3 C 2 2 2 4 2 2 0 1
#4 D NA 1 4 3 4 1 NA 1
sub would keep only either "Happy" or "Sad" part of the names, split.default splits the data based on that and use sapply to calculate if any value of 4 is present in a row.
If you can afford to write each and every factor manually you can do -
library(dplyr)
df %>%
mutate(Happy = as.integer(rowSums(select(., starts_with('Happy')) == 4) > 0),
Sad = as.integer(rowSums(select(., starts_with('Sad')) == 4) > 0))
here is another workaround by transposing the data.frame and an apply on colonns. I'm not sure it's more elegant but here it is ^^
tmp <- cbind(sub("^((Happy)|(Sad))(_.*_Num)$", "\\1", colnames(df)), t(df))
Happy_Any4 <- apply(tmp[tmp[,1]== "Happy", -1], 2,
function(x) ifelse(any(is.na(x)), NA, length(grep("4", x))) )
Sad_Any4 <- apply(tmp[tmp[,1]== "Sad", -1], 2,
function(x) ifelse(any(is.na(x)), NA, length(grep("4", x))) )
df <- cbind(df, Happy_Any4 = Happy_Any4, Sad_Any4 = Sad_Any4)
EDIT : Above was a strange test, but now this work with more beauty !
This is because the sum of anything where there is an NA will return NA.
df <- df %>% mutate(Happy_Any4 = apply(df[,grep("^Happy_.*_Num$", colnames(df))],
1, function(x) 1*(sum(x == 4) > 0)),
Sad_Any4 = apply(df[, grep("^Sad_.*_Num$", colnames(df))],
1, function(x) 1*(sum(x == 4) > 0)))
The apply will look every row, only on columns where we find the correct part in colnames (with grep. It then find every occurence of 4, which form a logical vector, and it's sum is the number of occurence. The presence of an NA will bring the sum to NA. I then just check if the sum is above 0 and the 1* will turn the numeric into logical.

Subset data.frame based on lag between two columns

Suppose you want to subset a data.frame where the rule for keeping rows is based
on a lag beteen rows 'a' and 'b':
# input
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
#output
a b
1 1 0
2 0 1
3 0 1
4 1 0
5 0 1
6 0 1
Essentially, if 'a' = 1 you want to keep that row as well as the subsequent run of rows in
'b' that have a value of 1. This capture continues until the next row with a = 0 & b = 0.
I've tried using nested 'ifelse()' statements, but I am stuck incorporate logical tests based on a lag issue.
Suggestions?
This is how I would do it. There are probably options out there that require maybe 1 or 2 lines less.
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
library(dplyr)
df %>%
mutate(grp = cumsum(a==1|a+b==0)) %>%
group_by(grp) %>%
filter(any(a == 1)) %>%
ungroup() %>%
select(a, b)
A solution without dplyr. Work with a flag:
# input
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
# create new empty df
new_df <- read.table(text = "", col.names = c("a", "b"))
a_okay = FALSE # initialize the flag
for (row_number in seq(1:nrow(df))) { # loop over each row of the original df
# if a is 1, we add the row to the new df and set the flag to TRUE
if (df[row_number, "a"] == 1) {
a_okay = TRUE
new_df[nrow(new_df) + 1, ] = c(df[row_number, "a"], df[row_number, "b"])
}
# now we consider the rows where a is not 1
else {
# if b is 1 and we are still following an a == 1: add the row
if (df[row_number, "b"] == 1 & a_okay) {
new_df[nrow(new_df) + 1, ] = c(df[row_number, "a"], df[row_number, "b"])
}
# if b is 0, we reset the flag
else {
a_okay = FALSE
}
}
}
Another base solution inspired by this post, #Wietse de Vries's answer and #Ben's comment.
# input
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
# identify groups
df$grp <- cumsum(df$a == 1 | df$b == 0)
# subset df by groups with first element of a == 1
df <- do.call(rbind, split(df, df$grp)[by(df, df$grp, function(x) {x$a[1] == 1})])
# remove grp
df$grp <- NULL

Fill NAs with 0 if the column is numeric and empty string '' if the column is a factor using R [duplicate]

This question already has answers here:
How to replace NA values in a table for selected columns
(12 answers)
Closed 2 years ago.
I'm trying to replace all the NAs present in the column of integer type with 0 and NAs present in the column of factor type with empty string "". The code below is the one that i'm using but it doesn't seem to work
for(i in 1:ncol(credits)){
if(sapply(credits[i], class) == 'integer'){
credits[is.na(credits[,i]), i] <- 0
}
else if(sapply(credits[i], class) == 'factor'){
credits[is.na(credits[,i]), i] <- ''
}
You can use across in dplyr to replace column values by class :
library(dplyr)
df %>%
mutate(across(where(is.factor), ~replace(as.character(.), is.na(.), '')),
across(where(is.numeric), ~replace(., is.na(.), 0)))
# a b
#1 1 a
#2 2 b
#3 0 c
#4 4 d
#5 5
b column is of class "character" now, if you need it as factor, you can add factor outside replace like :
across(where(is.factor), ~factor(replace(as.character(.), is.na(.), ''))),
data
df <- data.frame(a = c(1, 2, NA, 4:5), b = c(letters[1:4], NA),
stringsAsFactors = TRUE)
Another way of achieving the same:
library(dplyr)
# Dataframe
df <- data.frame(x = c(1, 2, NA, 4:5), y = c('a',NA, 'd','e','f'),
stringsAsFactors = TRUE)
# Creating new columns
df_final<- df %>%
mutate(new_x = ifelse(is.numeric(x)==TRUE & is.na(x)==TRUE,0,x)) %>%
mutate(new_y = ifelse(is.factor(y)==TRUE & is.na(y)==TRUE,"",y))
# Printing the output
df_final

Deduplicating a data frame when the order of values may differ in R

Let's say I have a data.frame that looks like this:
df = data.frame(from=c(1, 1, 2, 1),
to=c(2, 3, 1, 4),
title=c("A", "B", "A", "A"),
stringsAsFactors=F)
df is an object that holds all of the various connections for a network graph. I also have a second data.frame, which is the simplified graph data:
df2 = data.frame(from=c(1, 1, 3),
to=c(2, 4, 1),
stringsAsFactors=F)
What I need is to pull the title values from df into df2. I can't simply dedup df because a) from and to can be in different orders, and b) title is not unique between connections. The current condition I have is:
df2$title = df$title[df2$from == df$from & df2$to == df$to]
However, this results in too few rows due to the order of from and to being reversed in row 2 of df2. If I introduce an OR condtion, then I get too many results because the connection between 1 and 2 will be matched twice.
My question, then, is how do I effectively "dedup" the title variable to append it to df2?
The expected outcome is this:
from to title
1 1 2 A
2 1 4 A
3 3 1 B
library(dplyr);
merge(mutate(df2, from1 = pmin(from, to), to1 = pmax(from, to)),
mutate(df, from1 = pmin(from, to), to1 = pmax(from, to)),
by = c("from1", "to1"), all.x = T) %>%
select(from1, to1, title) %>% unique()
# from1 to1 title
#1 1 2 A
#3 1 3 B
#4 1 4 A
Another way we can try, where edgeSort function produce unique edges if the two vertices are the same and use match function to match all equal edges.
edgeSort <- function(df) apply(df, 1, function(row) paste0(sort(row[1:2]), collapse = ", "))
df2$title <- df$title[match(edgeSort(df2), edgeSort(df))]
df2
from to title
1 1 2 A
2 1 4 A
3 3 1 B
I guess you can do it in base R by 2 merge statements:
step1 <- merge(df2, df, all.x = TRUE)
step2 <- merge(df2[is.na(step1$title),], df, all.x = TRUE, by.x = c("to", "from"), by.y = c("from", "to"))
rbind(step1[!is.na(step1$title),], step2)
from to title
1 1 2 A
2 1 4 A
3 3 1 B

Remove NAs in row, and move the cell on the right were the NA was located in R also unique values

OK, so I have a data frame in R like this
ID <- c(1, 2, 3)
c1 <- c( 1, 1, NA)
c2 <- c(NA, NA, 5)
c3 <- c(NA, NA, NA)
c4 <- c(2, NA, 5)
c5 <- c(5, 7, 3)
df <- data.frame(ID, c1, c2, c3, c4, c5)
So, this is what I'm looking for
1. Treat every row as a vector
2. Be able to remove all NAs in every row/vector
3. In a given row there can't be repeated values (expect for ID vs a number in other cell)
4. I'm looking to "cut" this row/vector. I don't need 5 values just 2.
I'm doing this for a MAP#k metric, so the order of the numbers (the one on the left is more importante than the next one) son it's important to keep the order.
This is the output that I'm looking for
ID <- c(1, 2, 3)
c1 <- c(1, 1, 5)
c2 <- c(2, 7, 3)
df2 <- data.frame(ID, c1, c2)
Thank you for your help
We loop through the rows of 'df' (using apply with MARGIN as 1), remove the NA elements (!is.na(x)) and get the unique values. Then, if the length of the elements are not the same, the output will be a list ('lst'). We use lengths to get the length of each list element, get theminof it, based on it we subset thelistelements andcbind` with the first column 'ID'.
lst <- apply(df[-1], 1, function(x) unique(x[!is.na(x)]))
dfN <- cbind(df[1], do.call(rbind,lapply(lst, function(x) x[seq(min(lengths(lst)))])))
colnames(dfN)[-1] <- paste0("c", colnames(dfN)[-1])
dfN
# ID c1 c2
#1 1 1 2
#2 2 1 7
#3 3 5 3
NOTE: If the length of unique elements are the same in each row (after removing the NA), the output will be a matrix. Just transpose the output and cbind with the first column.
Or another option is data.table which should be very efficient.
library(data.table)
dM <- melt(setDT(df), id.var="ID", na.rm=TRUE)[,
.(value = unique(value), n = seq(uniqueN(value))), ID]
dcast(dM[dM[, n1 := min(tabulate(ID))][, .I[1:.N <=n1] , ID]$V1],
ID~paste0("c", n), value.var="value")
# ID c1 c2
#1: 1 1 2
#2: 2 1 7
#3: 3 5 3
Ugly but should be efficient (chewed through 3M records in about 20secs and 300K in < 2 secs):
sel <- !is.na(df[-1])
tmp <- unique(data.frame(ID=df$ID[row(df[-1])[sel]], c=df[-1][sel]))
tmp$time <- ave(tmp$ID, tmp$ID, FUN=seq_along)
reshape(tmp[tmp$time <= 2,], idvar="ID", direction="wide", sep="")
# ID c1 c2
#1 1 1 2
#2 2 1 7
#3 3 5 3
Based on akrun data.table idea, I translated the data.table code to dplyr/tidyr (is easier for me to read, that's all). Here is the code
library(dplyr)
library(tidyr)
df_tidy <- df %>%
gather(importance, val, c1:c5) %>%
na.omit %>%
arrange(ID, importance) %>%
group_by(ID) %>%
distinct(ID, val) %>%
mutate(place = seq_len(n())) %>%
filter(place <= 2) %>%
mutate(place = paste("c", place, sep="")) %>%
select(-importance) %>%
spread(place, val)
Thank you akrun and thelatemail !

Resources