Add multiple columns with dplyr and fill cells based on condition - r

I am trying to:
1) add multiple columns that correspond to existing columns (e.g., a1 exists and add a1_yes).
2) Next, if a given cell contains 1:3, put 1 in a#_yes column, otherwise, put 0.
I can easily do this with base R, but I'm also trying to make it work with dplyr.
My data:
df <- data.frame(a1 = c(1, 2, 0, NA, NA),
a2 = c(NA, 1, 2, 3, 3))
With base R:
# Add one 0/1 indicator column per source column (a1 -> a1_yes, a2 -> a2_yes).
# Vectorised: %in% works on a whole column at once and returns FALSE for NA,
# so missing cells become 0 without any row-by-row loop.
src_cols <- paste0("a", 1:2)
df[paste0(src_cols, "_yes")] <- lapply(
  df[src_cols],
  function(x) as.numeric(x %in% c(1, 2, 3))
)
> df
a1 a2 a1_yes a2_yes
1 1 NA 1 0
2 2 1 1 1
3 0 2 0 1
4 NA 3 0 1
5 NA 3 0 1
Thank you

Here is an option, assuming you want to do this to all columns of your dataframe
library(dplyr)
df %>%
mutate_all(., list('yes' = ~ifelse(.x %in% c(1:3), 1, 0)))
# a1 a2 a1_yes a2_yes
#1 1 NA 1 0
#2 2 1 1 1
#3 0 2 0 1
#4 NA 3 0 1
#5 NA 3 0 1
Edits
As @Akrun mentioned, you can do this without ifelse, using as.integer or +
df %>%
mutate_all(., list('yes' = ~as.integer(.x %in% 1:3)))
You can also use mutate_at to select specific vars
df %>%
mutate_at(vars(a1, a2), list('yes' = ~as.integer(.x %in% 1:3)))

This will work without editing no matter how many columns you have if they are all in this format
# Build a 0/1 indicator for every column of df, suffix the new columns with
# "_yes", and append them to the original columns.
df %>%
  # Flag only the values 1, 2 and 3 (as the question asks); %in% is FALSE for
  # NA, so missing cells become 0. Other values such as 4 or -1 also become 0.
  mutate_all(., function(x) ifelse(x %in% 1:3, 1, 0)) %>%
  rename_all(., function(x) paste0(x, "_yes")) %>%
  bind_cols(df, .)

Here's a dplyr solution:
library(dplyr)
df <- data.frame(a1 = c(1, 2, 0, NA, NA),
a2 = c(NA, 1, 2, 3, 3))
df2 <- df %>%
mutate(a1_yes = ifelse(a1 == 0 | is.na(a1), 0, 1),
a2_yes = ifelse(a2 == 0 | is.na(a2), 0, 1))
Instead of putting the conditions so that the new columns' values are 1, I put the conditions so that they're equal to zero.

Here is a solution
df <- data.frame( a1 = c(1,2,0,NA,NA),
a2 = c(NA,1,2,3,3))
check_values <- c(1,2,3)
df %>% mutate(a1_yes = ifelse(a1 %in% check_values,1,0),
a2_yes =ifelse(a2 %in% check_values,1,0))

Related

Calculate row sums by variable names

what's the easiest way to calculate row-wise sums? For example if I wanted to calculate the sum of all variables with "txt_"? (see example below)
df <- data.frame(var1 = c(1, 2, 3),
txt_1 = c(1, 1, 0),
txt_2 = c(1, 0, 0),
txt_3 = c(1, 0, 0))
base R
We can first use grepl to find the column names that start with txt_, then use rowSums on the subset.
# Single-bracket list-style indexing always returns a data frame, so rowSums()
# still works when only one column name starts with "txt_".
rowSums(df[startsWith(names(df), "txt_")])
[1] 3 1 0
If you want to bind it back to the original dataframe, then we can bind the output to the original dataframe.
# Append the row sums of the columns whose names start with "txt_"; list-style
# indexing keeps a data frame even when only one column matches.
cbind(df, sums = rowSums(df[startsWith(names(df), "txt_")]))
var1 txt_1 txt_2 txt_3 sums
1 1 1 1 1 3
2 2 1 0 0 1
3 3 0 0 0 0
Tidyverse
library(tidyverse)
df %>%
mutate(sum = rowSums(across(starts_with("txt_"))))
var1 txt_1 txt_2 txt_3 sum
1 1 1 1 1 3
2 2 1 0 0 1
3 3 0 0 0 0
Or if you want just the vector, then we can use pull:
df %>%
mutate(sum = rowSums(across(starts_with("txt_")))) %>%
pull(sum)
[1] 3 1 0
Data Table
Here is a data.table option as well:
library(data.table)
dt <- as.data.table(df)
dt[ ,sum := rowSums(.SD), .SDcols = grep("txt_", names(dt))]
dt[["sum"]]
# [1] 3 1 0
Another dplyr option:
df %>%
rowwise() %>%
mutate(sum = sum(c_across(starts_with("txt"))))

If statement with two conditions and NA

I am looking to use a conditional statement to access date rows which are before 0021-01-11 and have NA value in a specific column (People_vaccinated for example). For those rows I wanted to impute with zero.
I want to use an IF statement with (condition1 AND condition 2).
Condition 1 can be is.na(df$People_vaccinated) (a comparison with == NA always yields NA, never TRUE) and condition 2 can be df$date < 'given date'
Maybe this will help -
# Example data: Date holds zero-padded "YYYY-MM-DD" strings; a and b contain NAs.
df <- data.frame(Date = c('0021-01-07', '0021-01-08','0021-01-11', '0021-01-12'),
a = c(2, NA, 3, NA),
b = c(1, NA, 2, 3))
cutoff <- '0021-01-11'
# Compare the dates themselves instead of relying on row positions: this works
# for unsorted rows and when the cut-off date is not present in the data.
# Zero-padded ISO dates sort correctly as plain strings.
fill_rows <- as.character(df$Date) <= cutoff & is.na(df$a)
df$a[fill_rows] <- 0
df
# Date a b
#1 0021-01-07 2 1
#2 0021-01-08 0 NA
#3 0021-01-11 3 2
#4 0021-01-12 NA 3
Or using dplyr -
library(dplyr)
# Replace NA in `a` with 0 for rows dated on or before the cut-off.
# The dates are compared directly (zero-padded ISO strings sort correctly), so
# the result does not depend on row order or on the cut-off being present.
df <- df %>%
  mutate(a = replace(a,
                     as.character(Date) <= '0021-01-11' & is.na(a), 0))
df

Subset a new dataframe with binary columns

I would like to identify binary columns in a data.frame and make a new df based on that condition.
For example, this table
my.table <-read.table(text="a,b,c
0,2,0
0.25,1,1
1,0,0", header=TRUE, as.is=TRUE,sep = ",")
Maybe you can keep columns that have only 0 and 1 value.
Filter(function(x) all(x %in% c(0, 1)), my.table)
# c
#1 0
#2 1
#3 0
Few other variations to do the same thing :
library(dplyr)
library(purrr)
#2
my.table[colSums(my.table == 0 | my.table == 1) == nrow(my.table)]
#3
my.table %>% select(where(~all(. %in% c(0, 1))))
#4
keep(my.table, ~all(. %in% c(0, 1)))
We can use base R
my.table[colSums(sapply(my.table, `%in%`, c(0, 1))) == nrow(my.table)]
# c
#1 0
#2 1
#3 0

Creating a function to remove columns with different names from a list of dataframes

I have many dataframes that contain the same data, except for a few column differences between them that I want to remove. Here's something similar to what I have:
df1 <- data.frame(X = c(1, 2, 3, 4, 5),
var1 = c('a', 'b', 'c', 'd', 'e'),
var2 = c(1, 1, 0, 0, 1))
df2 <- data.frame(X..x = c(1, 2, 3, 4, 5),
X..y = c(1, 2, 3, 4, 5),
var1 = c('f', 'g', 'h', 'i', 'j'),
var2 = c(0, 1, 0, 1, 1))
df_list <- list(df1=df1,df2=df2)
I am trying to create a function to remove the X, X..x, and X..y columns from each of the dataframes. Here's what I've tried with the given error:
# Attempted helper: drop the X, X..x and X..y columns from one data frame.
# subset() resolves the bare names in `select` against df's own columns, so a
# data frame that lacks any of them fails with "object ... not found"
# (the error quoted below).
remove_col <- function(df){
df = subset(df, select = -c(X, X..x, X..y))
return(df)
}
df_list <- lapply(df_list, remove_col)
# Error in eval(substitute(select), nl, parent.frame()) :
# object 'X..x' not found
I'm running into problems because not all dataframes contain X, and similarly not all dataframes contain X..x and X..y. How can I update the function so that it can be applied to all dataframes in the list and successfully remove its given columns?
Using R version 3.5.1, Mac OS X 10.13.6
You can try:
#Function
#' Drop the named columns from a data frame, ignoring names that are absent.
#'
#' @param df A data frame.
#' @param name Character vector of column names to remove.
#' @return `df` without the columns listed in `name`; always a data frame.
remove_col <- function(df, name) {
  # A logical mask is safe when nothing matches: negative indexing with an
  # empty index (`df[, -integer(0)]`) would drop every column instead of none.
  keep <- !(names(df) %in% name)
  # List-style indexing never collapses a single remaining column to a vector.
  df[keep]
}
df_list <- lapply(df_list, remove_col,name=c('X', 'X..x', 'X..y'))
$df1
var1 var2
1 a 1
2 b 1
3 c 0
4 d 0
5 e 1
$df2
var1 var2
1 f 0
2 g 1
3 h 0
4 i 1
5 j 1
if you want to keep only the columns with "var"
lapply(df_list, function(x) x[grepl("var",colnames(x))])
or if you really just want those removed explicitly
lapply(df_list, function(x) x[!grepl("^X$|^X\\.\\.x$|^X\\.\\.y$",colnames(x))])
$df1
var1 var2
1 a 1
2 b 1
3 c 0
4 d 0
5 e 1
$df2
var1 var2
1 f 0
2 g 1
3 h 0
4 i 1
5 j 1
Instead of checking each list element for the same column names, it can be automated if we can extract the intersecting column names across the list. Loop over the list, get the column names, find the intersecting elements with Reduce and use that to subset the columns
nm1 <- Reduce(intersect, lapply(df_list, names))
lapply(df_list, `[`, nm1)
#$df1
# var1 var2
#1 a 1
#2 b 1
#3 c 0
#4 d 0
#5 e 1
#$df2
# var1 var2
#1 f 0
#2 g 1
#3 h 0
#4 i 1
#5 j 1
Or with tidyverse
library(dplyr)
library(purrr)
map(df_list, names) %>%
reduce(intersect) %>%
map(df_list, select, .)

Variable names as Input in an R Function

I have a dataframe with several numeric variables along with factors. I wish to run over the numeric variables and replace the negative values with missing values (NA). I couldn't do that.
My alternative idea was to write a function that gets a dataframe and a variable, and does it. It didn't work either.
My code is:
# Attempted helper: set the negative values of column `var` in `df` to NA.
# NOTE(review): `df$var` looks up a column literally named "var", not the
# column whose name is held in the argument `var`; this appears to be the cause
# of the error quoted below. The last expression is an assignment, so the
# modified data frame is not returned either.
NegativeToMissing = function(df,var)
{
df$var[df$var < 0] = NA
}
Error in $<-.data.frame(`*tmp*`, "var", value = logical(0)) : replacement has 0 rows, data has 40
what am I doing wrong ?
Thank you.
Here is an example with some dummy data.
df1 <- data.frame(col1 = c(-1, 1, 2, 0, -3),
col2 = 1:5,
col3 = LETTERS[1:5])
df1
# col1 col2 col3
#1 -1 1 A
#2 1 2 B
#3 2 3 C
#4 0 4 D
#5 -3 5 E
Now find columns that are numeric
numeric_cols <- sapply(df1, is.numeric)
And replace negative values
df1[numeric_cols] <- lapply(df1[numeric_cols], function(x) replace(x, x < 0 , NA))
df1
# col1 col2 col3
#1 NA 1 A
#2 1 2 B
#3 2 3 C
#4 0 4 D
#5 NA 5 E
You could also do
df1[df1 < 0] <- NA
With tidyverse, we can make use of mutate_if
library(tidyverse)
# Replace negative values with NA in every numeric column.
# funs() is deprecated in dplyr; a formula lambda is the supported way to pass
# an inline function to mutate_if().
df1 %>%
  mutate_if(is.numeric, ~ replace(.x, .x < 0, NA))
If you still want to change only one selected variable, a solution with dplyr would be to use non-standard evaluation:
library(dplyr)
#' Set the negative values of one column to NA.
#'
#' @param df A data frame.
#' @param var A quosure identifying the column, e.g. `quo(val1)`.
#' @return `df` with the negative entries of that column replaced by NA.
NegativeToMissing <- function(df, var) {
  # Name of the output column, taken from the quosure so the column is
  # overwritten in place.
  target_name <- quo_name(var)
  mutate(
    df,
    !!target_name := ifelse(!!var < 0, NA, !!var)
  )
}
NegativeToMissing(data, var=quo(val1)) # use quo() function without ""
# val1 val2
# 1 1 1
# 2 NA 2
# 3 2 3
Data used:
data <- data.frame(val1 = c(1, -1, 2),
val2 = 1:3)
data
# val1 val2
# 1 1 1
# 2 -1 2
# 3 2 3

Resources