How do I efficiently recode groups of dummies conditional on one dummy? - r

I'm trying to recode several dummy variables at once but am struggling to come up with a functioning vectorized solution (alternatively a for loop).
reprex:
library(tidyverse)
library(magrittr)
library(dummies)
library(janitor)
df_raw <- data.frame(
species = as.factor(c("cat", "dog", NA, "dog", "dog")),
weight = rnorm(5, mean = 5, sd = 1),
sex = as.factor(c("m", NA, "f", "f", "m"))
)
df_raw
species weight sex
1 cat 3.025896 m
2 dog 3.223064 <NA>
3 <NA> 5.230367 f
4 dog 4.231511 f
5 dog 5.819032 m
I split the factor variables (species and sex) into dummies but the NA get their own indicators (species_na and sex_na)
df_dummy <- dummies::dummy.data.frame(df_raw,
dummy.classes = "factor",
sep = "_",
omit.constants = TRUE,
all = TRUE) %>%
janitor::clean_names()
species_cat species_dog species_na weight sex_f sex_m sex_na
1 1 0 0 3.025896 0 1 0
2 0 1 0 3.223064 0 0 1
3 0 0 1 5.230367 1 0 0
4 0 1 0 4.231511 1 0 0
5 0 1 0 5.819032 0 1 0
My problem: how do I efficiently recode all of the factor dummies ("indexed" by the prefix, e.g. species_) to NA conditional on value of the _na dummy in the respective group of dummies? In other words, I need to mutate all dummies with the prefix species_ as NA whenever the species_na == 1 etc.
I have come up with the solution below, but I haven't been able to generalize the last step to the entire dataset
factor_vars <- dplyr::select_if(df_raw, is.factor) %>% colnames()
na_labs <- paste(factor_vars,
"na",
sep = "_")
df_dummy <- df_dummy %>%
dplyr::mutate(across(all_of(na_labs),
.fns = list(var = ~ . == 1),
.names = "{fn}_{col}" ))
# --- trial run for one variable only
test <- df_dummy %>%
mutate(species_cat = ifelse(var_species_na == TRUE,
NA,
species_cat))
Any help is appreciated!

How about this?
df_dummy <- df_dummy %>%
mutate(across(c(starts_with("species")), ~ factor(ifelse(species_na == 1, NA, .)))) %>%
mutate(across(c(starts_with("sex")), ~ factor(ifelse(sex_na == 1, NA, .))))
df_dummy
species_cat species_dog species_na weight sex_f sex_m sex_na
1 1 0 0 4.879161 0 1 0
2 0 1 0 5.960176 <NA> <NA> <NA>
3 <NA> <NA> <NA> 5.189566 1 0 0
4 0 1 0 5.165760 1 0 0
5 0 1 0 5.952365 0 1 0

You can try -
library(dplyr)
library(purrr)
df_dummy <- dummies::dummy.data.frame(df_raw,
dummy.classes = "factor",
sep = "_",
omit.constants = TRUE,
all = TRUE) %>%
janitor::clean_names()
factor_vars <- dplyr::select_if(df_raw, is.factor) %>% colnames()
na_labs <- paste(factor_vars,
"na",
sep = "_")
# For every factor, take its group of dummy columns (those sharing the
# "<factor>_" prefix) and set the whole group to NA on the rows where the
# "<factor>_na" indicator equals 1. The per-factor results are column-bound.
# Explicit function arguments are used on purpose: with nested `~` lambdas the
# inner `.x` is the current column, not the factor name from the outer lambda.
map_dfc(factor_vars, function(prefix) {
  # Row mask taken from the original indicator, computed once per factor
  is_missing <- df_dummy[[paste0(prefix, "_na")]] == 1
  df_dummy %>%
    select(starts_with(paste0(prefix, "_"))) %>%
    mutate(across(everything(), function(col) ifelse(is_missing, NA, col)))
})
# species_cat species_dog species_na sex_f sex_m sex_na
#1 1 0 0 0 1 0
#2 0 1 0 NA NA NA
#3 NA NA NA 1 0 0
#4 0 1 0 1 0 0
#5 0 1 0 0 1 0

I have a package on GitHub, {dplyover}, which can create dummy variables in an across-like manner. Below we select all factor variables with where(is.factor) and apply dist_values to each column; dist_values is a wrapper around unique that returns all non-NA values. The function in .fns takes each selected column as .x and applies to it each of the unique values from dist_values as .y.
library(dplyr)
library(dplyover) # https://github.com/TimTeaFan/dplyover
df_raw %>%
mutate(crossover(where(is.factor),
dist_values,
.fns = ~ if_else(.y == .x, 1, 0)))
#> species weight sex species_cat species_dog sex_f sex_m
#> 1 cat 5.281178 m 1 0 0 1
#> 2 dog 4.343656 <NA> 0 1 NA NA
#> 3 <NA> 4.555380 f NA NA 1 0
#> 4 dog 4.990039 f 0 1 1 0
#> 5 dog 4.988497 m 0 1 0 1
Created on 2021-09-13 by the reprex package (v2.0.1)

Related

Construct a variable name using a string and another variable in the same df and then update its value in R

I am trying to update variables in a df selected using a constructed name with a string and another variable in the same data frame.
Say my df is:
df
y index
1 4 1
2 8 5
3 4 3
4 6 2
to which I add five variables m.1 through m.5:
> df
y index m.1 m.2 m.3 m.4 m.5
1 4 1 0 0 0 0 0
2 8 5 0 0 0 0 0
3 4 3 0 0 0 0 0
4 6 2 0 0 0 0 0
using something like this:
createvars <- function(df, n) { mutate(df, "m.{n}" := 0) }
for(i in 1:max(df$index)) {df <- createvars(df, n=i)}
I want to update the variables m.1 through m.5 using something like this m.{n} := index based on conditions that look something like this {n} == index.
The result then should look like:
> df
y index m.1 m.2 m.3 m.4 m.5
1 4 1 1 0 0 0 0
2 8 5 0 0 0 0 5
3 4 3 0 0 3 0 0
4 6 2 0 2 0 0 0
Note, in the actual sample/application:
the number of variables m. will depend on the sample and can be several hundred,
the values in the variables m. will be functions of index and other variables in the df,
the df will ultimately be used in lm(y~) and this might not be the right way to proceed.
Any suggestion how to accomplish this?
Thanks tons for any suggestion!!!!
One dplyr and tibble solution could be:
df %>%
add_column(!!!setNames(rep(0, 5), paste0("m", 1:5))) %>%
mutate(across(starts_with("m"), ~ +(paste0("m", index) == cur_column()) * index))
y index m1 m2 m3 m4 m5
1 4 1 1 0 0 0 0
2 8 5 0 0 0 0 5
3 4 3 0 0 3 0 0
4 6 2 0 2 0 0 0
It is easier with row/column indexing in base R
# // create the 'm.' columns with 0 values
nm1 <- paste0('m.', 1:5)
df[nm1] <- 0
# // assign the elements that corresponds to row/column index with index
df[nm1][cbind(seq_len(nrow(df)), df$index)] <- df$index
-output
df
# y index m.1 m.2 m.3 m.4 m.5
#1 4 1 1 0 0 0 0
#2 8 5 0 0 0 0 5
#3 4 3 0 0 3 0 0
#4 6 2 0 2 0 0 0
Or another option is model.matrix from base R
df[nm1] <- model.matrix(~ factor(df$index, levels = 1:5) - 1) * df$index
Or modify the function createvars and use a for loop
library(stringr)
createvars <- function(data, n) {
data %>%
mutate(!! str_c('m.', n) := case_when(index == n ~ index, TRUE ~ 0L ))
}
for(i in seq_len(max(df$index))) df <- createvars(df, i)
Or another option with rowwise and unnest
library(tidyr)
library(dplyr)
mx <- max(df$index)
df %>%
rowwise %>%
mutate(new = list(replace(numeric(mx), index, index))) %>%
ungroup %>%
unnest_wider(c(new)) %>%
rename_at(vars(starts_with("..")), ~ str_c('m.', seq_along(.)))
data
df <- structure(list(y = c(4L, 8L, 4L, 6L), index = c(1L, 5L, 3L, 2L
)), class = "data.frame", row.names = c("1", "2", "3", "4"))
You can get the data in wide format after creating a dummy column.
library(dplyr)
library(tidyr)
df %>%
mutate(row = row_number(),
col = paste0('m', index),
index1 = index) %>%
complete(col = paste0('m', 1:max(index))) %>%
pivot_wider(names_from = col, values_from = index1, values_fill = 0) %>%
filter(!is.na(row)) %>%
arrange(row) %>%
select(-row)
# y index m1 m2 m3 m4 m5
# <int> <int> <int> <int> <int> <int> <int>
#1 4 1 1 0 0 0 0
#2 8 5 0 0 0 0 5
#3 4 3 0 0 3 0 0
#4 6 2 0 2 0 0 0

Replace dates of many columns with 1 and NA with 0

There are many columns here, and I need to replace the dates with 1 and NA with 0. I would like a dplyr solution. thank you.
df <- data.frame(
id = c(1,2,3),
diabetes = c("12-12-2007",NA,"2-12-2018"),
lipids = c(NA,NA,"12-12-2015"),
stringsAsFactors = FALSE
)
df %>% mutate(across(-id, ~ifelse(is.na(.), 0, 1)))
id diabetes lipids
1 1 1 0
2 2 0 0
3 3 1 1
You can do :
df[-1] <- +(!is.na(df[-1]))
df
# id diabetes lipids
#1 1 1 0
#2 2 0 0
#3 3 1 1

R: adapt mutate call from handling three binary variables to n binary variables

I've got a dataframe with 3 binary variables that relate to time period 1 and three corresponding variables that relate to time 2.
df <- data.frame("user" = c("a","b","c","d","e"), "item_1_time_1" = c(1,0,0,0,NA), "item_2_time_1" = c(1,1,1,0,NA), "item_3_time_1" = c(0,0,1,0,0), "item_1_time_2" = c(1,0,0,0,NA), "item_2_time_2" = c(1,0,0,0,NA), "item_3_time_2" = c(0,0,1,0,1))
df
user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2
1 a 1 1 0 1 1 0
2 b 0 1 0 0 0 0
3 c 0 1 1 0 0 1
4 d 0 0 1 0 0 0
5 e NA NA 0 NA NA 1
I would like to know if an observation has a 1 for a given item during period 1 but not during period 2. Moreover, I would like to know if an observation has any instance in which an item is 1 during period 1 and not period 2.
So the ideal output would look like
df2 <- data.frame("user" = c("a","b","c","d","e"), "item_1_time_1" = c(1,0,0,0,NA), "item_2_time_1" = c(1,1,1,0,NA), "item_3_time_1" = c(0,0,1,1,0), "item_1_time_2" = c(1,0,0,0,NA), "item_2_time_2" = c(1,0,0,0,NA), "item_3_time_2" = c(0,0,1,0,1), "item_1_check" = c(1,1,1,1,1), "item_2_check" = c(1,0,0,1,1), "item_3_check" = c(1,1,1,0,1), item_check = c(1,0,0,0,1))
df2
user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2 item_1_check item_2_check item_3_check item_check
1 a 1 1 0 1 1 0 1 1 1 1
2 b 0 1 0 0 0 0 1 0 1 0
3 c 0 1 1 0 0 1 1 0 1 0
4 d 0 0 1 0 0 0 1 1 0 0
5 e NA NA 0 NA NA 1 1 1 1 1
So far I've tried
library(tidyverse)
df2 <- df %>%
mutate(across(ends_with('time_2'), replace_na, 0)) %>%
mutate(across(ends_with('time_1'), replace_na, 0)) %>%
mutate(item_1_check = if_else(item_1_time_1 == 1 & item_1_time_2 == 0, 0, 1),
item_2_check = if_else(item_2_time_1 == 1 & item_2_time_2 == 0, 0, 1),
item_3_check = if_else(item_3_time_1 == 1 & item_3_time_2 == 0, 0, 1)) %>%
mutate(item_check = pmin(item_1_check, item_2_check, item_3_check))
I would like to generalize the above mutate calls so that they can handle n many items rather than just 3. Is there a way that I can use ends_with('check') for the final mutate? The variable names don't vary but for the item number and time period.
An option would be to reshape to 'long' format and do this once
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -user, names_to = c('group', '.value'),
names_sep="_(?=time)") %>%
mutate(across(starts_with('time'), replace_na, 0)) %>%
group_by(group) %>%
transmute(user, check = !(time_1 & !time_2)) %>%
ungroup %>%
group_by(user) %>%
summarise(check = min(check), .groups = 'drop') %>%
right_join(df, .) %>%
select(names(df), check)
# user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2 check
#1 a 1 1 0 1 1 0 1
#2 b 0 1 0 0 0 0 0
#3 c 0 1 1 0 0 1 0
#4 d 0 0 0 0 0 0 1
#5 e NA NA 0 NA NA 1 1
Or using base R
df$check <- +( Reduce(`&`, lapply(split.default(replace(df[-1],
is.na(df[-1]), 0), sub("time_\\d+", "", names(df)[-1])),
function(x) !(x[[1]] & !x[[2]]))))

Turn row values from multiple columns into column names in R?

I have a data frame that looks as follows:
state1 state1_pp state2 state2_pp state3 state3_pp
<chr> <chr> <chr> <chr> <chr> <chr>
1 0 0.995614 F 0.004386 NA 0
2 0 1 NA 0 NA 0
3 0 1 NA 0 NA 0
I want the values from each of the rows to be the column names and the numeric values to be the row values:
0 F NA
<chr> <chr> <chr>
1 0.995614 0.004386 0
2 1 0 0
3 1 0 0
How do I do this in R?
Or a more complex scenario:
state1 state1_pp state2 state2_pp state3 state3_pp
1 0 0.995614 F 0.004386 NA 0
2 A 1 B 0 C 0
3 D 0.7 B 0.3 NA 0
This is what I want:
0 A D F B C NA
1 0.995614 0 0 0.004386 0 0 0
2 0 1 0 0 0 0 0
3 0 0 0.7 0 0.3 0 0
First a warning, having column names that are numeric (like 1) or are reserved R keywords (like NA) can cause you all sorts of errors. But if you must do it, I suggest the following:
library(dplyr)
# extract title row
headers <- df %>%
head(1) %>%
select(state1, state2, state3) %>%
unlist(use.names = FALSE) %>%
as.character()
# replace NA with "NA"
headers[is.na(headers)] = "NA"
# drop columns that are not wanted
new_df <- df %>%
select(-state1, -state2, -state3)
# replace column names
colnames(new_df) <- headers
In order to refer to your new columns you will probably need to use backticks: `
So with your new column names 0, F and NA you can call df$F but you can not call df$NA or df$0. Instead you will have to call df$`0` and df$`NA`.
Here's an attempt using dplyr and tidyr :
library(dplyr)
library(tidyr)
df %>%
mutate(row = row_number()) %>%
mutate_all(as.character) %>%
pivot_longer(cols = -row) %>%
mutate(name = sub('\\d+', '', name)) %>%
group_by(name, row) %>%
mutate(row1 = row_number()) %>%
pivot_wider() %>%
group_by(state, row) %>%
mutate(row1 = row_number()) %>%
pivot_wider(names_from = state, values_from = state_pp,
values_fill = list(state_pp = 0)) %>%
ungroup() %>%
select(-row, -row1)
# A tibble: 3 x 7
# `0` F `NA` A B C D
# <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 0.995614 0.004386 0 0 0 0 0
#2 0 0 0 1 0 0 0
#3 0 0 0 0 0.3 0 0.7

Use mutate_at() to create multiple binary variables from the values of a single variable

I have some variables that contain the following support values {a, b, c, ... k} and I wanted to create multiple binary variables for each response. For example, var_a would be equivalent to as.numeric(variable name very long== "a"), var_b would be equivalent to as.numeric(variable name very long== "b") and so on. However, in some of the variables, they don't go neatly from a:k. Some might have skipped a letter or two.
I know how to use mutate_at when I have multiple variables that I want to change, but what if I only have one variable from which I want to create multiple variables all at once?
What I have been doing so far is this:
df <- df %>% mutate(var_a = as.numeric(`variable name very long` == "a"),
var_b = as.numeric(`variable name very long` == "b"),
...)
Except of course there are more than two variables that I want to create. Is there an easier way to do this? And I also use mutate as a way to shorten the variable name. I've also tried creating a function that might be able to do this for whatever variable and value I want it to be since I have to do this often, but I wasn't able to get it to work:
varname <- function(newvar, var, value){
df <- df %>% mutate(newvar = as.numeric(var == "value"))
}
varname("var_a", "`variable name very long`", "a")
Any suggestions are deeply appreciated. Thank you!
We could use map2 to loop over the unique elements in the column, along with the vector of new column names, transmute to create the column, and bind the output with the original data
library(dplyr)
library(purrr)
library(stringr)
un1 <- sort(as.character(unique(df[["variable name very long"]])))
un2 <- str_c('var_', un1)
map2_dfc(un1, un2, ~ df %>%
transmute(!! .y := +(`variable name very long` == .x))) %>%
bind_cols(df, .)
# A tibble: 20 x 7
# `variable name very long` val var_a var_b var_c var_d var_e
# * <chr> <dbl> <int> <int> <int> <int> <int>
# 1 c -0.710 0 0 1 0 0
# 2 b -1.04 0 1 0 0 0
# 3 c -0.798 0 0 1 0 0
# 4 e 0.319 0 0 0 0 1
# 5 b 1.87 0 1 0 0 0
# 6 b -0.317 0 1 0 0 0
# 7 a -0.773 1 0 0 0 0
# 8 d -1.44 0 0 0 1 0
# 9 a -0.348 1 0 0 0 0
#10 a -0.421 1 0 0 0 0
#11 e 1.06 0 0 0 0 1
#12 e 0.528 0 0 0 0 1
#13 a 3.13 1 0 0 0 0
#14 e -0.546 0 0 0 0 1
#15 e -1.05 0 0 0 0 1
#16 d -0.687 0 0 0 1 0
#17 e -1.13 0 0 0 0 1
#18 b -0.489 0 1 0 0 0
#19 a 1.85 1 0 0 0 0
#20 d -0.0376 0 0 0 1 0
Or another option is pivot_wider
library(tidyr)
df %>%
mutate(rn = row_number(), n = 1,
newcol = str_c('var_', `variable name very long`)) %>%
pivot_wider(names_from = newcol, values_from = n, values_fill = list(n = 0))
Or in base R with model.matrix
cbind(df, model.matrix(~ `variable name very long` -1, df))
data
set.seed(24)
df <- tibble(`variable name very long` = sample(letters[1:5],
20, replace = TRUE), val = rnorm(20))

Resources