pass formula expressions in dplyr::mutate - r

Is there a way to pass a formula expression into mutate to create a new variable?
expression1 <- formula(ifelse(Var1 == 9, 0, Var1))
df <- data.frame(Var1 = sample(1:10, 10, replace = TRUE),
Var2 = sample(1:10, 10, replace = TRUE)) %>%
mutate(new_var = expression1)

If it is a string, then we can parse it
library(dplyr)
# The expression arrives as text; its content is kept exactly as given.
expression1 <- "ifelse(Var1 == 9, 0, Var1)"
df %>%
  # Parse the string into an expression and inject it with `!!`, so mutate()
  # evaluates it against the columns of df. This avoids relying on eval()'s
  # default evaluation environment happening to be the data mask.
  mutate(new_var = !!rlang::parse_expr(expression1))
-output
# Var1 Var2 new_var
#1 7 7 7
#2 5 7 5
#3 5 2 5
#4 4 6 4
#5 7 9 7
#6 9 3 0
#7 9 5 0
#8 2 9 2
#9 9 2 0
#10 10 9 10
data
df <- data.frame(Var1 = sample(1:10, 10, replace = TRUE),
Var2 = sample(1:10, 10, replace = TRUE))

rlang prefers to use expressions or quosures rather than formulas. It would be better to use
expression1 <- rlang::expr(ifelse(Var1 == 9, 0, Var1))
df <- data.frame(Var1 = sample(1:10, 10, replace = TRUE),
Var2 = sample(1:10, 10, replace = TRUE)) %>%
mutate(new_var = !!expression1)
First you create the expression with the expr() function. Then you "inject" that into the dplyr expression you want to call with !!

Related

Creating a new variable adding 1 each time a condition is met in R

I am working with a survey data set in which each observation (respondent) is represented by its own row. I want to create a new (numeric) variable which counts the number of times a condition is met by other variables per row. More specifically, the data frame contains several numerical variables (var1, var2, var3 in the example below). Each time a value of those variables is >= 3 and not NA, the new variable (desiredvar) should increase by 1. As you can see in the example, the desired variable takes the value 2 for the first row, since var1 and var3 are both >= 3.
df1 <- data.frame(var1 = c(3, NA, 2, 1),
var2 = c(0, 0, 2, 1),
var3 = c(8, NA, 5, 6),
desiredvar = c(2, 0, 1, 1))
var1 var2 var3 desiredvar
1 3 0 8 2
2 NA 0 NA 0
3 2 2 5 1
4 1 1 6 1
I am assuming that it should be relatively easy to code that with a for loop and/or apply, but I am not very experienced with R. Would appreciate any help!
Best,
Carlo
You can use rowSums with na.rm = TRUE:
# Count only the survey variables: comparing the whole data frame would also
# test an existing desiredvar column, so a stored count of 3 would be counted.
df1$desiredvar <- rowSums(df1[c("var1", "var2", "var3")] >= 3, na.rm = TRUE)
or with apply:
# Same count, row by row. Only var1..var3 are compared, so an existing
# desiredvar column never takes part in the count.
df1$desiredvar <- apply(df1[c("var1", "var2", "var3")] >= 3, 1, sum, na.rm = TRUE)
var1 var2 var3 desiredvar
1 3 0 8 2
2 NA 0 NA 0
3 2 2 5 1
4 1 1 6 1
In dplyr, you could use the abovementioned answers, or use rowwise and c_across:
library(dplyr)
df1 %>%
  # rowwise() makes sum() operate on one row at a time
  rowwise() %>%
  # c_across() collects var1..var3 of the current row into one vector;
  # na.rm = TRUE drops the NA comparisons so they do not make the sum NA
  mutate(desiredvar = sum(c_across(var1:var3) >= 3, na.rm = TRUE)) %>%
  # drop the row-wise grouping so later verbs work on whole columns again
  ungroup()

Second Most Common Element in Each Row

I have this dataset in R:
library(stringr)
set.seed(999)
col1 = sample.int(5, 100, replace = TRUE)
col2 = sample.int(5, 100, replace = TRUE)
col3 = sample.int(5, 100, replace = TRUE)
col4 = sample.int(5, 100, replace = TRUE)
col5 = sample.int(5, 100, replace = TRUE)
col6 = sample.int(5, 100, replace = TRUE)
col7 = sample.int(5, 100, replace = TRUE)
col8 = sample.int(5, 100, replace = TRUE)
col9 = sample.int(5, 100, replace = TRUE)
col10 = sample.int(5, 100, replace = TRUE)
d = data.frame(id = 1:10, seq = c(paste(col1, collapse = ""), paste(col2, collapse = ""), paste(col3, collapse = ""), paste(col4, collapse = ""), paste(col5, collapse = ""), paste(col6, collapse = ""), paste(col7, collapse = ""), paste(col8, collapse = ""), paste(col9, collapse = ""), paste(col10, collapse = "")))
For each row, I would like to create new variables:
d$most_common: the most common element in each row
d$second_most_common: the second most common element in each row
d$third_most_common: the third most common element in each row
I tried to do this with the following function (Find the most frequent value by row):
#' Most frequent value in each row of a matrix or data frame
#'
#' @param x A matrix or data.frame; any other input raises an error.
#' @param ties How ties between equally frequent values are broken: one of
#'   "random", "first" or "last". `NULL` (the default) means "random".
#' @param include.na If `TRUE`, the frequency table is built with
#'   `useNA = "ifany"`; otherwise with `useNA = "no"`.
#' @return A character vector with one entry per row of the frequency table,
#'   holding the column label of the highest count in that row.
rowMode <- function(x, ties = NULL, include.na = FALSE) {
  # `||` rather than `|`: an `if` condition has to be a single TRUE/FALSE
  if (!(is.matrix(x) || is.data.frame(x))) {
    stop("Your data is not a matrix or a data.frame.")
  }
  if (is.null(ties)) {
    # default tie-breaking rule when none was requested
    ties <- "random"
  } else if (length(ties) != 1L || !(ties %in% c("random", "first", "last"))) {
    # the length check makes a vector of methods fail with this message
    # instead of an obscure error from `&&` on a non-scalar
    stop("Your ties method is not one of 'random', 'first' or 'last'.")
  }
  use_na <- if (include.na) "ifany" else "no"
  # cross-tabulate row index against cell value: one table row per row of x
  rft <- table(c(row(x)), unlist(x), useNA = use_na)
  # for every table row, pick the label of the column with the highest count
  colnames(rft)[max.col(rft, ties.method = ties)]
}
rowMode(d[1,1])
This gave me an error:
Error in rowMode(d[1, 1]) : Your data is not a matrix or a data.frame.
Which is a bit confusing, seeing as "d" is a data.frame.
Is there an easier way to do this?
Thank you!
You can do this by splitting the long string into single characters, pivoting longer, counting instances by id and character, and taking the top 3.
Here is an approach using data.table
library(data.table)
setDT(d)
melt(d[, tstrsplit(seq,""), id], id.vars = "id")[, .N, .(id, value)][order(-N), .SD[1:3][,nth:=.I], id]
Output (first six rows of 30):
id value N nth
1: 2 2 30 1
2: 2 1 22 2
3: 2 4 19 3
4: 3 3 28 1
5: 3 2 23 2
6: 3 4 20 3
Here is a similar approach using dplyr with unnest() to make long:
d %>%
group_by(id) %>%
mutate(chars = strsplit(seq,"")) %>%
unnest(chars) %>%
count(id, chars,sort = T) %>%
slice_head(n=3)
Output:
id chars n
<int> <chr> <int>
1 1 1 24
2 1 5 20
3 1 2 19
4 2 2 30
5 2 1 22
6 2 4 19
7 3 3 28
8 3 2 23
9 3 4 20
10 4 1 26
If you need the variables "most_common", "second_most_common" and "third_most_common" as columns:
You can use mutate together with str_count, which counts how often each character occurs in a string
library(dplyr)
library(stringr)
# the digits that can occur in `seq`; position i in `r` is the digit i
r <- as.character(1:5)
d |>
  group_by(id) |>
  # str_count(seq, r) gives one count per digit for the row's string.
  # order(..., decreasing = TRUE) ranks the digit positions by count, so the
  # first three entries are the three most frequent digits. Ranking all
  # digits at once means tied counts produce different digits rather than
  # the same digit being reported twice.
  mutate(
    most_common = order(str_count(seq, r), decreasing = TRUE)[1],
    second_most_common = order(str_count(seq, r), decreasing = TRUE)[2],
    third_most_common = order(str_count(seq, r), decreasing = TRUE)[3]
  )
id seq most_common second_most_com… third_most_comm…
<int> <chr> <int> <int> <int>
1 1 3451122353321532415512241532113224441251251254542314534141431523132515542431525553… 1 5 2
2 2 1432431521432121553144243252433424314222143112242423421524144222151123234314255321… 2 1 4
3 3 4232245131422525453443332555312143535325221555344453323342533222344112134311342335… 3 2 4
4 4 4252525524252335331144111244343534224454131341553141342131354215143133213214314241… 1 3 4
5 5 2223513245222513345115334422121115412343225125312335414233115453235322543311352331… 3 2 1
6 6 3244331444151221411123513334135553324122122233134315145451545423111325253225325141… 1 1 2
7 7 4353332532552141211553131123521145214552211231144155553152131124221522333222343355… 5 1 3
8 8 1432215433134223221222143432454314232514255344213444342235252213324245413213554121… 2 4 3
9 9 2335142431432434123121254343455134511124323335211514354553145531115232541551252421… 1 1 3
10 10 1552245312213342315524134513123511112311314321112334533252141242212345432435421535… 1 3 2

How to create a new column with conditional logic, based on several values found in multiple columns?

I have a data set of birth defects (test), in which each row is a case, with a different 5-way combination of defects. The first five columns of the data set (Defect_A, Defect_B, Defect_C, Defect_D, Defect_E) are the defect numbers that make up this combination.
I want to create a new column called “comments” that outputs a comment based on the following conditional logic:
If a case/row has any of the following defects (1, 2, 3, 4) in columns 1:5, comments = “conjoined”
Ifelse a case has any TWO of the following defects (5, 6, 7, 8) in columns 1:5, comments = “spina bifida”
Ifelse a case has any one of the following defects (5, 6, 7, 8) AND one of the following defects (9,10,11,12,13) in columns 1:5, comments = “heterodaxy”
Ifelse a case has any THREE of the following defects (14,15,16,17,18) in columns 1:5, comments = “vacterl”
Defect_A Defect_B Defect_C Defect_D Defect_E
case1 12 3 13 17 9
case2 20 13 6 7 3
case3 11 10 4 20 12
case4 13 7 2 18 3
case5 5 2 15 11 13
case6 8 1 15 19 4
case7 11 7 19 10 1
case8 9 14 15 11 16
case9 18 10 14 16 8
case10 19 7 8 10 2
How would I go about doing this? I’ve included sample code below.
[edit]
# Sample data set
set.seed(99)
case1 = sample(1:20, 5, replace=FALSE)
case2 = sample(1:20, 5, replace=FALSE)
case3 = sample(1:20, 5, replace=FALSE)
case4 = sample(1:20, 5, replace=FALSE)
case5 = sample(1:20, 5, replace=FALSE)
case6 = sample(1:20, 5, replace=FALSE)
case7 = sample(1:20, 5, replace=FALSE)
case8 = sample(1:20, 5, replace=FALSE)
case9 = sample(1:20, 5, replace=FALSE)
case10 = sample(1:20, 5, replace=FALSE)
test<-data.frame(rbind(case1, case2, case3, case4, case5, case6, case7, case8, case9, case10))
colnames(test)<- c("Defect_A", "Defect_B", "Defect_C", "Defect_D", "Defect_E")
test
# Conditions
any <- c(1,2,3,4) # for condition 1
any_2 <- c(5,6,7,8) # for conditions 2 and 3
any_2_plus <- c(9,10,11,12,13) # for condition 3
any_3 <- c(14,15,16,17,18) # for condition 4
With this dataframe :
# Sample data set
df = data.frame(Defect_A = sample(1:30, 10, replace=TRUE),
Defect_B = sample(1:30, 10, replace=TRUE),
Defect_C = sample(1:30, 10, replace=TRUE),
Defect_D = sample(1:30, 10, replace=TRUE),
Defect_E = sample(1:30, 10, replace=TRUE))
# Conditions
any <- c(1,2,3,4) # for condition 1
any_2 <- c(5,6,7,8) # for conditions 2 and 3
any_2_plus <- c(9,10,11,12,13) # for condition 3
any_3 <- c(14,15,16,17,18) # for condition 4
You may use several ifelse
# Classify each case from the defect codes in columns 1:5 only, so the result
# does not change if df already carries a comments column.
# `any`, `any_2`, `any_2_plus` and `any_3` are the code vectors defined above.
df$comments <- apply(df[, 1:5], 1, function(x) {
  # number of the row's defects that fall into each code set
  n_any <- sum(x %in% any)
  n_any_2 <- sum(x %in% any_2)
  n_any_2_plus <- sum(x %in% any_2_plus)
  n_any_3 <- sum(x %in% any_3)
  # the conditions are checked in priority order; the first match wins
  if (n_any >= 1) {
    "conjoined"
  } else if (n_any_2 >= 2) {
    "spina bifida"
  } else if (n_any_2 >= 1 && n_any_2_plus >= 1) {
    "heterodaxy"
  } else if (n_any_3 >= 3) {
    "vacterl"
  } else {
    # a real missing value, so is.na(df$comments) finds unclassified cases
    NA_character_
  }
})
Adapt the conditions as needed.

column to rows but holding on to common column names

I have a data frame that looks like this:
data.frame(group1_a_mu = 10, group1_b_sd = 4, group1_c_xx = 5, group2_a_mu=1, group2_b_sd=2, gorup2_c_xx = 14, stringsAsFactors = FALSE)
group1_a_mu group1_b_sd group1_c_xx group2_a_mu group2_b_sd gorup2_c_xx
1 10 4 5 1 2 14
and I would like to transform it to this:
mu sd xx
group1 10 4 5
group2 1 2 14
how can one do that?
You could try the following (based on the data from the original post):
library(dplyr)
library(tidyr)
data.frame(group1_a = 10, group1_b = 4, group1_c = 5, group2_a=1, group2_b=2, group2_c = 14, stringsAsFactors = FALSE) %>%
gather(key, val) %>%
separate(key, c('group_name', 'subgroup_name'), sep = '_') %>%
spread(subgroup_name, val)
## group_name a b c
## 1 group1 10 4 5
## 2 group2 1 2 14
For the situation where there are 2 _ characters (updated post), the following approach temporarily modifies the _ character. The alternative is to use the look ahead or look behind operators in the separate regular expression (sep).
data.frame(group1_a_mu = 10, group1_b_sd = 4, group1_c_xx = 5, group2_a_mu=1, group2_b_sd=2, group2_c_xx = 14, stringsAsFactors = FALSE) %>%
gather(key, val) %>%
mutate(key = sub('_', '|', key)) %>% ## Temporary change of '_' to '|'
separate(key, c('group_name', 'subgroup_name'), sep = '_') %>%
spread(subgroup_name, val) %>%
mutate(group_name = sub('[|]', '_', group_name)) ## Change back to '_'
## group_name mu sd xx
## 1 group1_a 10 NA NA
## 2 group1_b NA 4 NA
## 3 group1_c NA NA 5
## 4 group2_a 1 NA NA
## 5 group2_b NA 2 NA
## 6 group2_c NA NA 14
Using the positive look behind operator will give the same results.
data.frame(group1_a_mu = 10, group1_b_sd = 4, group1_c_xx = 5, group2_a_mu=1, group2_b_sd=2, group2_c_xx = 14, stringsAsFactors = FALSE) %>%
gather(key, val) %>%
separate(key, c('group_name', 'subgroup_name'), sep = '(?<=[a-z])_') %>%
spread(subgroup_name, val)

Sum of two Columns of Data Frame with NA Values

I have a data frame with some NA values. I need the sum of two of the columns. If a value is NA, I need to treat it as zero.
a b c d
1 2 3 4
5 NA 7 8
Column e should be the sum of b and c:
e
5
7
I have tried a lot of things, and done two dozen searches with no luck. It seems like a simple problem. Any help would be appreciated!
dat$e <- rowSums(dat[,c("b", "c")], na.rm=TRUE)
dat
# a b c d e
# 1 1 2 3 4 5
# 2 5 NA 7 8 7
dplyr solution, taken from here:
library(dplyr)
dat %>%
rowwise() %>%
mutate(e = sum(b, c, na.rm = TRUE))
Here is another solution, with concatenated ifelse():
# Replace a missing b or c by 0 before adding, so an NA in either column
# (or in both) is treated as zero instead of turning the sum into NA.
dat$e <- ifelse(is.na(dat$b), 0, dat$b) + ifelse(is.na(dat$c), 0, dat$c)
# a b c d e
#1 1 2 3 4 5
#2 5 NA 7 8 7
Edit: here is another solution that uses with(), as suggested by @kasterma in the comments. It is much more readable and straightforward:
# with() lets b and c be referenced directly; each NA is replaced by 0 before
# adding, which also covers the case where only c is missing.
dat$e <- with(dat, ifelse(is.na(b), 0, b) + ifelse(is.na(c), 0, c))
If you want to keep NA when both columns are NA, you can use:
Data, sample:
dt <- data.table(x = sample(c(NA, 1, 2, 3), 100, replace = T), y = sample(c(NA, 1, 2, 3), 100, replace = T))
Solution:
dt[, z := ifelse(is.na(x) & is.na(y), NA_real_, rowSums(.SD, na.rm = T)), .SDcols = c("x", "y")]
(the data.table way)
I hope that it may help you
In some cases you have a few columns that are not numeric. This approach works in both situations.
Note: c_across() requires dplyr version 1.0.0 or later.
df <- data.frame(
TEXT = c("text1", "text2"), a = c(1,5), b = c(2, NA), c = c(3,7), d = c(4,8))
df2 <- df %>%
rowwise() %>%
mutate(e = sum(c_across(a:d), na.rm = TRUE))
# A tibble: 2 x 6
# Rowwise:
# TEXT a b c d e
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 text1 1 2 3 4 10
# 2 text2 5 NA 7 8 20

Resources