write function with dplyr - r

I have this function
# Flag rows where column `var1` is strictly greater than column `var2`.
#
# `var1` and `var2` are column selectors given as strings. The data frame `df`
# is not an argument: it is looked up outside the function. A numeric column
# `RD` (1 where var1 > var2, 0 otherwise) is added to a local copy, which is
# returned.
var_sup <- function(var1, var2) {
  # Comparing two one-column data frames yields a one-column logical matrix;
  # as.numeric() flattens it to a plain 0/1 vector.
  is_greater <- df[var1] > df[var2]
  df$RD <- as.numeric(is_greater)
  df
}
I want to rewrite it with dplyr so that I can call it with bare (unquoted) column names, like this:
var_sup(num,num2)
# Attempted dplyr version of var_sup(): takes bare (unquoted) column names and
# adds a 0/1 column RD to the data frame `df` found outside the function.
compare_sup <- function (var1,var2) {
# capture the argument without evaluating it, then convert it to a string
# NOTE(review): quo_name() returns a character string, so var1/var2 hold the
# column names as text rather than quosures -- presumably the cause of the
# reported error; confirm against the actual message.
var1 <- quo_name(enquo(var1))
var2 <- quo_name(enquo(var2))
# construct the expression: unquote both values inside the comparison
df %>%
mutate(RD = ifelse(!!var1 > !!var2 ,1,0))
}
I tried that, but I get an error.
thank you

The following works for me:
# Flag rows where one column is strictly greater than another.
#
# Arguments:
#   var1, var2  Bare (unquoted) column names of `data`.
#   data        Data frame to work on. Defaults to `mtcars`, so existing
#               two-argument calls behave as before.
# Returns `data` with an extra numeric column `RD`: 1 where var1 > var2,
# 0 otherwise.
compare_sup <- function(var1, var2, data = mtcars) {
  # capture the arguments without evaluating them
  var1 <- dplyr::enquo(var1)
  var2 <- dplyr::enquo(var2)
  # Namespace-qualified calls replace require(tidyverse): require() only
  # returns FALSE when a package is missing and attaches packages as a side
  # effect, whereas dplyr:: fails loudly and leaves the search path alone.
  # Each unquoted operand is parenthesised so `!!` applies to it alone.
  dplyr::mutate(data, RD = ifelse((!!var1) > (!!var2), 1, 0))
}
compare_sup(drat, wt) %>% head
# mpg cyl disp hp drat wt qsec vs am gear carb RD
#1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 1
#2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 1
#3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 1
#4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 0
#5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 0
#6 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 0
I basically removed the quo_name() from the function (and used mtcars as data set).

Related

How do I create a new column indicating whether certain other columns contain a given value?

I'd like a new column in a data.frame to indicate whether, for each row, the number "2" appears in certain other columns. Here's a simple version that works for a small data.frame:
# Toy data: two "mycol" columns plus one unrelated column.
df <- data.frame(
  mycol.1 = 1:5,
  mycol.2 = 5:1,
  other.col = -2:2
)
# TRUE for rows where either mycol column equals 2.
df$mycols.contain.two <- with(df, mycol.1 == 2 | mycol.2 == 2)
df
mycol.1 mycol.2 other.col mycols.contain.two
1 1 5 -2 FALSE
2 2 4 -1 TRUE
3 3 3 0 FALSE
4 4 2 1 TRUE
5 5 1 2 FALSE
Now suppose the data.frame has 50 columns, and I want the new column to indicate whether any of the columns beginning with "mycol" contain a "2" in each row, without having to use the "|" symbol 49 times. I assume there's an elegant dplyr answer using starts_with(), but I can't figure out the syntax.
You could do:
df <- data.frame(mycol.1 = 1:5, mycol.2 = 5:1, other.col = -2:2)
# Only look at the columns whose names start with "mycol"; testing every
# column would also pick up the 2 in other.col (row 5).
# The result is kept as a logical vector rather than the strings
# "TRUE"/"FALSE", so it can be used directly for filtering.
df$TYPE <- rowSums(
  df[startsWith(names(df), "mycol")] == 2,
  na.rm = TRUE
) > 0
# > df
# mycol.1 mycol.2 other.col TYPE
# 1 1 5 -2 FALSE
# 2 2 4 -1 TRUE
# 3 3 3 0 FALSE
# 4 4 2 1 TRUE
# 5 5 1 2 FALSE
You can achieve it by indexing. Let's take the mtcars data.
head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
After that, we can index any column. Say we are interested in columns 8 to 11,
# Columns 8 to 11 are vs, am, gear and carb: count the cells equal to 2 in
# each row and flag the rows with at least one match.
mtcars$new <- rowSums(mtcars[, 8:11] == 2) >= 1
gives,
mpg cyl disp hp drat wt qsec vs am gear carb new
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 FALSE
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 FALSE
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 FALSE
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 FALSE
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 TRUE
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 FALSE
>
You could use a simple apply() loop:
df <- data.frame(
  mycol.1 = 1:5,
  mycol.2 = 5:1,
  other.col = -2:2
)
# Walk over the rows (MARGIN = 1) and test whether any cell equals 2.
df$mycols.contain.two <- apply(
  df,
  MARGIN = 1,
  FUN = function(row_values) any(row_values == 2)
)
or if you want to check only the first 3 columns:
df <- data.frame(
  mycol.1 = 1:5,
  mycol.2 = 5:1,
  other.col = -2:2
)
# Same row-wise check, limited to the first three entries of each row.
df$mycols.contain.two <- apply(
  df,
  MARGIN = 1,
  FUN = function(row_values) any(row_values[1:3] == 2)
)

mutate with case_when - multiple LHS/RHS OR evaluations

I'm not sure of the best way to ask this question.
I would like to mutate using case_when (or if_else if that works better) to examine if a value exists in any of a range of columns.
E.g. in mtcars I would like to check if any of the columns vs, am, gear or carb contained 1 or 2 and set a new variable newVar to 1 if they do. I could do the following:
mtcars %>%
mutate(newVar = case_when(vs %in% c(1, 2) | am %in% c(1, 2) | gear %in% c(1, 2) | carb %in% c(1, 2) ~ 1,
TRUE ~ 0))
Is there a prettier way to do this? I want to check across 10+ columns so it gets long. Something like:
mtcars %>%
mutate(newVar = case_when(c(vs, am, gear, carb) %in% c(1, 2) ~ 1,
TRUE ~ 0))
I think base R can work well here. Select the columns you want to check and take the row-wise sum of the logical matrix to calculate newVar.
df <- mtcars
cols <- c("vs", "am", "gear", "carb")
df$newVar <- +(rowSums(df[cols] == 1 | df[cols] == 2) > 0)
df
# mpg cyl disp hp drat wt qsec vs am gear carb newVar
#Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 1
#Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 1
#Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 1
#Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 1
#Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 1
#Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 1
#Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 0
#Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 1
#Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 1
#Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 1
#....
We can also use apply for row-wise manipulation
df$newVar <- +(apply(df[cols] == 1 | df[cols] == 2, 1, any))
We can use tidyverse option to create the column
library(dplyr)
library(purrr)
mtcars %>%
mutate(newVar = select(., vs:carb) %>%
map(~ .x %in% 1:2) %>%
reduce(`|`) %>%
as.integer)
#. mpg cyl disp hp drat wt qsec vs am gear carb newVar
#1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 1
#2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 1
#3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 1
#4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 1
#5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 1
#6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 1
#7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 0
#8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 1
# ...
Or with base R
nm1 <- c("vs", "am", "gear", "carb")
mtcars$newVar <- +(Reduce(`|`, lapply(mtcars[nm1], `%in%`, 1:2)))

Mutate with tidy evaluation

Using mutate_() I used to provide a list of new variables and the logic needed to create them.
library(dplyr)
library(rlang)
list_new_var <-
list(new_1 = "am * mpg",
new_2 = "cyl + disp")
mtcars %>%
mutate_(.dots = list_new_var) %>%
head()
Now I want to transition to using tidy evaluation. I am in the process of understanding the new methods.
How can I make this work? Will a function generally be recommended to solve this type of situation?
# Exploratory attempt at replacing mutate_(.dots = ...): applies several rlang
# helpers to `new` in turn and passes only the last result to mutate().
#   data  data frame to add columns to
#   new   list describing the new columns (called below with a named list of
#         strings such as "am * mpg")
f_mutate <- function(data, new) {
# a to e are assigned but never used further down
a <- expr(new)
b <- eval(new)
c <- syms(new)
# NOTE(review): UQ(), UQS() and UQE() look like the old function-style
# spellings of the unquoting operators; presumably not usable in current
# rlang -- confirm against the installed version.
d <- UQ(syms(new))
e <- UQS(syms(new))
f <- UQE(syms(new))
# only `f` reaches mutate(), as a single unnamed argument
data %>%
mutate(f) %>%
head()
}
f_mutate(mtcars, new = list_new_var)
One option would be to create a list with quote to return as an argument without evaluation
list_new_var <-list(
new_1 = quote(am * mpg),
new_2 = quote(cyl + disp)
)
and within the f_mutate, use the !!! to evaluate
# Add the columns described by `new` to `data`.
#
# `new` is a named list of unevaluated expressions (for example built with
# quote()). `!!!` splices the list into mutate(), so each element becomes one
# new column named after its list name.
f_mutate <- function(data, new) {
  mutate(data, !!!new)
}
run the function
f_mutate(mtcars, new = list_new_var) %>%
head
# mpg cyl disp hp drat wt qsec vs am gear carb new_1 new_2
#1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 21.0 166
#2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 21.0 166
#3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 22.8 112
#4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 0.0 264
#5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 0.0 368
#6 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 0.0 231
I don't think you need a function for this. I think you just need the following
library(dplyr)
mtcars %>%
as_tibble() %>%
mutate(new_column1 = am * mpg,
new_column2 = cyl + disp) %>%
head()
Check out the first example here.

Programming dplyr operations

Any idea how I can manipulate dplyr variables programmatically?
This works:
out = "new_var"
mtcars %>%
mutate(!!out := mpg/carb)
But I really need to be able to adjust the variables in the division. Thought I could do it like this:
out = "new_var"
numer = "mpg"
denom = "carb"
mtcars %>%
mutate(!!out := !! quo(numer/denom))
but no dice:
Error in mutate_impl(.data, dots) :
Evaluation error: non-numeric argument to binary operator.
result should look like:
mpg cyl disp hp drat wt qsec vs am gear carb new_var
1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 5.250000
2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 5.250000
3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 22.800000
4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 21.400000
5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 9.350000
6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 18.100000
7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 3.575000
8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 12.200000
...
Any idea how this works?
SOLVED -------------------------------------------------
# Add a ratio column to a data frame.
#
#   df       data frame to extend
#   col      bare column name used as numerator
#   col2     bare column name used as denominator
#   new_col  bare name for the column that receives col / col2
# Returns `df` with the extra column appended.
myFunction <- function(df, col, col2, new_col) {
  numerator <- enquo(col)
  denominator <- enquo(col2)
  # The output name must be a string to sit on the left of `:=`.
  out_name <- quo_name(enquo(new_col))
  mutate(df, !!out_name := (!!numerator) / (!!denominator))
}
myFunction(mtcars, mpg, wt, mpg_based_new_col)
If you want to make a symbol from a character value, you can use the rlang::sym() function (or just the base as.name() function). For example
out = "new_var"
numer = rlang::sym("mpg")
denom = rlang::sym("carb")
library(tidyverse)
mtcars %>%
mutate(!!out := (!!numer)/(!!denom))
Note how we escape each variable separately rather than the entire expression.

Create new data.table column by applying function on other columns

I want to create a new column in a data.table based on the values of other columns. Using mtcars as an example:
> library(data.table)
> dt <- as.data.table(mtcars)
> head(dt[, newval := cyl + gear])
mpg cyl disp hp drat wt qsec vs am gear carb newval
1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 10
2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 10
3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 8
4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 9
5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 11
6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 9
which works fine, but for even slightly more complex function, I get warning messages:
# Multiply a and b when a is 4 or 6, otherwise divide b by a.
# NOTE: if() acts on a single condition value, so this is written for scalar
# a and b. Called with whole columns, the branch is chosen from the first
# element only (see the warning in the output that follows).
simple_func <- function(a, b){
if(a %in% c(4,6) ){
return(a*b)
}else{
return(b/a)
}
}
head(dt[, newval := simple_func(cyl, disp)])
returns:
> head(dt[, newval := simple_func(cyl, disp)])
mpg cyl disp hp drat wt qsec vs am gear carb newval
1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 960
2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 960
3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 432
4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 1548
5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 2880
6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 1350
Warning message:
In if (a %in% c(4, 6)) { :
the condition has length > 1 and only the first element will be used
the value for row 5 (cyl == 8) is clearly incorrect and expected value of newval is 45.
The reason is that the function is not evaluated one row at a time but once for the entire column. Therefore, if the condition is met for the first row (dt$cyl[1], dt$disp[1]), all other rows have the same formula applied to them.
How do I get around this? I tried using .SDcols but didn't get it right and got other errors instead.
Use ifelse
# Vectorised version: for each element, return a * b where a is 4 or 6 and
# b / a everywhere else.
simple_func <- function(a, b) {
  use_product <- a %in% c(4, 6)
  ifelse(use_product, a * b, b / a)
}

Resources