If/else condition in dplyr 0.7 function - r

I'd like to make a simple if/else condition in a dplyr function. I've looked at some helpful posts (e.g., How to parametrize function calls in dplyr 0.7?), but am still running into trouble.
Below is a toy example that works when I call the function without the grouping variable. The function then fails with the grouping variable.
# example dataset
test <- tibble(
A = c(1:5,1:5),
B = c(1,2,1,2,3,3,3,3,3,3),
C = c(1,1,1,1,2,3,4,5,4,3)
)
# begin function, set default for group var to NULL.
prop_tab <- function(df, column, group = NULL) {
col_name <- enquo(column)
group_name <- enquo(group)
# if group_by var is NOT null, then...
if(!is.null(group)) {
temp <- df %>%
select(!!col_name, !!group_name) %>%
group_by(!!group_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
} else {
# if group_by var is null, then...
temp <- df %>%
select(!!col_name) %>%
group_by(col_name = !!col_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
}
temp
}
test %>% prop_tab(column = C) # works
test %>% prop_tab(column = A, group = B) # fails
# Error in prop_tab(., column = A, group = B) : object 'B' not found

The problem here is that when you supply unquoted arguments, is.null doesn't know what to do with it. So this code tries to check whether object B is null and errors because B does not exist in that scope. Instead, you can use missing() to check whether an argument was supplied to the function, like so. There may be a cleaner way but this at least works, as you can see at the bottom.
library(tidyverse)
test <- tibble(
A = c(1:5,1:5),
B = c(1,2,1,2,3,3,3,3,3,3),
C = c(1,1,1,1,2,3,4,5,4,3)
)
# begin function, set default for group var to NULL.
prop_tab <- function(df, column, group) {
col_name <- enquo(column)
group_name <- enquo(group)
# if group_by var is not supplied, then:
if(!missing(group)) {
temp <- df %>%
select(!!col_name, !!group_name) %>%
group_by(!!group_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
} else {
# if group_by var is null, then...
temp <- df %>%
select(!!col_name) %>%
group_by(col_name = !!col_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
}
temp
}
test %>% prop_tab(column = C) # works
#> # A tibble: 5 x 2
#> col_name Percentages
#> <dbl> <dbl>
#> 1 1 40
#> 2 2 10
#> 3 3 20
#> 4 4 20
#> 5 5 10
test %>% prop_tab(column = A, group = B)
#> # A tibble: 3 x 2
#> B Percentages
#> <dbl> <dbl>
#> 1 1 20
#> 2 2 20
#> 3 3 60
Created on 2018-06-29 by the reprex package (v0.2.0).

You can use missing instead of is.null, so your argument won't be evaluated (that's what cause the error):
prop_tab <- function(df, column, group = NULL) {
col_name <- enquo(column)
group_name <- enquo(group)
# if group_by var is NOT null, then...
if(!missing(group)) {
temp <- df %>%
select(!!col_name, !!group_name) %>%
group_by(!!group_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
} else {
# if group_by var is null, then...
temp <- df %>%
select(!!col_name) %>%
group_by(col_name = !!col_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
}
temp
}
test %>% prop_tab(column = C)
# example dataset
# # A tibble: 5 x 2
# col_name Percentages
# <dbl> <dbl>
# 1 1 40
# 2 2 10
# 3 3 20
# 4 4 20
# 5 5 10
test %>% prop_tab(column = A, group = B)
# # A tibble: 3 x 2
# B Percentages
# <dbl> <dbl>
# 1 1 20
# 2 2 20
# 3 3 60
You can also use length(substitute(group)) instead of !missing(group), it'll be more robust as it won't fail in the unlikely case where someone fills explicitely the group argument with NULL (the former option will crash in this case).

One option would be to check on the "group_name" instead of the 'group'
prop_tab <- function(df, column, group = NULL) {
col_name <- enquo(column)
group_name <- enquo(group)
# if group_by var is NOT null, then...
if(as.character(group_name)[2] != "NULL") {
temp <- df %>%
select(!!col_name, !!group_name) %>%
group_by(!!group_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
} else {
# if group_by var is null, then...
temp <- df %>%
select(!!col_name) %>%
group_by(col_name = !!col_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
}
temp
}
-checking
prop_tab(test, column = C, group = B)
# A tibble: 3 x 2
#< B Percentages
# <dbl> <dbl>
#1 1 20
#2 2 20
#3 3 60
prop_tab(test, column = C)
# A tibble: 5 x 2
# col_name Percentages
# <dbl> <dbl>
#1 1 40
#2 2 10
#3 3 20
#4 4 20
#5 5 10

Related

cleaning time series based on previous timepoints

In my clincal dataset, I have a unique identifors by patient ID and time, and then the variable of interest that look like so:
patientid <- c(100,100,100,101,101,101,102,102,102,104,104,104)
time <- c(1,2,3,1,2,3,1,2,3,1,2,3)
V1 <- c(1,1,NA,2,1,NA,1,3,NA,NA,1,NA)
Data <- data.frame(patientid=patientid, time=time, V1=V1)
Timepoint 3 is blank for each patient. I want to fill in timepoint three for each patient based on the following criteria. If at either time point 1 or 2 the variable is coded as a 2 or 3 then time point 3 should be coded as a 2. If at both time point 1 and 2, variable is coded as a 1 then time point point 3 should be coded as a one. If there is missing data at time point 1 or 2 then time point three should be missing. So for the toy expample it should look like this:
patientid <- c(100,100,100,101,101,101,102,102,102,104,104,104)
time <- c(1,2,3,1,2,3,1,2,3,1,2,3)
V1 <- c(1,1,1,2,1,2,1,3,2,NA,1,NA)
Data <- data.frame(patientid=patientid, time=time, V1=V1)
You can use pivot_wider from tidyr to convert your data to wide format and you can mutate the 3 column with your logic using a function with the help of map from purrr package. You can return back to the original shape of the data frame using pivot-longer
library(tidyverse)
patientid <- c(100,100,100,101,101,101,102,102,102,104,104,104)
time <- c(1,2,3,1,2,3,1,2,3,1,2,3)
V1 <- c(1,1,NA,2,1,NA,1,3,NA,NA,1,NA)
df <- data.frame(patientid=patientid, time=time, V1=V1)
flag <- function(t1,t2){
if(is.na(t1)|is.na(t2)){
NA
} else if(t1 %in% c(2,3)|t2 %in% c(2,3)){
2
} else if(t1 == 1|t2 == 1){
1
}else {
NA
}
}
df %>%
as_tibble() %>%
pivot_wider(names_from = time, values_from = V1) %>%
mutate(`3` = pmap_dbl(list(`1`,`2`),flag )) %>%
pivot_longer(-1, names_to = "time", values_to = "V1")
#> # A tibble: 12 x 3
#> patientid time V1
#> <dbl> <chr> <dbl>
#> 1 100 1 1
#> 2 100 2 1
#> 3 100 3 1
#> 4 101 1 2
#> 5 101 2 1
#> 6 101 3 2
#> 7 102 1 1
#> 8 102 2 3
#> 9 102 3 2
#> 10 104 1 NA
#> 11 104 2 1
#> 12 104 3 NA
Created on 2021-01-29 by the reprex package (v0.3.0)
This should do it!
library(tidyverse)
patientid <- c(100,100,100,101,101,101,102,102,102,104,104,104)
time <- c(1,2,3,1,2,3,1,2,3,1,2,3)
V1 <- c(1,1,NA,2,1,NA,1,3,NA,NA,1,NA)
Data <- data.frame(patientid=patientid, time=time, V1=V1)
Data <- Data %>% pivot_wider(names_from = "time", values_from = "V1",
names_prefix = "timepoint_")
timepoint_impute <- function(x,y) {
if(is.na(x) | is.na(y)) {
return(NA)
} else if(2 %in% c(x,y) | 3 %in% c(x,y)) {
return(2)
} else if(x==1 & y==1) {
return(1)
}
}
Data$timepoint_3 <- map2(.x = Data$timepoint_1, .y = Data$timepoint_2,
.f = timepoint_impute)
You end up with wide data format but if you need long data format you can just use tidyr::pivot_longer. This approach writes a custom function to handle the logic you need.

Reshape dataframe so that matching family members have their own column

I have a dataframe...
df <- tibble(
id = 1:5,
family = c("a","a","b","b","c"),
twin = c(1,2,1,2,1),
datacol1 = 11:15,
datacol2 = 21:25
)
For every twin pair (members of the same family) I need to have a second 'datacol' with the other twins' data. This should only happen for matching twins, so the 5th row (from family "c") should have duplicate columns that are empty.
Ideally, by the end the data would look like the following...
df <- tibble(
id = 1:5,
family = c("a","a","b","b","c"),
twin = c(1,2,1,2,1),
datacol1 = 11:15,
datacol1.b = c(12,11,14,13,NA),
datacol2 = 21:25,
datacol2.b = c(22,21,24,23,NA)
)
I have added an image to help illustrate what I am trying to get to.
I would like to be able to do this for all columns or for selected columns and preferably using tidyverse.
We can also use mutate_at
library(dplyr)
df %>%
group_by(family) %>%
mutate_at(vars(starts_with('datacol')), list(`2` =
~if(n() == 1) NA_integer_ else rev(.)))
# A tibble: 5 x 7
# Groups: family [3]
# id family twin datacol1 datacol2 datacol1_2 datacol2_2
# <int> <chr> <dbl> <int> <int> <int> <int>
#1 1 a 1 11 21 12 22
#2 2 a 2 12 22 11 21
#3 3 b 1 13 23 14 24
#4 4 b 2 14 24 13 23
#5 5 c 1 15 25 NA NA
cols = c("datacol1", "datacol2")
df %>%
group_by(family) %>%
mutate_at(vars(cols), function(x){
if (n() == 2){
rev(x)
} else {
NA
}
}) %>%
ungroup() %>%
select(cols) %>%
rename_all(funs(paste0(., ".b"))) %>%
cbind(df, .)
Base R
cols = c("datacol1", "datacol2")
do.call(rbind, lapply(split(df, df$family), function(x){
cbind(x, setNames(lapply(x[cols], function(y) {
if (length(y) == 2) {
rev(y)
} else {
NA
}}),
paste0(cols, ".b")))
}))

map over columns and apply custom function

Missing something small here and struggling to pass columns to function. I just want to map (or lapply) over columns and perform a custom function on each of the columns. Minimal example here:
library(tidyverse)
set.seed(10)
df <- data.frame(id = c(1,1,1,2,3,3,3,3),
r_r1 = sample(c(0,1), 8, replace = T),
r_r2 = sample(c(0,1), 8, replace = T),
r_r3 = sample(c(0,1), 8, replace = T))
df
# id r_r1 r_r2 r_r3
# 1 1 0 0 1
# 2 1 0 0 1
# 3 1 1 0 1
# 4 2 1 1 0
# 5 3 1 0 0
# 6 3 0 0 1
# 7 3 1 1 1
# 8 3 1 0 0
a function just to filter and counts unique ids remaining in the dataset:
cnt_un <- function(var) {
df %>%
filter({{var}} == 1) %>%
group_by({{var}}) %>%
summarise(n_uniq = n_distinct(id)) %>%
ungroup()
}
it works outside of map
cnt_un(r_r1)
# A tibble: 1 x 2
r_r1 n_uniq
<dbl> <int>
1 1 3
I want to apply the function over all r_r columns to get something like:
df2
# y n_uniq
# 1 r_r1 3
# 2 r_r2 2
# 3 r_r3 2
I thought the following would work but doesnt
map(dplyr::select(df, matches("r_r")), ~ cnt_un(.x))
any suggestions? thanks
I'm not sure if there's a direct tidyeval way to do this with something like map. The issue you're running into is that in calling map(df, *whatever_function*), the function is being called on each column of df as a vector, whereas your function expects a bare column name in the tidyeval style. To verify that:
map(df, class)
will return "numeric" for each column.
An alternative is to iterate over column names as strings, and convert those to symbols; this takes just one additional line in the function.
library(dplyr)
library(tidyr)
library(purrr)
cnt_un_name <- function(varname) {
var <- ensym(varname)
df %>%
filter({{var}} == 1) %>%
group_by({{var}}) %>%
summarise(n_uniq = n_distinct(id)) %>%
ungroup()
}
Calling the function is a little awkward because it keeps only the relevant column names (calling on "r_r1" gets columns "r_r1" and "n_uniq", etc). One way is to get the vector of column names you want, name it so you can add an ID column in map_dfr, and drop the extra columns, since they'll be mostly NA.
grep("^r_r\\d+", names(df), value = TRUE) %>%
set_names() %>%
map_dfr(cnt_un_name, .id = "y") %>%
select(y, n_uniq)
#> # A tibble: 3 x 2
#> y n_uniq
#> <chr> <int>
#> 1 r_r1 3
#> 2 r_r2 2
#> 3 r_r3 2
A better way is to call the function, then bind after reshaping.
grep("^r_r\\d+", names(df), value = TRUE) %>%
map(cnt_un_name) %>%
map_dfr(pivot_longer, 1, names_to = "y") %>%
select(y, n_uniq)
# same output as above
Alternatively (and maybe better/more scaleable) would be to do the column renaming inside the function definition.
Here's a base R solution that uses lapply. The tricky bit is that your function isn't actually running on single columns; it's using id, too, so you can't use canned functions that iterate column-wise.
do.call(rbind, lapply(grep("r_r", colnames(df), value = TRUE), function(i) {
X <- subset(df, df[,i] == 1)
row <- data.frame(y = i, n_uniq = length(unique(X$id)), stringsAsFactors = FALSE)
}))
y n_uniq
1 r_r1 2
2 r_r2 3
3 r_r3 2
Here is another solution. I changed the syntax of your function. Now you supply the pattern of the columns you want to select.
cnt_un <- function(var_pattern) {
df %>%
pivot_longer(cols = contains(var_pattern), values_to = "vals", names_to = "y") %>%
filter(vals == 1) %>%
group_by(y) %>%
summarise(n_uniq = n_distinct(id)) %>%
ungroup()
}
cnt_un("r_r")
#> # A tibble: 3 x 2
#> y n_uniq
#> <chr> <int>
#> 1 r_r1 2
#> 2 r_r2 3
#> 3 r_r3 2

R - dplyr. Functions with variable similar to dataframe columns

I have this case where I am filtering on a dataframe in a function, but the dataframe has the column with a similar name as the variable I want to filter on.
example:
d = tibble(cond = c(1,2), b = c(1,2))
f_ = function(data, cond) {
data = data %>% filter(b == cond)
return(data)
}
f_(d, cond = 2)
# A tibble: 2 x 2
cond b
<dbl> <dbl>
1 1 1
2 2 2
No filtering happens (because here cond is equal to b).
this becomes an issue when I do not control the number of columns in the data but at the minimum I know it has the b column.
We can change the function to evaluate the 'cond' not from the environment
f_ = function(data, cond) {
data %>%
filter(b == !!cond)
}
f_(d, cond = 2)
# A tibble: 1 x 2
# cond b
# <dbl> <dbl>
#1 2 2

Filter Dataframe with Tidy Evaluation

I am handling a large dataset. First, for certain columns (X1, X2, ...), I am trying to identify a range of value (a, b) consists of repeated value (a > n, b > n). Next, I wish to filter row based on the condition which matches respective columns to result given in the previous step.
Here is a reproducible example simulating the scenario I am facing,
library(tidyverse)
set.seed(1122)
vecs <- lapply(X = 1:2, function(x) rep(c(1, 2, 3), times = 10) %>% sample() %>% head(10))
names(vecs) <- paste0("col_", 1:2)
dat <- vecs %>% as.data.frame()
dat
col_1 col_2
1 3 2
2 1 1
3 1 1
4 1 2
5 1 2
6 3 3
7 3 3
8 2 1
9 1 3
10 2 2
I am able to identify the range by the following method,
# Which col has repeated value more than 3 appearances?
more_than_3 <- function(df, var){
var <- rlang::sym(var)
df %>%
group_by(!!var) %>%
summarise(n = n()) %>%
filter(n > 3) %>%
pull(!!var) %>%
range()
}
cols_name <- c("col_1", "col_2")
some_range <- purrr::map(cols_name, more_than_3, df = dat)
names(some_range) <- cols_name
some_range
$col_1
[1] 1 1
$col_2
[1] 2 2
However, to filter out values that fall outside the upper limit, this is what I do.
dat %>%
filter(col_1 <= some_range[["col_1"]][2],
col_2 <= some_range[["col_2"]][2])
col_1 col_2
1 1 1
2 1 1
3 1 2
4 1 2
I believe there must be a more efficient and elegant way of filtering the result based on tidy evaluation. Can someone point me to the right direction?
Many thanks in advance.
First let's try to create a small function that creates a single filter expression for one column. This function will take a symbol and then transform to string but it could be the other way around:
new_my_filter_call_upper <- function(sym, range) {
col_name <- as.character(sym)
col_range <- range[[col_name]]
if (is.null(col_range)) {
stop(sprintf("Can't find column `%s` to compute range", col_name), call. = FALSE)
}
expr(!!sym < !!col_range[[2]])
}
Let's try it:
new_my_filter_call_upper(quote(foobar), some_range)
#> Error: Can't find column `foobar` to compute range
# It works!
new_my_filter_call_upper(quote(col_1), some_range)
#> col_1 < 3
Now we're ready to create a pipeline verbs that will take a data frame and bare column names.
# Probably cleaner to pass range as argument. Prefix with dot to allow
# columns named `range`.
my_filter <- function(.data, ..., .range) {
# ensyms() guarantees there won't be complex expressions
syms <- rlang::ensyms(...)
# Now let's map the function to create many calls:
calls <- purrr::map(syms, new_my_filter_call_upper, range = .range)
# And we're ready to filter with those expressions:
dplyr::filter(.data, !!!calls)
}
Let's try it:
dat %>% my_filter(col_1, col_2, .range = some_range)
#> col_1 col_2 NA.
#> 1 2 1 1
#> 2 2 2 1
We could use map2
library(purrr)
map2(dat, some_range, ~ .x < .y[2]) %>%
reduce(`&`) %>%
dat[.,]
# col_1 col_2
#1 2 2
#2 1 1
#3 1 2
#6 1 1
Or with pmap
pmap(list(dat, some_range %>%
map(2)), `<`) %>%
reduce(`&`) %>%
dat[.,]

Resources