Making tidyeval function inside case_when - r

I have a data set that I like to impute one value among others based on probability distribution of those values. Let make some reproducible example first
library(tidyverse)
library(janitor)
dummy1 <- runif(5000, 0, 1)
dummy11 <- case_when(
dummy1 < 0.776 ~ 1,
dummy1 < 0.776 + 0.124 ~ 2,
TRUE ~ 5)
df1 <- tibble(q1 = dummy11)
here is the output:
df1 %>% tabyl(q1)
q1 n percent
1 3888 0.7776
2 605 0.1210
5 507 0.1014
I used mutate and sample to share value= 5 among value 1 and 2 like this:
df1 %>%
mutate(q1 = case_when(q1 == 5 ~ sample(
2,
length(q1),
prob = c(0.7776, 0.1210),
replace = TRUE
),
TRUE ~ as.integer(q1))
)
and here is the result :
q1 n percent
1 4322 0.8644
2 678 0.1356
This approach seems working, however since I need to apply this for several variables I tried to write a function that working with tidyverse with tidyeval, like this
my_impute <- function(.data, .prob_var, ...) {
.prob_var <- enquo(.prob_var)
.data %>%
sample(2, prob=c(!!.prob_var), replace = TRUE)
}
# running on data
df1 %>%
mutate(q1 = case_when(q1 == 5 ~ !!my_impute(q1),
TRUE ~ as.integer(q1))
)
The error is :
Error in eval_tidy(pair$lhs, env = default_env) : object 'q1' not found

We need the prob values from the 'percent' column generated from tabyl, so the function can be modified to
library(janitor)
library(dplyr)
my_impute <- function(.data, .prob_var, vals, ...) {
.prob_var = enquo(.prob_var)
.prob_vals <- .data %>%
janitor::tabyl(!!.prob_var) %>%
filter(!!.prob_var %in% vals) %>%
pull(percent)
.data %>%
mutate(!! .prob_var := case_when(!! .prob_var == 5 ~
sample(
2,
n(),
prob = .prob_vals,
replace = TRUE
),
TRUE ~ as.integer(q1))
)
}
df1 %>%
my_impute(q1, vals = 1:2) %>%
tabyl(q1)
# q1 n percent
# 1 4285 0.857
# 2 715 0.143

Just to add my two cents, the new version of rlang allows to replace the quasiquotation process: enquo() + !! and you can use curly-curly to embrace variables: The function would be like:
my_impute <- function(.data, .prob_var, vals, ...) {
#.prob_var = enquo(.prob_var)
# commented out since it is no longer needed
.prob_vals <- .data %>%
janitor::tabyl({{.prob_var}}) %>%
filter({{.prob_var}} %in% {{vals}}) %>%
pull(percent)
.data %>%
mutate( {{.prob_var}} := case_when( {{.prob_var}} == 5 ~
sample(
2,
n(),
prob = {{.prob_vals}},
replace = TRUE
),
TRUE ~ as.integer(q1))
)
}

Related

Is it possible to conditionally format rows while using as_grouped_data with flextable?

I'm trying to conditionally format rows after calling as_grouped_data basing the conditions on the grouped rows:
library(tidyverse)
library(flextable)
df <- tibble(vStat = c(rep("Average Degree", 3), rep("Average Weight", 3)),
val = c(1.22222, 1.33333, 1.44444, 1.55555, 1.66666, 1.77777))
flextable(df %>%
as_grouped_data(groups="vStat")) %>%
colformat_double(i = ~ vStat=="Average Degree", digits=1) %>% # not working
colformat_double(i = ~ vStat=="Average Weight", digits=3) %>% # not working
autofit()
I understand that the above doesn't work because the condition in colformat_double only applies to rows where val is now NA:
df %>%
as_grouped_data(groups="vStat")
> vStat val
> 1 Average Degree NA
> 3 <NA> 1.22222
> 4 <NA> 1.33333
> 5 <NA> 1.44444
> 2 Average Weight NA
> 6 <NA> 1.55555
> 7 <NA> 1.66666
> 8 <NA> 1.77777
It doesn't seem to work like grouped data normally would when calling first:
flextable(df %>%
as_grouped_data(groups="vStat")) %>%
colformat_double(i = ~ first(vStat=="Average Degree"), digits=1) %>%
colformat_double(i = ~ first(vStat=="Average Weight"), digits=3) %>%
autofit()
> Error in get_rows_id(x[[part]], i) : invalid row selection: length(i) [1] != nrow(dataset) [8]
Rounding in the dataset before grouping doesn't get me what I want either, with the number of digits still going out to the highest condition and getting filled in with zeros:
flextable(df %>%
mutate(val = case_when(vStat=="Average Degree" ~ round(val, 1),
vStat=="Average Weight" ~ round(val, 3))) %>%
as_grouped_data(groups="vStat")) %>%
autofit()
I'd really like to not have to specify individual row numbers in colformat_double in a table with 50 rows when my data change every day.
We could create an index column or duplicate the same column 'vStat' with another name and do the condition on the index or use the same code on the other column and remove it later
library(dplyr)
library(flextable)
flextable(df %>%
mutate(ind = match(vStat, unique(vStat))) %>%
as_grouped_data(groups="vStat")) %>%
colformat_double(i = ~ ind == 1, digits=1) %>% # not working
colformat_double(i = ~ ind == 2, digits=3) %>% # not working
void(j = ~ind) %>%
compose( j = 3, value = as_paragraph(""), part = "header") %>%
autofit()
-output

Unexpected behavior with case_when and is.na

I want to change all NA values in a column to 0 and all other values to 1. However, I can't get the combination of case_when and is.na to work.
# Create dataframe
a <- c(rep(NA,9), 2, rep(NA, 10))
b <- c(rep(NA,9), "test", rep(NA, 10))
df <- data.frame(a,b, stringsAsFactors = F)
# Create new column (c), where all NA values in (a) are transformed to 0 and other values are transformed to 1
df <- df %>%
mutate(
c = case_when(
a == is.na(.$a) ~ 0,
FALSE ~ 1
)
)
I expect column (c) to indicate all 0 values and one 1 value, but its all 0's.
It does work when I use an if_else statement with is.na, like:
df <- df %>%
mutate(
c = if_else(is.na(a), 0, 1))
)
What is going on here?
You should be doing this instead:
df %>%
mutate(
c = case_when(
is.na(a) ~ 0,
TRUE ~ 1
)
)

Apply function over data frame rows

I'm trying to apply a function over the rows of a data frame and return a value based on the value of each element in a column. I'd prefer to pass the whole dataframe instead of naming each variable as the actual code has many variables - this is a simple example.
I've tried purrr map_dbl and rowwise but can't get either to work. Any suggestions please?
#sample df
df <- data.frame(Y=c("A","B","B","A","B"),
X=c(1,5,8,23,31))
#required result
Res <- data.frame(Y=c("A","B","B","A","B"),
X=c(1,5,8,23,31),
NewVal=c(10,500,800,230,3100)
)
#use mutate and map or rowwise etc
Res <- df %>%
mutate(NewVal=map_dbl(.x=.,.f=FnAdd(.)))
Res <- df %>%
rowwise() %>%
mutate(NewVal=FnAdd(.))
#sample fn
FnAdd <- function(Data){
if(Data$Y=="A"){
X=Data$X*10
}
if(Data$Y=="B"){
X=Data$X*100
}
return(X)
}
If there are multiple values, it is better to have a key/val dataset, join and then do the mulitiplication
keyVal <- data.frame(Y = c("A", "B"), NewVal = c(10, 100))
df %>%
left_join(keyVal) %>%
mutate(NewVal = X*NewVal)
# Y X NewVal
#1 A 1 10
#2 B 5 500
#3 B 8 800
#4 A 23 230
#5 B 31 3100
It is not clear how many unique values are there in the actual dataset 'Y' column. If we have only a few values, then case_when can be used
FnAdd <- function(Data){
Data %>%
mutate(NewVal = case_when(Y == "A" ~ X * 10,
Y == "B" ~ X *100,
TRUE ~ X))
}
FnAdd(df)
# Y X NewVal
#1 A 1 10
#2 B 5 500
#3 B 8 800
#4 A 23 230
#5 B 31 3100
You were originally looking for a solution using dplyr's rowwise() function, so here is that solution. The nice thing about this approach is that you don't need to create a separate function.
Here's the version using if()
df %>%
rowwise() %>%
mutate(NewVal = ifelse(Y == "A", X * 10,
ifelse(Y == "B", X * 100)))
and here's the version using case_when:
df %>%
rowwise() %>%
mutate(NewVal = case_when(Y == "A" ~ X * 10,
Y == "B" ~ X * 100))

Grouped operation on all groups relative to "baseline" group, with multiple observations

Starting with data containing multiple observations for each group, like this:
set.seed(1)
my.df <- data.frame(
timepoint = rep(c(0, 1, 2), each= 3),
counts = round(rnorm(9, 50, 10), 0)
)
> my.df
timepoint counts
1 0 44
2 0 52
3 0 42
4 1 66
5 1 53
6 1 42
7 2 55
8 2 57
9 2 56
To perform a summary calculation at each timepoint relative to timepoint == 0, for each group I need to pass a vector of counts for timepoint == 0 and a vector of counts for the group (e.g. timepoint == 0) to an arbitrary function, e.g.
NonsenseFunction <- function(x, y){
(mean(x) - mean(y)) / (1 - mean(y))
}
I can get the required output from this table, either with dplyr:
library(dplyr)
my.df %>%
group_by(timepoint) %>%
mutate(rep = paste0("r", 1:n())) %>%
left_join(x = ., y = filter(., timepoint == 0), by = "rep") %>%
group_by(timepoint.x) %>%
summarise(result = NonsenseFunction(counts.x, counts.y))
or data.table:
library(data.table)
my.dt <- data.table(my.df)
my.dt[, rep := paste0("r", 1:length(counts)), by = timepoint]
merge(my.dt, my.dt[timepoint == 0], by = "rep", all = TRUE)[
, NonsenseFunction(counts.x, counts.y), by = timepoint.x]
This only works if the number of observations between groups is the same. Anyway, the observations aren't matched, so using the temporary rep variable seems hacky.
For a more general case, where I need to pass vectors of the baseline values and the group's values to an arbitrary (more complicated) function, is there an idiomatic data.table or dplyr way of doing so with a grouped operation for all groups?
Here's the straightforward data.table approach:
my.dt[, f(counts, my.dt[timepoint==0, counts]), by=timepoint]
This probably grabs my.dt[timepoint==0, counts] again and again, for each group. You could instead save that value ahead of time:
v = my.dt[timepoint==0, counts]
my.dt[, f(counts, v), by=timepoint]
... or if you don't want to add v to the environment, maybe
with(list(v = my.dt[timepoint==0, counts]),
my.dt[, f(counts, v), by=timepoint]
)
You could give the second argument to use the vector from your group of interest as a constant.
my.df %>%
group_by(timepoint) %>%
mutate(response = NonsenseFunction(counts, my.df$counts[my.df$timepoint == 0]))
Or if you want to make it beforehand:
constant = = my.df$counts[my.df$timepoint == 0]
my.df %>%
group_by(timepoint) %>%
mutate(response = NonsenseFunction(counts, constant))
You can try,
library(dplyr)
my.df %>%
mutate(new = mean(counts[timepoint == 0])) %>%
group_by(timepoint) %>%
summarise(result = NonsenseFunction(counts, new))
# A tibble: 3 × 2
# timepoint result
# <dbl> <dbl>
#1 0 0.0000000
#2 1 0.1398601
#3 2 0.2097902

Hot deck imputation in dplyr

I'm trying to do a hot deck imputation in R with the dplyr package. I have non-finite values that I would like to replace with a random value drawn from within the same group.
myData <- data.frame(value = sample(c(Inf, NaN, 1:8), 100, replace=TRUE),
group = sample(letters[1:4], 100, replace=TRUE))
value group
1 4 c
2 6 d
3 Inf c
4 8 c
5 7 a
6 2 b
This code runs but also samples the Inf and NaN values.
myData <- myData %>%
group_by(group) %>%
mutate(imputedvalue = sample(value, n(), replace = TRUE))
table(is.finite(myData$imputedvalue), is.infinite(myData$imputedvalue))
FALSE TRUE
FALSE 16 7
TRUE 77 0
This code doesn't run.
myData <- myData %>%
group_by(group) %>%
mutate(imputedvalue = ifelse(is.finite(value), value,
sample(value, n(), replace = TRUE)))
Error in n() : This function should not be called directly
I feel like there should be a filter() command of some sort, but I don't really see how this should work...
Here is an approach that involves splitting the dataset up first:
# filter non-infinite records
myDataOK <- myData %>%
filter(value %>% is.finite)
# how many replacements are needed?
# sample these, a la #eddi
myDataimputed <- myData %>%
group_by(group) %>%
summarise(n_inf = sum(!is.finite(value))) %>%
group_by(group) %>%
do(sample_n(filter(myDataOK,group == .$group),size = .$n_inf,replace = TRUE))
## and combine!
myData2 <- rbind(myDataOK,myDataimputed)
## here are some various checks:
## same size as original?
nrow(myData2) == nrow(myData)
## all infinites replaced?
with(myData2,table(is.finite(value), is.infinite(value)))
## should be no *decreases* after shuffling.
## value x block combinations might increase but should never decrease
check1 <- myDataOK %>%
group_by(group,value) %>%
tally %>%
arrange(group,value)
check2 <- myData2 %>%
group_by(group,value) %>%
tally %>%
arrange(group,value)
if(any((check2$n-check1$n) < 0)) stop("something went wrong!")
## finally, the increases in group size should equal the number of missing values
Ninf <- myData %>%
group_by(group) %>%
summarise(n_inf = sum(!is.finite(value)))
if(any(tally(check2)$n - tally(check1)$n - Ninf$n_inf !=0) )
stop("group sizes changed!")

Resources