group_by and apply code to each element in group using conditions - r

I have data like so:
ID membership AdultChild
1 1 A
2 1 A
3 2 A
4 2 C
5 2 C
6 3 A
7 3 A
: : :
I want to group by membership and apply a 'code' after counting the AdultChild variable i.e.
ID membership AdultChild code
1 1 A x1
2 1 A x1
3 2 A x2
4 2 C x2
5 2 C x2
6 3 A x1
7 3 A x1
: : : :
I will have conditions similar to:
count <- function(x){
if(sum(x == "A") == 2 && sum(x == "C") == 0){
code <<- x1
}else if (sum(x == "A") == 1 & sum(x == "C") >= 1){
code <<- x2
}else {
code <<- X3
}
I have tried using dplyr to group and mutate, using the function above to add a new variable called code. I also thought about using the aggregate function but didn't have much luck.
df.2 <- df %>% group_by(membership)
%>% mutate(n = count(AdultChild)) %>%
ungroup()
df.2 <- aggregate.data.frame(df, by = membership, FUN =
count(df$AdultChild))
Basically, I want a new variable which be decided using certain conditions and applied to each ID when grouped by membership.
Thanks in advance.

library(dplyr)
df %>% group_by(membership) %>%
mutate(code=case_when(
sum(AdultChild=='A', na.rm = T)==2 & sum(AdultChild=='C', na.rm = T)==0 ~ 'X1',
sum(AdultChild=='A', na.rm = T)==1 & sum(AdultChild=='C', na.rm = T)>=1 ~ 'X2',
TRUE ~ 'X3'
))
# A tibble: 7 x 4
# Groups: membership [3]
ID membership AdultChild code
<int> <int> <fct> <chr>
1 1 1 A X1
2 2 1 A X1
3 3 2 A X2
4 4 2 C X2
5 5 2 C X2
6 6 3 A X1
7 7 3 A X1

count <- function(x){
if(sum(x == "A", na.rm = T) == 2 & sum(x == "C", na.rm = T) == 0){
y <- "4"
} else if (sum(x == "A", na.rm = T) > 2 & sum(x == "C", na.rm = T) == 0){
y <- "5"
}else if (sum(x == "A", na.rm = T) == 1 & sum(x == "C", na.rm = T) >= 1){
y <- "6"
}else if (sum(x == "A", na.rm = T) == 2 & sum(x == "C", na.rm = T) <= 3 & sum(x == "C", na.rm = T) >= 1){
y <- "7"
}else {
y <- "8"
}
}
df.2 <- df %>% group_by(membership) %>% mutate(code = count(AdultChild)) %>% ungroup()

Related

User Defined Function with Case When Reference

I'm trying to define a function that references a value in another cell according to its row (if that makes sense). Some generalized data would look like this:
col1 col2 col3
A 1 B
B 2 C
C 6 NA
My goal is the below, where "calc" is a sum of col2 from the respective row and the col2 value from the referenced row
col1 col2 calc
A 1 3
B 2 8
C 6 6
letters <- c("A","B","C")
calc<- case_when(data$col1 == "A" ~ subset(data$col2, data$col3 == "A"),
data$col1 == "B" ~ subset(data$col2, data$col3 == "B"),
data$col1 == "C" ~ subset(data$col2, data$col3 == "C"))
totals<- data.frame(cbind(data$col2,calc)) %>% rowSums(., na.rm = TRUE)
data.frame(cbind(data$col1,totals))
I'm not sure how to turn this into a function -- I tried the below,
udfunction<- function(x){
calc<- case_when(data$col1 == x ~ subset(data$col2, data$col3 == x))
totals<- data.frame(cbind(data$col1,data$col2,calc)) %>% rowSums(., na.rm = TRUE)
data.frame(cbind(data$col1,totals))
}
udfunction(letters)
then got the error:
In col1 == x : longer object length is not a multiple of shorter object length
udfunction(letters)
Any help would be very appreciated!
transform(df, calc = col2 + c(0, col2)[match(col3, col1, 0) + 1])
col1 col2 col3 calc
1 A 1 B 3
2 B 2 C 8
3 C 6 <NA> 6

Generate random binary variable conditionally in R

I would like to add an extra column, z based on the following conditions:
if x == "A", generate a binary variable assuming the prob of success (=1) is 0.5
if x == "C" & y == "N", generate a binary variable assuming the prob of success is 0.25.
# Sample data
df <- tibble(
x = ("A", "C", "C", "B", "C", "A", "A"),
y = ("Y", "N", "Y", "N", "N", "N", "Y"))
Currently, my approach uses filter, then set.seed and rbinom, and finally rbind. But I am looking for a more elegant solution that doesn't involve subseting and re-joining the data.
This is a good case for dplyr::case_when since you are using tidyverse functions.
library(dplyr)
set.seed(1)
df %>%
mutate(z = case_when(x == "A" ~ rbinom(n(), 1, 0.5),
x == "C" & y == "N" ~ rbinom(n(), 1, 0.25)))
# A tibble: 7 x 3
# Rowwise:
x y z
<chr> <chr> <int>
1 A Y 0
2 C N 1
3 C Y NA
4 B N NA
5 C N 0
6 A N 0
7 A Y 1
You may put your logic into a simple if / else structure and wrap it in a function g().
g <- \(z) {
if (z['x'] == 'A') {
rbinom(1, 1, .5)
}
else if (z['x'] == 'C' & z['y'] == 'N') {
rbinom(1, 1, .25)
} else {
NA
}
}
set.seed(42)
transform(df, z=apply(df, 1, g))
# x y z
# 1 A Y 1
# 2 C N 1
# 3 C Y NA
# 4 B N NA
# 5 C N 0
# 6 A N 1
# 7 A Y 1
You can try nested ifelse like below
transform(
df,
z = suppressWarnings(
rbinom(
nrow(df), 1,
ifelse(x == "A", 0.5,
ifelse(x == "C" & y == "N", 0.25, NA)
)
)
)
)
which gives
x y z
1 A Y 1
2 C N 0
3 C Y NA
4 B N NA
5 C N 1
6 A N 1
7 A Y 1

Put column name as a value across several columns, under condition

I have a dataset looking like that:
set.seed(123)
test_data <- data.frame(
id = c("a", "b", "c", "d", "e"),
x = sample(c(0,1), 5, replace = T),
y = sample(c(0,1), 5, replace = T)
)
> test_data
id x y
1 a 0 1
2 b 0 1
3 c 0 1
4 d 1 0
5 e 0 0
For the columns x and y, if the value is equal to 1, the value is replaced by the name of the column. In my example, I would like to have:
id x y
1 a <NA> y
2 b <NA> y
3 c <NA> y
4 d x <NA>
5 e <NA> <NA>
The thing that I don't know how many columns should be treated this way. Basically, I know that the first column ("id") is not concerned, but after this column, I could have any number of columns (even 0) that need to be treated this way.
I tried something like that but it doesn't work:
library(dplyr)
test_data %>%
mutate(
across(
.cols = 1:last_col(),
.funs = function(x) {
ifelse(x == 1, as.character(x), NA)
}
)
)
How can I do that? A dplyr answer is preferred.
You can do this way also:
library(tidyverse)
## define a function for your job
fn <- function(x, name){
return(ifelse(x ==1, name, NA))
}
test_data %>%
select(-id) %>%
map2_dfr(., names(.), ~fn(.x, .y)) %>%
bind_cols('id'= test_data$id, .)
Another version could be:
fn <- function(x, name){
if(name != 'id'){
return(ifelse(x ==1, name, NA))
} else {
return(x)
}
}
test_data %>%
map2_dfr(., names(.), ~fn(.x, .y))
Here is a try using purrr
library(dplyr)
library(purrr)
col_to_fix <- names(test_data)[2:length(test_data)]
walk(.x = col_to_fix, .f = function(x) {
# Note that I used <<- assigment here to change the test_data in global
test_data[[x]] <<- case_when(
test_data[[x]] == 1 ~ x,
TRUE ~ NA_character_
)
})
Output
> test_data
id x y
1 a <NA> y
2 b <NA> y
3 c <NA> y
4 d x <NA>
5 e <NA> <NA>

How do I see how many combinations of specific values there are in columns

I have a dataset that looks like this:
Group ID
UP 1
UP 1
UP 2
UP 2
UP 2
UP 1
UP 1
UP 2
UP 2
UP 1
UP 1
Is there any way to see how many times a 1 is under a 1 in the ID column?
Does this work:
library(dplyr)
df %>% mutate(flag = case_when(ID == 1 & lag(ID) == 1 ~ 1, TRUE ~ 0)) %>% pull(flag) %>% sum
[1] 3
Base R :
sum(df$ID == 1 & c(tail(df$ID, -1), NA) == 1, na.rm = TRUE)
#[1] 3
You can also use dplyr::lag and data.table::shift
sum(df$ID == 1 & dplyr::lag(df$ID) == 1, na.rm = TRUE)
sum(df$ID == 1 & data.table::shift(df$ID) == 1, na.rm = TRUE)

Returning a tibble: how to vectorize with case_when?

I have a function which returns a tibble. It runs OK, but I want to vectorize it.
library(tidyverse)
tibTest <- tibble(argX = 1:4, argY = 7:4)
square_it <- function(xx, yy) {
if(xx >= 4){
tibble(x = NA, y = NA)
} else if(xx == 3){
tibble(x = as.integer(), y = as.integer())
} else if (xx == 2){
tibble(x = xx^2 - 1, y = yy^2 -1)
} else {
tibble(x = xx^2, y = yy^2)
}
}
It runs OK in a mutate when I call it with map2, giving me the result I wanted:
tibTest %>%
mutate(sq = map2(argX, argY, square_it)) %>%
unnest()
## A tibble: 3 x 4
# argX argY x y
# <int> <int> <dbl> <dbl>
# 1 1 7 1 49
# 2 2 6 3 35
# 3 4 4 NA NA
My first attempt to vectorize it failed, and I can see why - I can't return a vector of tibbles.
square_it2 <- function(xx, yy){
case_when(
x >= 4 ~ tibble(x = NA, y = NA),
x == 3 ~ tibble(x = as.integer(), y = as.integer()),
x == 2 ~ tibble(x = xx^2 - 1, y = yy^2 -1),
TRUE ~ tibble(x = xx^2, y = yy^2)
)
}
# square_it2(4, 2) # FAILS
My next attempt runs OK on a simple input. I can return a list of tibbles, and that's what I want for the unnest
square_it3 <- function(xx, yy){
case_when(
xx >= 4 ~ list(tibble(x = NA, y = NA)),
xx == 3 ~ list(tibble(x = as.integer(), y = as.integer())),
xx == 2 ~ list(tibble(x = xx^2 - 1, y = yy^2 -1)),
TRUE ~ list(tibble(x = xx^2, y = yy^2))
)
}
square_it3(4, 2)
# [[1]]
# # A tibble: 1 x 2
# x y
# <lgl> <lgl>
# 1 NA NA
But when I call it in a mutate, it doesn't give me the result I had with square_it. I can sort of see what's
wrong. In the xx == 2 clause, xx acts as an atomic value of 2. But in
building the tibble, xx is a length-4 vector.
tibTest %>%
mutate(sq = square_it3(argX, argY)) %>%
unnest()
# # A tibble: 9 x 4
# argX argY x y
# <int> <int> <dbl> <dbl>
# 1 1 7 1 49
# 2 1 7 4 36
# 3 1 7 9 25
# 4 1 7 16 16
# 5 2 6 0 48
# 6 2 6 3 35
# 7 2 6 8 24
# 8 2 6 15 15
# 9 4 4 NA NA
How do I get the same result as I did with square_it, but from a vectorized function using case_when ?
We define row_case_when which has a similar formula interface as case_when except it has a first argument of .data, acts by row and expects that the value of each leg to be a data frame. It returns a data.frame/tibble. Wrapping in a list, rowwise and unnest are not needed.
case_when2 <- function (.data, ...) {
fs <- dplyr:::compact_null(rlang:::list2(...))
n <- length(fs)
if (n == 0) {
abort("No cases provided")
}
query <- vector("list", n)
value <- vector("list", n)
default_env <- rlang:::caller_env()
quos_pairs <- purrr::map2(fs, seq_along(fs), dplyr:::validate_formula,
rlang:::default_env, rlang:::current_env())
for (i in seq_len(n)) {
pair <- quos_pairs[[i]]
query[[i]] <- rlang::eval_tidy(pair$lhs, data = .data, env = default_env)
value[[i]] <- rlang::eval_tidy(pair$rhs, data = .data, env = default_env)
if (!is.logical(query[[i]])) {
abort_case_when_logical(pair$lhs, i, query[[i]])
}
if (query[[i]]) return(value[[i]])
}
}
row_case_when <- function(.data, ...) {
.data %>%
group_by(.group = 1:n(), !!!.data) %>%
do(case_when2(., ...)) %>%
mutate %>%
ungroup %>%
select(-.group)
}
Test run
It is used like this:
library(dplyr)
tibTest <- tibble(argX = 1:4, argY = 7:4) # test data from question
tibTest %>%
row_case_when(argX >= 4 ~ tibble(x = NA, y = NA),
argX == 3 ~ tibble(x = as.integer(), y = as.integer()),
argX == 2 ~ tibble(x = argX^2 - 1, y = argY^2 -1),
TRUE ~ tibble(x = argX^2, y = argY^2)
)
giving:
# A tibble: 3 x 4
argX argY x y
<int> <int> <dbl> <dbl>
1 1 7 1 49
2 2 6 3 35
3 4 4 NA NA
mutate_cond and mutate_when
These are not quite the same as row_case_when since they don't run through conditions taking the first true one but by using mutually exclusive conditions they can be used for certain aspects of this problem. They do not handle changing the number of rows in the result but we can use dplyr::filter to remove rows for a particular condition.
mutate_cond defined in dplyr mutate/replace several columns on a subset of rows is like mutate except the second argument is a condition and the subsequent arguments are applied only to rows for which that condition is TRUE.
mutate_when defined in
dplyr mutate/replace several columns on a subset of rows is similar to case_when except it applies to rows, the replacement values are provided in a list and alternate arguments are conditions and lists. Also all legs are always run applying the replacement values to the rows satisfying the conditions (as opposed to, for each row, performing the replacement on just the first true leg). To get a similar effect to row_case_when be sure that the conditions are mutually exclusive.
# mutate_cond example
tibTest %>%
filter(argX != 3) %>%
mutate(x = NA_integer_, y = NA_integer_) %>%
mutate_cond(argX == 2, x = argX^2 - 1L, y = argY^2 - 1L) %>%
mutate_cond(argX < 2, x = argX^2, y = argY^2)
# mutate_when example
tibTest %>%
filter(argX != 3) %>%
mutate_when(TRUE, list(x = NA_integer_, y = NA_integer_),
argX == 2, list(x = argX^2 - 1L, y = argY^2 - 1L),
argX < 2, list(x = argX^2, y = argY^2))
You need to ensure you are creating a 1-row tibble with each call of the function, then vectorize that.
This works whether you have rowwise groups or not.
You can do this with switch wrapped in a map2:
Here's a reprex:
library(tidyverse)
tibTest <- tibble(argX = 1:4, argY = 7:4)
square_it <- function(xx, yy) {
map2(xx, yy, function(x, y){
switch(which(c(x >= 4,
x == 3,
x == 2,
x < 4 & x != 3 & x != 2)),
tibble(x = NA, y = NA),
tibble(x = as.integer(), y = as.integer()),
tibble(x = x^2 - 1, y = y^2 -1),
tibble(x = x^2, y = y^2))})
}
tibTest %>% mutate(sq = square_it(argX, argY)) %>% unnest(cols = sq)
#> # A tibble: 3 x 4
#> argX argY x y
#> <int> <int> <dbl> <dbl>
#> 1 1 7 1 49
#> 2 2 6 3 35
#> 3 4 4 NA NA
Created on 2020-05-16 by the reprex package (v0.3.0)

Resources