User Defined Function with Case When Reference - r

I'm trying to define a function that references a value in another cell according to its row (if that makes sense). Some generalized data would look like this:
col1 col2 col3
A 1 B
B 2 C
C 6 NA
My goal is the below, where "calc" is a sum of col2 from the respective row and the col2 value from the referenced row
col1 col2 calc
A 1 3
B 2 8
C 6 6
letters <- c("A","B","C")
calc<- case_when(data$col1 == "A" ~ subset(data$col2, data$col3 == "A"),
data$col1 == "B" ~ subset(data$col2, data$col3 == "B"),
data$col1 == "C" ~ subset(data$col2, data$col3 == "C"))
totals<- data.frame(cbind(data$col2,calc)) %>% rowSums(., na.rm = TRUE)
data.frame(cbind(data$col1,totals))
I'm not sure how to turn this into a function -- I tried the below,
# Attempted generalisation of the case_when() script above.
# Intended result: for every row, col2 plus the col2 value of the row that is
# referenced through col3.
# `x` is the vector of labels to look up (called below with `letters`).
# Reads the global data frame `data`; it is not passed in as an argument.
udfunction<- function(x){
# NOTE(review): `==` compares data$col1 with x element by element (recycling
# the shorter vector); it does not test each row against every value in x.
# Presumably this is the source of the "longer object length" warning below.
calc<- case_when(data$col1 == x ~ subset(data$col2, data$col3 == x))
# NOTE(review): unlike the script above, data$col1 is also bound into the
# frame that is handed to rowSums() here -- confirm this is intended.
totals<- data.frame(cbind(data$col1,data$col2,calc)) %>% rowSums(., na.rm = TRUE)
# Pair each col1 label with its total; this is the function's return value.
data.frame(cbind(data$col1,totals))
}
udfunction(letters)
then got the error:
In col1 == x : longer object length is not a multiple of shorter object length
udfunction(letters)
Any help would be very appreciated!

# match(col3, col1, 0) gives, for each row, the position in col1 of the label
# named in col3, or 0 when there is no match (e.g. col3 is NA).
# Adding 1 and indexing into c(0, col2) picks the referenced row's col2;
# unmatched rows pick the leading 0, so their calc is just col2.
transform(df, calc = col2 + c(0, col2)[match(col3, col1, 0) + 1])
col1 col2 col3 calc
1 A 1 B 3
2 B 2 C 8
3 C 6 <NA> 6

Related

If values in two columns satisfy two different conditions, store corresponding value of third column into a list in R

I have a data frame that looks like this:
col1 col2 col3 y
1 2 2 10
0 1 0 15
2 2 1 17
1 2 1 9
2 0 0 8
0 1 2 21
I want to store the value in y in a list if col1 and col2 meet two separate conditions.
In this example, my conditions are: if the value in col1 == 0 and the value in col2 == 1, then store the value of y in the list.
col3 is completely ignored. I just added it in here because my actual dataframe has many columns that I am not interested in.
So, in this example, the result would be a list of length 2 with the values "15" and "21" inside.
library(dplyr)
data <-
data.frame(
col1 = c(1,0,2,0),
col2 = c(2,1,2,1),
y = c(10,15,17,21)
)
data %>%
filter(col1 == 0 & col2 == 1) %>%
pull(y) %>%
as.list()
Here's an example from yours without using dplyr. You can use the subset function.
# Example data; col3 is present only to show that extra columns are ignored
df <- data.frame(
  col1 = c(1, 0, 2, 1, 2, 0),
  col2 = c(2, 1, 2, 2, 0, 1),
  col3 = c(2, 0, 1, 1, 0, 2),
  y = c(10, 15, 17, 9, 8, 21)
)
# subset() keeps only the rows for which both conditions hold
filtered_df <- subset(df, col1 == 0 & col2 == 1)
# Pull the "y" column of the remaining rows out as a plain vector
y_values <- filtered_df[["y"]]
# Show the result
print(y_values)
You can also use the [ operator
# Use the [ operator to extract rows where col1 == 0 and col2 == 1
filtered_df = df[df$col1 == 0 & df$col2 == 1,]
You can use select(y) + as.list(). With pull(y), as.list() gives you a list with one element for each value of y. If you want a single list element that holds all the values, use select(y) instead.
data <-
data.frame(
col1 = c(1,0,2,0),
col2 = c(2,1,2,1),
y = c(10,15,17,21)
)
data %>%
filter(col1 == 0 & col2 == 1) %>%
select(y)%>% as.list()

how to simplify repetitive mutate conditions

I have an example df:
df <- data.frame(
col1 = c(4,5,6,11),
col2 = c('b','b','c', 'b')
)
> df
col1 col2
1 4 b
2 5 b
3 6 c
4 11 b
and I mutate based on these conditions:
df2 <- df %>%
mutate(col3 = case_when(
col2 == 'b' & col1 == 4 ~ 10,
col2 == 'b' & col1 == 5 ~ 15,
col2 == 'b' & col1 == 11 ~ 20,
col2 == 'c' & col1 == 6 ~ 7)
)
> df2
col1 col2 col3
1 4 b 10
2 5 b 15
3 6 c 7
4 11 b 20
You can see that the first 3 conditions are repetitive in that they require col2 == 'b'. Is there some syntax or another package or more efficient way of combining same/similar conditions so that I don't need to repeat col2 == 'b'? Like a one liner that if col2 == 'b' then do these transformations.
You can write nested case_when
df %>% mutate(col3 = case_when(
col2=="b" ~ case_when(
col1 == 4 ~ 10,
col1 == 5 ~ 15,
col1 == 11 ~ 20),
col2=="c" ~ case_when(
col1 == 6 ~ 7)))
Another solution could be to use an auxiliary table. This approach gives up some of the flexibility of case_when and only works for equality matches, but I suspect it is a lot faster.
library(dplyr)
df <- data.frame(
col1 = c(4,5,6,11),
col2 = c('b','b','c', 'b')
)
choices <- data.frame(
col2 = c(rep("b",3),"c"),
col1 = c(4,5,11,6),
col3 = c(10,15,20,7)
)
df %>% left_join(choices, by = c("col1"="col1", "col2"="col2"))
#> col1 col2 col3
#> 1 4 b 10
#> 2 5 b 15
#> 3 6 c 7
#> 4 11 b 20
Be sure to capture the default unmatched cases.

dplyr ifelse mutate reference to variable outside the data frame

I have a simple problem but I haven't figured out the solution yet. I don't know how to refer to a variable outside the data frame when I'm using dplyr. Here is a small chunk of code:
library(dplyr)
var <- 1
df <- data.frame(col1 = c("a", "b", "c"), col2 = c(1, 2, 3))
df %>% mutate(col2 = ifelse(var == 1, col2 + var, col2))
Result:
col1 col2
1 a 2
2 b 2
3 c 2
Desired output:
col1 col2
1 a 2
2 b 3
3 c 4
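This is not a dplyr-specific issue: when the condition to check has length 1, use if and else instead of the vectorized ifelse. ifelse returns a result of the same length as its condition, so a length-1 condition yields only the first element (2), which mutate then recycles to every row.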
library(dplyr)
df %>% mutate(col2 = if(var == 1) col2 + var else col2)
# col1 col2
#1 a 2
#2 b 3
#3 c 4
We could use rowwise and sum
df %>%
rowwise() %>%
mutate(col2 = ifelse(var == 1, sum(col2,var), col2))
col1 col2
<chr> <dbl>
1 a 2
2 b 3
3 c 4
We could use base R for this
i1 <- df$col2 == var
df$col2[i1] <- df$col2[i1] + var
-output
> df
col1 col2
1 a 2
2 b 2
3 c 3
Or use data.table
library(data.table)
setDT(df)[col2 == var, col2 := col2 + var]

Declaring variables inside mutate

I am trying to declare the variables inside mutate using all_of but not getting proper output
asd <- data.frame(Col1 = c("A","B"), Col2 = c("R","E"))
a1 <- "Col1"
When I perform below operations, I get invalid output
asd %>% mutate(q1 = case_when(all_of(a1) == "A" ~ 1))
Col1 Col2 q1
1 A R NA
2 B E NA
Expected Output
asd %>% mutate(q1 = case_when(Col1 == "A" ~ 1))
Col1 Col2 q1
1 A R 1
2 B E NA
Or we could use glue::glue; just bear in mind that whatever you put inside the curly braces will be evaluated as R code:
library(glue)
asd %>%
mutate(q1 = case_when(
eval(parse(text = glue("{a1}"))) == "A" ~ 1
))
Col1 Col2 q1
1 A R 1
2 B E NA
Wrap it in get()
R> asd %>% mutate(q1 = case_when(all_of(get(a1)) == "A" ~ 1))
Col1 Col2 q1
1 A R 1
2 B E NA
We could use across
library(dplyr)
library(stringr)
asd %>%
mutate(across(all_of(a1), ~ case_when(. == 'A' ~ 1),
.names = "{str_replace(.col, '.*', 'q1')}"))
Col1 Col2 q1
1 A R 1
2 B E NA

Only Keep Certain Combinations of Predictors in a Dataframe

Imagine that I have a data frame like this:
> col1 <- rep(1:3,10)
> col2 <- rep(c("a","b"),15)
> col3 <- rnorm(30,10,2)
> sample_df <- data.frame(col1 = col1, col2 = col2, col3 = col3)
> head(sample_df)
col1 col2 col3
1 1 a 13.460322
2 2 b 3.404398
3 3 a 8.952066
4 1 b 11.148271
5 2 a 9.808366
6 3 b 9.832299
I only want to keep combinations of predictors which, together, have a col3 standard deviation below 2. I can find the combinations using ddply, but I don't know how to backtrack to the original DF and select the correct levels.
> sample_df_summ <- ddply(sample_df, .(col1, col2), summarize, sd = sd(col3), count = length(col3))
> head(sample_df_summ)
col1 col2 sd count
1 1 a 2.702328 5
2 1 b 1.032371 5
3 2 a 2.134151 5
4 2 b 3.348726 5
5 3 a 2.444884 5
6 3 b 1.409477 5
For clarity, in this example, I'd like to keep the rows of the DF with col1 = 3 and col2 = b, and those with col1 = 1 and col2 = b. How would I do this?
You can add a "keep" column that is TRUE only if the standard deviation is below 2. Then, you can use a left join (merge) to add the "keep" column to the initial dataframe. In the end, you just select with keep equal to TRUE.
# add the keep column
sample_df_summ$keep <- sample_df_summ$sd < 2
sample_df_summ$sd <- NULL
sample_df_summ$count <- NULL
# join and select the rows
sample_df_keep <- merge(sample_df, sample_df_summ, by = c("col1", "col2"), all.x = TRUE, all.y = FALSE)
sample_df_keep <- sample_df_keep[sample_df_keep$keep, ]
sample_df_keep$keep <- NULL
Using dplyr:
library(dplyr)
sample_df %>% group_by(col1, col2) %>% mutate(sd = sd(col3)) %>% filter(sd < 2)
You get:
#Source: local data frame [6 x 4]
#Groups: col1, col2
#
# col1 col2 col3 sd
#1 1 a 10.516437 1.4984853
#2 1 b 11.124843 0.8652206
#3 2 a 7.585740 1.8781241
#4 3 b 9.806124 1.6644076
#5 1 a 7.381209 1.4984853
#6 1 b 9.033093 0.8652206

Resources