Use apply functions within %>% - r

Below I create a function that deletes a specific column if there is only one unique value in it. Can I somehow use lapply within %>% to avoid calling the function three times? Or even call the function for all columns?
df <- tibble(col1 = sample(1:6), col2 = sample(1:6), col3 = 3, col4 = 4)
condDelCol <- function(mycolumn, mydataframe) {
if(length(unique(mydataframe[[mycolumn]])) == 1) { mydataframe[[mycolumn]] = NULL }
mydataframe
}
df %>%
condDelCol("col2", .) %>%
condDelCol("col3", .) %>%
condDelCol("col4", .)

With dplyr, an option is select_if
library(dplyr)
df %>%
select_if(~ n_distinct(.) > 1)
# A tibble: 6 x 2
# col1 col2
# <int> <int>
#1 1 6
#2 6 1
#3 5 5
#4 3 4
#5 4 2
#6 2 3
Or another way is base R by looping over the columns with sapply, create a logical vector, extract the column names that have only single unique value and assign (<-) it to NULL
i1 <- sapply(df, function(x) length(unique(x)))
df[names(which(i1 == 1))] <- NULL
Or with Filter
Filter(var, df)

You could use this one as well. It ignores the columns for which the standard deviation is 0.
df[, sapply(df, sd) != 0]
# A tibble: 6 x 2
col1 col2
<int> <int>
1 1 3
2 5 6
3 6 1
4 2 2
5 3 4
6 4 5
or if you want to use the pipe operator
df %>%
select(which(sapply(df, sd) != 0))

Related

Select specific/all columns in rowwise

I have the following table:
col1
col2
col3
col4
1
2
1
4
5
6
6
3
My goal is to find the max value per each row, and then find how many times it was repeated in the same row.
The resulting table should look like this:
col1
col2
col3
col4
max_val
repetition
1
2
1
4
4
1
5
6
6
3
6
2
Now to achieve this, I am doing the following for Max:
df%>% rowwise%>%
mutate(max=max(col1:col4))
However, I am struggling to find the repetition. My idea is to use this pseudo code in mutate:
sum( "select current row entirely or only for some columns"==max). But I don't know how to select entire row or only some columns of it and use its content to do the check, i.e.: is it equal to the max. How can we do this in dplyr?
A dplyr approach:
library(dplyr)
df %>%
rowwise() %>%
mutate(max_val = max(across(everything())),
repetition = sum(across(col1:col4) == max_val))
# A tibble: 2 × 6
# Rowwise:
col1 col2 col3 col4 max_val repetition
<int> <int> <int> <int> <int> <int>
1 1 2 1 4 4 1
2 5 6 6 3 6 2
An R base approach:
df$max_val <- apply(df,1,max)
df$repetition <- rowSums(df[, 1:4] == df[, 5])
For other (non-tidyverse) readers, a base R approach could be:
df$max_val <- apply(df, 1, max)
df$repetition <- apply(df, 1, function(x) sum(x[1:4] == x[5]))
Output:
# col1 col2 col3 col4 max_val repetition
# 1 1 2 1 4 4 1
# 2 5 6 6 3 6 2
Although dplyr has added many tools for working across rows of data, it remains, in my mind at least, much easier to adhere to tidy principles and always convert the data to "long" format for these kinds of operations.
Thus, here is a tidy approach:
df %>%
mutate(row = row_number()) %>%
pivot_longer(cols = -row) %>%
group_by(row) %>%
mutate(max_val = max(value), repetitions = sum(value == max(value))) %>%
pivot_wider(id_cols = c(row, max_val, repetitions)) %>%
select(col1:col4, max_val, repetitions)
The last select() is just to get the columns in the order you want.

Conditional operations in each group

I have some groups of data and in each group there is one number that is a multiple of 7.
For each group, I want to subtract the first value from that multiple.
Reproducible example:
temp.df <- data.frame("temp" = c(48:55, 70:72, 93:99))
temp.df$group <- cumsum(c(TRUE, diff(temp.df$temp) > 1))
Expected result:
group 1: 49-48 = 1
group 2: 70-70 = 0
group 3: 98-93 = 5
Can you suggest me a way that do not require using any loop?
You can get the number divisible by 7 in each group and subtract it with first value.
This can be done in base R using aggregate.
aggregate(temp~group, temp.df, function(x) x[x %% 7 == 0] - x[1])
# group temp
#1 1 1
#2 2 0
#3 3 5
You can also do this using dplyr
library(dplyr)
temp.df %>%
group_by(group) %>%
summarise(temp = temp[temp %% 7 == 0] - first(temp))
and data.table
library(data.table)
setDT(temp.df)[, .(temp = temp[temp %% 7 == 0] - first(temp)), group]
We can also do
library(dplyr)
temp.df %>%
group_by(group) %>%
summarise(temp = temp[which.max(!temp %% 7)] - first(temp))
# A tibble: 3 x 2
# group temp
# <int> <int>
#1 1 1
#2 2 0
#3 3 5

Filtering rows based on two conditions at the ID level

I have long data where a given subject has 4 observations. I want to only include a given id that meets the following conditions:
has at least one 3
has at least one of 1,2 OR NA
My data structure:
df <- data.frame(id=c(1,1,1,1,2,2,2,2,3,3,3,3), a=c(NA,1,2,3, NA,3,2,0, NA,NA,1,1))
My unsuccessful attempt (I get an empty data frame):
df %>% dplyr::group_by(id) %>% filter(a==3 & a %in% c(1,2,NA))
An option is to group by 'id', create a logic to return single TRUE/FALSE as output. Based on the OP's post, we need both values '3' and either one of the values 1, 2, NA in the column 'a'. So, 3 %in% a returns a logical vector of length 1, then wrap any on the second set where we do a comparison with multiple values or check the NA elements (is.na), merge both logical output with &
library(dplyr)
df %>%
group_by(id) %>%
filter((3 %in% a) & any(c(1, 2) %in% a|is.na(a)) )
# A tibble: 8 x 2
# Groups: id [2]
# id a
# <dbl> <dbl>
#1 1 NA
#2 1 1
#3 1 2
#4 1 3
#5 2 NA
#6 2 3
#7 2 2
#8 2 0
I have done this a bit of a long way to show how an idea could work. You can consolidate this a bit.
df %>%
group_by(id) %>%
mutate(has_3 = sum(a == 3, na.rm = T) > 0,
keep_me = has_3 & (sum(is.na(a)) > 0 | sum(a %in% c(1, 2)) > 0)) %>%
filter(keep_me == TRUE) %>%
select(id, a)
id a
<dbl> <dbl>
1 1 NA
2 1 1
3 1 2
4 1 3
5 2 NA
6 2 3
7 2 2
8 2 0
As I read it, the filter should keep ids 1 and 2. So I would use combo of all/any:
df %>%
group_by(id) %>%
filter(all(3 %in% a) & any(c(1,2,NA) %in% a))

mutate_at does not create variable suffixes in some cases?

I have been playing with dplyr::mutate_at to create new variables by applying the same function to some of the columns. When I name my function in the .funs argument, the mutate call creates new columns with a suffix instead of replacing the existing ones, which is a cool option that I discovered in this thread.
df = data.frame(var1=1:2, var2=4:5, other=9)
df %>% mutate_at(vars(contains("var")), .funs=funs('sqrt'=sqrt))
#### var1 var2 other var1_sqrt var2_sqrt
#### 1 1 4 9 1.000000 2.000000
#### 2 2 5 9 1.414214 2.236068
However, I noticed that when the vars argument used to point my columns returns only one column instead of several, the resulting new column drops the initial name: it gets named sqrt instead of other_sqrt here:
df %>% mutate_at(vars(contains("other")), .funs=funs('sqrt'=sqrt))
#### var1 var2 other sqrt
#### 1 1 4 9 3
#### 2 2 5 9 3
I would like to understand why this behaviour happens, and how to avoid it because I don't know in advance how many columns the contains() will return.
EDIT:
The newly created columns must inherit the original name of the original columns, plus the suffix 'sqrt' at the end.
Thanks
Here is another idea. We can add setNames(sub("^sqrt$", "other_sqrt", names(.))) after the mutate_at call. The idea is to replace the column name sqrt with other_sqrt. The pattern ^sqrt$ should only match the derived column sqrt if there is only one column named other, which is demonstrated in Example 1. If there are more than one columns with other, such as Example 2, the setNames would not change the column names.
library(dplyr)
# Example 1
df <- data.frame(var1 = 1:2, var2 = 4:5, other = 9)
df %>%
mutate_at(vars(contains("other")), funs("sqrt" = sqrt(.))) %>%
setNames(sub("^sqrt$", "other_sqrt", names(.)))
# var1 var2 other other_sqrt
# 1 1 4 9 3
# 2 2 5 9 3
# Example 2
df2 <- data.frame(var1 = 1:2, var2 = 4:5, other1 = 9, other2 = 16)
df2 %>%
mutate_at(vars(contains("other")), funs("sqrt" = sqrt(.))) %>%
setNames(sub("^sqrt$", "other_sqrt", names(.)))
# var1 var2 other1 other2 other1_sqrt other2_sqrt
# 1 1 4 9 16 3 4
# 2 2 5 9 16 3 4
Or we can design a function to check how many columns contain the string other before manipulating the data frame.
mutate_sqrt <- function(df, string){
string_col <- grep(string, names(df), value = TRUE)
df2 <- df %>% mutate_at(vars(contains(string)), funs("sqrt" = sqrt(.)))
if (length(string_col) == 1){
df2 <- df2 %>% setNames(sub("^sqrt$", paste(string_col, "sqrt", sep = "_"), names(.)))
}
return(df2)
}
mutate_sqrt(df, "other")
# var1 var2 other other_sqrt
# 1 1 4 9 3
# 2 2 5 9 3
mutate_sqrt(df2, "other")
# var1 var2 other1 other2 other1_sqrt other2_sqrt
# 1 1 4 9 16 3 4
# 2 2 5 9 16 3 4
I just figured out a (not so clean) way to do it;
I add a extra dummy variable to the dataset, with a name that ensures that it will be selected and that we don't fall into the 1-variable case, and after the calculation I remove the 2 dummies, like this:
df %>% mutate(other_fake=NA) %>%
mutate_at(vars(contains("other")), .funs=funs('sqrt'=sqrt)) %>%
select(-contains("other_fake"))
#### var1 var2 other other_sqrt
#### 1 1 4 9 3
#### 2 2 5 9 3

How to mutate() a list of columns using map2() in dplyr

I recently had to compile a data frame of student scores (one row per student, id column and several integer-valued columns, one per score component). I had to combine a "master" data frame and several "correction" data frames (containing mostly NA and some updates to the master), so that the result contains the maximum values from the master, and all corrections.
I succeeded by copy-pasting a sequence of mutate() calls, which works (see example below), but is not elegant in my opinion. What I would have wanted to do, was instead of copying and pasting, to use something along the lines of map2 and two lists of columns to compare the columns pair-wise. Something like (which obviously does not work as such):
list_of_cols1 <- list(col1.x, col2.x, col3.x)
list_of_cols2 <- list(col1.y, col2.y, col3.y
map2(list_of_cols1, list_of_cols2, ~ column = pmax(.x, .y, na.rm=T))
I can't seem to be able to figure out to do it. My question is: how to specify such lists of columns and mutate them in one map2() call in dplyr pipe, or is it even possible – have I gotten it all wrong?
Minimum working example
library(tidyverse)
master <- tibble(
id=c(1,2,3),
col1=c(1,1,1),
col2=c(2,2,2),
col3=c(3,3,3)
)
correction1 <- tibble(
id=seq(1,3),
col1=c(NA, NA, 2 ),
col2=c( 1, NA, 3 ),
col3=c(NA, NA, NA)
)
result <- reduce(
# Ultimately there would several correction data frames
list(master, correction1),
function(x,y) {
x <- x %>%
left_join(
y,
by = c("id")
) %>%
# Wish I knew how to do this mutate call with map2
mutate(
col1 = pmax(col1.x, col1.y, na.rm=T),
col2 = pmax(col2.x, col2.y, na.rm=T),
col3 = pmax(col3.x, col3.y, na.rm=T)
) %>%
select(id, col1:col3)
}
)
The result is
> result
# A tibble: 3 x 4
id col1 col2 col3
<int> <dbl> <dbl> <dbl>
1 1 1 2 3
2 2 1 2 3
3 3 2 3 3
Rather than do a left_join, just bind the rows then summarize. For example
result <- reduce(
list(master, master),
function(x,y) {
bind_rows(x, y) %>%
group_by(id) %>%
summarize_all(max, na.rm=T)
}
)
result
# id col1 col2 col3
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2 3
# 2 2 1 2 3
# 3 3 2 3 3
Actually, you don't even need reduce as bind_rows can take a list
Adding another table
correction2 <- tibble(id=2,col1=NA,col2=8,col3=NA)
bind_rows(master, correction1, correction2) %>%
group_by(id) %>%
summarize_all(max, na.rm=T)
Sorry this doesn't answer your question about map2, I find it's easier to aggregate over rows than it is over columns in tidy R:
library(dplyr)
master <- tibble(
id=c(1,2,3),
col1=c(1,1,1),
col2=c(2,2,2),
col3=c(3,3,3)
)
correction1 <- tibble(
id=seq(1,3),
col1=c(NA, NA, 2 ),
col2=c( 1, NA, 3 ),
col3=c(NA, NA, NA)
)
result <- list(master, correction1) %>%
bind_rows() %>%
group_by(id) %>%
summarise_all(max, na.rm = TRUE)
result
#> # A tibble: 3 x 4
#> id col1 col2 col3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 2 3
#> 2 2 1 2 3
#> 3 3 2 3 3
If correction tables will always have the same structure as master, you can do something like the following:
library(dplyr)
library(purrr)
update_master = function(...){
map(list(...), as.matrix) %>%
reduce(pmax, na.rm = TRUE) %>%
data.frame()
}
update_master(master, correction1)
To allow id to take character values, make the following modification:
update_master = function(x, ...){
map(list(x, ...), function(x) as.matrix(x[-1])) %>%
reduce(pmax, na.rm = TRUE) %>%
data.frame(id = x[[1]], .)
}
update_master(master, correction1)
Result:
id col1 col2 col3
1 1 1 2 3
2 2 1 2 3
3 3 2 3 3

Resources