Use na_if function in R with negative condition - r

I would like to replace values with NA in a data frame using na_if in column Value, conditional on column Category. But instead of the condition used below, I would like the replacement to happen when the value is not equal to "cat_1".
data_B <- data_A %>%
mutate(Value = na_if(Category, "cat_1"))
Can it be modified? Equality operators do not seem to work.
Note: na_if function keeps original values in a column whilst replacing part of them with NAs (it does not substitute Category values in the Value column in this example)

I don't think it is directly possible with na_if, but you can use replace + != instead, or case_when + ==:
library(dplyr)
data.frame(Category = paste0("cat_", 1:4)) %>%
mutate(Value = replace(Category, Category != "cat_1", NA),
Value2 = case_when(Category == "cat_1" ~ Category))
output
Category Value Value2
1 cat_1 cat_1 cat_1
2 cat_2 <NA> <NA>
3 cat_3 <NA> <NA>
4 cat_4 <NA> <NA>

If your variable is a factor, or you're willing to convert it to one:
# Keep only "cat_1" by re-levelling: values not listed in `levels` become NA.
df <- df |>
  mutate(
    # Refer to the column directly (not df$Category) so the expression uses
    # the data flowing through the pipe and stays correct under grouping.
    Value = factor(Category, levels = "cat_1"),
    Value2 = as.character(Value) # Converting factor to character
  )
# 'data.frame': 4 obs. of 3 variables:
# $ Category: Factor w/ 4 levels "cat_1","cat_2",..: 1 2 3 4
# $ Value : Factor w/ 1 level "cat_1": 1 NA NA NA
# $ Value2 : chr "cat_1" NA NA NA
# Category Value Value2
# 1 cat_1 cat_1 cat_1
# 2 cat_2 <NA> <NA>
# 3 cat_3 <NA> <NA>
# 4 cat_4 <NA> <NA>
Data:
df = data.frame(Category = factor(paste0("cat_", 1:4)))

In my opinion Maël's answer is the easiest solution, but another potential option is to create your own function; looking at the source code for na_if() you could Negate() the vec_equal() to create your own na_if_not() function and still retain the utility and behaviour of na_if(), i.e.
Simple example:
library(tidyverse)
library(vctrs)
# Inverse of dplyr::na_if(): returns `x` with every element that is NOT equal
# to `y` replaced by a missing value. Built from the same vctrs steps as
# na_if(), with the equality test negated.
na_if_not <- function(x, y) {
# Cast `y` to the type of `x`, then recycle it to the size of `x`.
y <- vec_cast(x = y, to = x, x_arg = "y", to_arg = "x")
y <- vec_recycle(y, size = vec_size(x), x_arg = "y")
# Missing value of the same type as `x`, used as the replacement.
na <- vec_init(x)
vec_not_equal <- Negate(vec_equal)
# na_equal = TRUE makes the comparison treat missing values as equal to each
# other, so `where` is always TRUE/FALSE and never NA.
where <- vec_not_equal(x, y, na_equal = TRUE)
x <- vec_assign(x, where, na)
x
}
df <- data.frame(Category = paste0("cat_", 1:4),
Value = paste0("cat_", 1:4),
Value2 = paste0("cat_", 1:4))
df %>%
mutate(Value = na_if_not(Value, "cat_1"),
Value2 = na_if_not(Category, "cat_1"))
#> Category Value Value2
#> 1 cat_1 cat_1 cat_1
#> 2 cat_2 <NA> <NA>
#> 3 cat_3 <NA> <NA>
#> 4 cat_4 <NA> <NA>
Created on 2022-09-30 by the reprex package (v2.0.1)
Replacing "setosa's" (na_if()) and "everything-but-setosa's" (na_if_not()) in place:
library(tidyverse)
library(vctrs)
# Inverse of dplyr::na_if(): returns `x` with every element that is NOT equal
# to `y` replaced by a missing value. Built from the same vctrs steps as
# na_if(), with the equality test negated.
na_if_not <- function(x, y) {
# Cast `y` to the type of `x`, then recycle it to the size of `x`.
y <- vec_cast(x = y, to = x, x_arg = "y", to_arg = "x")
y <- vec_recycle(y, size = vec_size(x), x_arg = "y")
# Missing value of the same type as `x`, used as the replacement.
na <- vec_init(x)
vec_not_equal <- Negate(vec_equal)
# na_equal = TRUE makes the comparison treat missing values as equal to each
# other, so `where` is always TRUE/FALSE and never NA.
where <- vec_not_equal(x, y, na_equal = TRUE)
x <- vec_assign(x, where, na)
x
}
# na_if() example
iris %>%
head() %>%
mutate(Species = c("Setosa", "virginica", "versicolor",
"Setosa", "virginica", "versicolor")) %>%
mutate(Species = na_if(Species, "Setosa"))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 <NA>
#> 2 4.9 3.0 1.4 0.2 virginica
#> 3 4.7 3.2 1.3 0.2 versicolor
#> 4 4.6 3.1 1.5 0.2 <NA>
#> 5 5.0 3.6 1.4 0.2 virginica
#> 6 5.4 3.9 1.7 0.4 versicolor
# na_if_not() example
iris %>%
head() %>%
mutate(Species = c("Setosa", "virginica", "versicolor",
"Setosa", "virginica", "versicolor")) %>%
mutate(Species = na_if_not(Species, "Setosa"))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 Setosa
#> 2 4.9 3.0 1.4 0.2 <NA>
#> 3 4.7 3.2 1.3 0.2 <NA>
#> 4 4.6 3.1 1.5 0.2 Setosa
#> 5 5.0 3.6 1.4 0.2 <NA>
#> 6 5.4 3.9 1.7 0.4 <NA>
Created on 2022-09-30 by the reprex package (v2.0.1)

Related

for loop to change columns with a specified unique length to factor in multiple dataframes

I have several dataframes for which I need to fix the classes of multiple columns, before I can proceed. Because the dataframes all have the same variables but the classes seemed to differ from one dataframe to the other, I figured I would go for a 'for loop'and specify the unique length upon which a column should be coded as factor or numeric.
I tried the following for factor:
dataframes <- list(dataframe1, dataframe2, dataframe2, dataframe3)
for (i in dataframes){
cols.to.factor <-sapply(i, function(col) length(unique(col)) < 6)
i[cols.to.factor] <- apply(i[cols.to.factor] , factor)
}
now the code runs, but it doesn't change anything. What am I missing?
Thanks for the help in advance!
The instruction
for(i in dataframes)
extracts i from the list dataframes and the loop changes the copy, that is never reassigned to the original. A way to correct the problem is
# Work by index so each modified copy is written back into the list.
for (i in seq_along(dataframes)) {
  x <- dataframes[[i]]
  # vapply() always returns a logical vector, even for a zero-column data
  # frame, where sapply() would return list() and break the subsetting below.
  cols.to.factor <- vapply(
    x,
    function(col) length(unique(col)) < 6,
    logical(1)
  )
  x[cols.to.factor] <- lapply(x[cols.to.factor], factor)
  dataframes[[i]] <- x
}
An equivalent lapply based solution is
# Rebuild the list, converting low-cardinality columns of each data frame
# (fewer than 6 distinct values) to factors.
dataframes <- lapply(dataframes, \(x) {
  # vapply() guarantees a logical vector even when `x` has no columns,
  # where sapply() would return list().
  cols.to.factor <- vapply(
    x,
    function(col) length(unique(col)) < 6,
    logical(1)
  )
  x[cols.to.factor] <- lapply(x[cols.to.factor], factor)
  x
})
library(tidyverse)
# example data
list(
iris,
iris %>% mutate(Sepal.Length = Sepal.Length %>% as.character())
) %>%
# unify column classes
map(~ .x %>% mutate(across(everything(), as.character))) %>%
# optional joining if wished
bind_rows() %>%
mutate(Species = Species %>% as.factor()) %>%
as_tibble()
#> # A tibble: 300 x 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <chr> <chr> <chr> <chr> <fct>
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#> 7 4.6 3.4 1.4 0.3 setosa
#> 8 5 3.4 1.5 0.2 setosa
#> 9 4.4 2.9 1.4 0.2 setosa
#> 10 4.9 3.1 1.5 0.1 setosa
#> # … with 290 more rows
Created on 2021-10-05 by the reprex package (v2.0.1)

How to replace data in current columns using mutate?

I want to group my dataframe by year and standardize certain columns (in this case BioTest, MathExam, and WritingScore) and replace the old data with the new data. Below is an example of my dataframe:
DF:
Var1 Var2 Year BioTest MathExam WritingScore Var3 Var 4
X X 2016 165 140 10 X X
X X 2017 172 128 11 X X
X X 2018 169 115 8 X X
X X 2016 166 139 10 X X
X X 2017 165 140 12 X X
I have tried variations of the following code:
DF<- DF %>% group_by(Year)%>% mutate(across(BioTest:WritingScore),scale)
DF<- DF %>% group_by(Year)%>% mutate(across(select(BioTest:WritingScore)),scale)
What I get in return is the same DF without any changes. What I want is:
DF:
Var1 Var2 Year BioTest MathExam WritingScore Var3 Var 4
X X 2016 NewData NewData NewData X X
X X 2017 NewData NewData NewData X X
X X 2018 NewData NewData NewData X X
X X 2016 NewData NewData NewData X X
X X 2017 NewData NewData NewData X X
Any help is much appreciated.
The issue could be that dplyr::mutate was masked by plyr::mutate (in addition to the fact that, in the OP's code, across() is closed before the function is supplied). The masking can be reproduced with:
iris %>%
group_by(Species) %>%
plyr::mutate(across(where(is.numeric), scale))
# A tibble: 150 x 5
# Groups: Species [3]
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <dbl> <dbl> <dbl> <dbl> <fct>
# 1 5.1 3.5 1.4 0.2 setosa
# 2 4.9 3 1.4 0.2 setosa
# 3 4.7 3.2 1.3 0.2 setosa
# 4 4.6 3.1 1.5 0.2 setosa
# 5 5 3.6 1.4 0.2 setosa
# 6 5.4 3.9 1.7 0.4 setosa
# 7 4.6 3.4 1.4 0.3 setosa
# 8 5 3.4 1.5 0.2 setosa
# 9 4.4 2.9 1.4 0.2 setosa
#10 4.9 3.1 1.5 0.1 setosa
# … with 140 more rows
which is the same as the initial 'iris' dataset
Now, check with the correct dplyr::mutate
iris %>%
group_by(Species) %>%
dplyr::mutate(across(where(is.numeric), scale))
# A tibble: 150 x 5
# Groups: Species [3]
# Sepal.Length[,1] Sepal.Width[,1] Petal.Length[,1] Petal.Width[,1] Species
# <dbl> <dbl> <dbl> <dbl> <fct>
# 1 0.267 0.190 -0.357 -0.436 setosa
# 2 -0.301 -1.13 -0.357 -0.436 setosa
# 3 -0.868 -0.601 -0.933 -0.436 setosa
# 4 -1.15 -0.865 0.219 -0.436 setosa
# 5 -0.0170 0.454 -0.357 -0.436 setosa
# 6 1.12 1.25 1.37 1.46 setosa
# 7 -1.15 -0.0739 -0.357 0.512 setosa
# 8 -0.0170 -0.0739 0.219 -0.436 setosa
# 9 -1.72 -1.39 -0.357 -0.436 setosa
#10 -0.301 -0.865 0.219 -1.39 setosa
# … with 140 more rows
So, in the OP's code, we just need to use dplyr::mutate or restart a fresh R session with only dplyr loaded
DF %>%
group_by(Year)%>%
dplyr::mutate(across(BioTest:WritingScore, scale))
scale returns a matrix with some attributes. If we only need the numeric vector part, we can either use as.vector or as.numeric
# Standardise each score column within its Year group.
DF %>%
  group_by(Year) %>%
  # scale() returns a one-column matrix; as.numeric() keeps only the values.
  dplyr::mutate(across(BioTest:WritingScore, ~ as.numeric(scale(.))))
NOTE: The select is not needed within across
Maybe try this. The issue is in your across() statement: the function must be passed inside it:
library(dplyr)
#Code
DF %>%
group_by(Year) %>%
mutate(across(BioTest:WritingScore,~scale(.)[,1]))
Output:
# A tibble: 5 x 9
# Groups: Year [3]
Var1 Var2 Year BioTest[,1] MathExam[,1] WritingScore[,1] Var3 Var X4
<chr> <chr> <int> <dbl> <dbl> <dbl> <chr> <chr> <lgl>
1 X X 2016 -0.707 0.707 NaN X X NA
2 X X 2017 0.707 -0.707 -0.707 X X NA
3 X X 2018 NaN NaN NaN X X NA
4 X X 2016 0.707 -0.707 NaN X X NA
5 X X 2017 -0.707 0.707 0.707 X X NA
Some data used:
#Data
DF <- structure(list(Var1 = c("X", "X", "X", "X", "X"), Var2 = c("X",
"X", "X", "X", "X"), Year = c(2016L, 2017L, 2018L, 2016L, 2017L
), BioTest = c(165L, 172L, 169L, 166L, 165L), MathExam = c(140L,
128L, 115L, 139L, 140L), WritingScore = c(10L, 11L, 8L, 10L,
12L), Var3 = c("X", "X", "X", "X", "X"), Var = c("X", "X", "X",
"X", "X"), X4 = c(NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-5L))

How to apply all objects of a list to a function which mutates a dplyr tibble?

I have a data set of variables which I want to perform some standard mutations to (e.g. sum, divide). I have a list which specifies which variables I want to be divided by which.
I want to know if it's possible to apply a custom mutate function to all of the names in the list, iteratively mutating the tibble (i.e. not creating a new tibble like lapply would).
For example, below I want Sepal.Width and Petal.Length to be scaled by (i.e. divided by) Sepal.Length, and similarly Petal.Width and Petal.Length to be scaled by Sepal.Width.
scale_variables_by = list(Sepal.Length = c("Sepal.Width", "Petal.Length"),
Sepal.Width = c("Petal.Width", "Petal.Length"))
To do so, I have two functions.
The first scales one variable by another and renames the new column as "original"_"div"_"scalar":
# Scales one or more variables by another.
# df: data frame; variables: character vector of column names to divide;
# scalar: name (string) of the column to divide by; separator: middle part of
# the new column names. Adds one column per variable, named
# "<variable>_<separator>_<scalar>".
scale_by <- function(df, variables, scalar, separator = "div") {
# Turn the column name into a symbol so it can be evaluated as a column below.
scalar_s <- sym(scalar)
df %>%
# New "temp" columns hold variable / scalar, or NA where the scalar is 0.
mutate_at(variables, list(temp = ~if_else(eval(scalar_s) != 0,
./eval(scalar_s),
NA_real_))) %>%
# Rename every column ending in "temp" to its final name.
rename_at(vars(ends_with("temp")), ~paste(variables, separator, scalar,
sep = "_"))
}
The second takes a list, accesses all of the variables specified to be scaled by a specific variable, and scales them by that variable.
# Takes a list with specified variable/scalar combinations and applies scale_by
# to the entry named `scalar_l`: the variables stored under that name are
# scaled by the column of the same name.
scale_by_list <- function(df, input_list, scalar_l) {
df %>%
scale_by(variables = input_list[[scalar_l]], scalar = scalar_l)
}
Putting it all together it works like this:
library(dplyr)
# Scales one variable by another
scale_by <- function(df, variables, scalar, separator = "div") {
scalar_s <- sym(scalar)
df %>%
mutate_at(variables, list(temp = ~if_else(eval(scalar_s) != 0,
./eval(scalar_s),
NA_real_))) %>%
rename_at(vars(ends_with("temp")), ~paste(variables, separator, scalar,
sep = "_"))
}
# Takes a list with specified variable/scalar combinations and applies scale_by
# to them
scale_by_list <- function(df, input_list, scalar_l) {
df %>%
scale_by(variables = input_list[[scalar_l]], scalar = scalar_l)
}
scale_variables_by = list(Sepal.Length = c("Sepal.Width", "Petal.Length"),
Sepal.Width = c("Petal.Width", "Petal.Length"))
iris %>%
as_tibble() %>%
scale_by_list(input_list = scale_variables_by, scalar_l = "Sepal.Length") %>%
scale_by_list(input_list = scale_variables_by, scalar_l = "Sepal.Width") %>%
select(Sepal.Width_div_Sepal.Length, everything())
#> # A tibble: 150 x 9
#> Sepal.Width_div~ Sepal.Length Sepal.Width Petal.Length Petal.Width
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.686 5.1 3.5 1.4 0.2
#> 2 0.612 4.9 3 1.4 0.2
#> 3 0.681 4.7 3.2 1.3 0.2
#> 4 0.674 4.6 3.1 1.5 0.2
#> 5 0.72 5 3.6 1.4 0.2
#> 6 0.722 5.4 3.9 1.7 0.4
#> 7 0.739 4.6 3.4 1.4 0.3
#> 8 0.68 5 3.4 1.5 0.2
#> 9 0.659 4.4 2.9 1.4 0.2
#> 10 0.633 4.9 3.1 1.5 0.1
#> # ... with 140 more rows, and 4 more variables: Species <fct>,
#> # Petal.Length_div_Sepal.Length <dbl>,
#> # Petal.Width_div_Sepal.Width <dbl>, Petal.Length_div_Sepal.Width <dbl>
Created on 2019-08-01 by the reprex package (v0.2.1)
The result is what I want (I have four new columns which have the scaled variables), except I would like to call scale_by_list for all objects in the list, rather than manually creating a call for each one.
lapply can do this however it creates multiple different tibbles in a list, which I suppose could be join but I feel like there's a better way to do it.
The solution I'm leaning towards for pipeability:
scale_by <- function(df, variables, scalar, separator = "div") {
scalar_s <- sym(scalar)
df %>%
mutate_at(variables, list(temp = ~if_else(eval(scalar_s) != 0,
./eval(scalar_s),
NA_real_))) %>%
rename_at(vars(ends_with("temp")), ~paste(variables, separator, scalar,
sep = "_")) %>%
select(paste(variables, separator, scalar, sep = "_"))
}
iris %>%
bind_cols(
names(scale_variables_by) %>%
map_dfc(scale_by_list, df = as_tibble(iris),
input_list = scale_variables_by))
If I have understood you correctly, you have got scale_variables_by list and for every element in the list you want to divide (or any other operation) it by the name of that list. We can do this using base R with Map
iris_df <- iris
new_names <- c(mapply(function(x, y) paste0(x, "_div_", y),
scale_variables_by, names(scale_variables_by)))
iris_df[new_names] <- do.call(cbind, Map(function(x, y)
iris_df[x]/iris_df[rep(y, length(x))], scale_variables_by, names(scale_variables_by)))
head(iris_df)
#Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Width_div_Sepal.Length
#1 5.1 3.5 1.4 0.2 setosa 0.6863
#2 4.9 3.0 1.4 0.2 setosa 0.6122
#3 4.7 3.2 1.3 0.2 setosa 0.6809
#4 4.6 3.1 1.5 0.2 setosa 0.6739
#5 5.0 3.6 1.4 0.2 setosa 0.7200
#6 5.4 3.9 1.7 0.4 setosa 0.7222
# Petal.Length_div_Sepal.Length Petal.Width_div_Sepal.Width Petal.Length_div_Sepal.Width
#1 0.2745 0.05714 0.4000
#2 0.2857 0.06667 0.4667
#3 0.2766 0.06250 0.4062
#4 0.3261 0.06452 0.4839
#5 0.2800 0.05556 0.3889
#6 0.3148 0.10256 0.4359
This gives the same output as your attempt and is also scalable since it will also work if you include more number of names in each list provided those names are present in your dataframe.
If you want to take the tidyverse approach, the code becomes a bit shorter with imap
iris_df[new_names] <- purrr::imap_dfc(scale_variables_by,
~iris_df[.x]/iris_df[rep(.y, length(.x))])
which is same as using map2 in the following way
iris_df[new_names] <- purrr::map2_dfc(scale_variables_by, names(scale_variables_by),
~iris_df[.x]/iris_df[rep(.y, length(.x))])

R dplyr: Mutate with Reduce with init, after group_by

Is it possible to specify an initial value for Reduce without it being added into the dataframe?
For example, with function:
# Accumulator step: a negative `y` multiplies the running value `x` by -y;
# any other `y` is added to it.
f <- function(x, y) {
  if (y < 0) {
    return(-x * y)
  }
  x + y
}
Acting on data frame:
set.seed(0)
df <- c(-0.9, sample(c(-0.9, 1:3), 9, replace = TRUE)) %>% tibble()
names(df) <- "x"
df <- df %>% mutate(id = 'a')
df$id[6:10] <- 'b'
df <- df %>% group_by(id) %>% mutate(sumprod = Reduce(f, x, acc=TRUE)) %>% ungroup()
df$target <- c(0, 3, 4, 5, 7, 3, 2.7, 5.7, 8.7, 10.7)
df
# A tibble: 10 x 4
x id sumprod target
<dbl> <chr> <dbl> <dbl>
1 -0.9 a -0.9 0.0
2 3.0 a 2.1 3.0
3 1.0 a 3.1 4.0
4 1.0 a 4.1 5.0
5 2.0 a 6.1 7.0
6 3.0 b 3.0 3.0
7 -0.9 b 2.7 2.7
8 3.0 b 5.7 5.7
9 3.0 b 8.7 8.7
10 2.0 b 10.7 10.7
The goal is column target. I've tried using init with Reduce, however that adds an extra element.
Reduce(f, df$x[1:5], acc=TRUE, init=0)
[1] 0 0 3 4 5 7
Using this within mutate produces an error:
> df <- df %>% group_by(id) %>% mutate(sumprod = Reduce(f, x, acc=TRUE, init=0)) %>% ungroup()
Error in mutate_impl(.data, dots) :
Column `sumprod` must be length 5 (the group size) or one, not 6
If init is given, Reduce logically adds it to the start (when proceeding left to right) or the end of x, respectively. If you don't need the element, you can use tail(..., -1) to remove the first element:
df %>%
group_by(id) %>%
mutate(sumprod = tail(Reduce(f, x, acc=TRUE, init=0), -1)) %>%
ungroup()
# A tibble: 10 x 4
# x id sumprod target
# <dbl> <chr> <dbl> <dbl>
# 1 -0.9 a 0.0 0.0
# 2 3.0 a 3.0 3.0
# 3 1.0 a 4.0 4.0
# 4 1.0 a 5.0 5.0
# 5 2.0 a 7.0 7.0
# 6 3.0 b 3.0 3.0
# 7 -0.9 b 2.7 2.7
# 8 3.0 b 5.7 5.7
# 9 3.0 b 8.7 8.7
#10 2.0 b 10.7 10.7
With tidyverse, there is accumulate from purrr
library(tidyverse)
df %>%
group_by(id) %>%
mutate(sumprod = accumulate(.x = x, .f = f, .init = 0)[-1]) %>%
ungroup
# A tibble: 10 x 3
# x id sumprod
# <dbl> <chr> <dbl>
# 1 -0.9 a 0.0
# 2 3.0 a 3.0
# 3 1.0 a 4.0
# 4 1.0 a 5.0
# 5 2.0 a 7.0
# 6 3.0 b 3.0
# 7 -0.9 b 2.7
# 8 3.0 b 5.7
# 9 3.0 b 8.7
#10 2.0 b 10.7

dplyr - using mutate() like rowmeans()

I can't find the answer anywhere.
I would like to calculate new variable of data frame which is based on mean of rows.
For example:
data <- data.frame(id=c(101,102,103), a=c(1,2,3), b=c(2,2,2), c=c(3,3,3))
I want to use mutate to make variable d which is mean of a,b and c. And I would like to be able to make that by selecting columns in way d=mean(a,b,c), and also I need to use range of variables (like in dplyr) d=mean(a:c).
And of course
mutate(data, c=mean(a,b))
or
mutate(data, c=rowMeans(a,b))
doesn't work.
Can you give me some tip?
Regards
You're looking for
data %>%
rowwise() %>%
mutate(c=mean(c(a,b)))
# id a b c
# (dbl) (dbl) (dbl) (dbl)
# 1 101 1 2 1.5
# 2 102 2 2 2.0
# 3 103 3 2 2.5
or
library(purrr)
data %>%
rowwise() %>%
mutate(c=lift_vd(mean)(a,b))
dplyr is badly suited to operate on this kind of data because it assumes tidy data format and — for the problem in question — your data is untidy.
You can of course tidy it first:
tidy_data = tidyr::gather(data, name, value, -id)
Which looks like this:
id name value
1 101 a 1
2 102 a 2
3 103 a 3
4 101 b 2
5 102 b 2
6 103 b 2
…
And then:
tidy_data %>% group_by(id) %>% summarize(mean = mean(value))
id mean
(dbl) (dbl)
1 101 2.000000
2 102 2.333333
3 103 2.666667
Of course this discards the original data. You could use mutate instead of summarize to avoid this. Finally, you can then un-tidy your data again:
tidy_data %>%
group_by(id) %>%
mutate(mean = mean(value)) %>%
tidyr::spread(name, value)
id mean a b c
(dbl) (dbl) (dbl) (dbl) (dbl)
1 101 2.000000 1 2 3
2 102 2.333333 2 2 3
3 103 2.666667 3 2 3
Alternatively, you could summarise and then merge the result with the original table:
tidy_data %>%
group_by(id) %>%
summarize(mean = mean(value)) %>%
inner_join(data, by = 'id')
The result is the same in either case. I conceptually prefer the second variant.
I think the answer suggesting using data.frame or slicing on . is the best, but could be made simpler and more dplyr-ish like so:
data %>% mutate(c = rowMeans(select(., a,b)))
Or if you want to avoid ., with the penalty of having two inputs to your pipeline:
data %>% mutate(c = rowMeans(select(data, a,b)))
And yet another couple of ways, useful if you have the numeric positions or vector names of the columns to be summarised:
data %>% mutate(d = rowMeans(.[, 2:4]))
or
data %>% mutate(d = rowMeans(.[, c("a","b","c")]))
I think this is the dplyr-ish way. First, I'd create a function:
# Element-wise mean of the vectors supplied in `...`: their running sum
# divided by how many vectors were passed.
my_rowmeans <- function(...) {
  cols <- list(...)
  Reduce(`+`, cols) / length(cols)
}
Then, it can be used inside mutate:
data %>% mutate(rms = my_rowmeans(a, b))
# id a b c rms
# 1 101 1 2 3 1.5
# 2 102 2 2 3 2.0
# 3 103 3 2 3 2.5
# or
data %>% mutate(rms = my_rowmeans(a, b, c))
# id a b c rms
# 1 101 1 2 3 2.000000
# 2 102 2 2 3 2.333333
# 3 103 3 2 3 2.666667
To deal with the possibility of NAs, the function must be uglified:
# Element-wise mean of the vectors passed in `...`.
# With na.rm = TRUE, NAs count as zero in the sum and are left out of the
# per-element count; with na.rm = FALSE the inputs are summed as they are, so
# any NA makes that element of the result NA.
my_rowmeans <- function(..., na.rm = TRUE) {
  cols <- list(...)
  # Per-element count of non-missing values, accumulated input by input.
  n_present <- Reduce(function(acc, col) acc + !is.na(col), cols, init = 0)
  if (na.rm) {
    # Zero-fill NAs (in each input's own class) so they add nothing.
    cols <- lapply(cols, function(col) {
      replace(col, is.na(col), as(0, class(col)))
    })
  }
  Reduce(`+`, cols) / n_present
}
# alternately...
# Same row-wise mean, delegating to rowMeans() on the column-bound inputs.
my_rowmeans2 <- function(..., na.rm = TRUE) {
  mat <- cbind(...)
  rowMeans(mat, na.rm = na.rm)
}
# new example
data$b[2] <- NA
data %>% mutate(rms = my_rowmeans(a,b,na.rm=FALSE))
id a b c rms
1 101 1 2 3 1.5
2 102 2 NA 3 NA
3 103 3 2 3 2.5
data %>% mutate(rms = my_rowmeans(a,b))
id a b c rms
1 101 1 2 3 1.5
2 102 2 NA 3 2.0
3 103 3 2 3 2.5
The downside to the my_rowmeans2 is that it coerces to a matrix. I'm not certain that this will always be slower than the Reduce approach, though.
Another simple possibility that needs little code is:
data %>%
mutate(c= rowMeans(data.frame(a,b)))
# id a b c
# 1 101 1 2 1.5
# 2 102 2 2 2.0
# 3 103 3 2 2.5
As rowMeans needs something like a matrix or a data.frame, you can use data.frame(var1, var2, ...) instead of c(var1, var2, ...). If you have NAs in your data you'll need to tell R what to do, for example to remove them: rowMeans(data.frame(a,b), na.rm=TRUE)
If you'd like to use a pivot_longer()-style solution:
data%>%
pivot_longer(cols=-id)%>%
group_by(id)%>%
mutate(mean=mean(value))%>%
pivot_wider(names_from=name, values_from=value)
Note that this requires the tidyr package.
This is my preference for the fact that I only need to type the name of my ID column, and don't have to worry about column indices or names otherwise. Good for a quick copy-and-point-this-at-different-data solution, though the same can be said of other answers here. Also good for cases where you might have more than one column with categorical information and haven't created a single unique identifier column.
For what it's worth, I found that this solution is very easily modified to ignore NA values with simple addition of na.rm=TRUE in the mean calculation.
For example:
data <- data.frame(id=c(101,102,103), a=c(NA,2,3), b=c(2,2,2), c=c(3,3,3))
data%>%
pivot_longer(cols=-id)%>%
group_by(id)%>%
mutate(mean=mean(value,na.rm=TRUE))%>%
pivot_wider(names_from = name, values_from=value)
You can use a wrapper function around rowMeans() to make it easier to work with. The one below lets you specify na.rm, and you can use tidyselect to choose your columns if you want.
# Wrapper around rowMeans(): combines the vectors or data frames passed in
# `...` into one data frame and returns the mean of each row. `na.rm` is
# forwarded to rowMeans().
means <- function(..., na.rm = FALSE) {
  columns <- data.frame(...)
  rowMeans(columns, na.rm = na.rm)
}
library(dplyr)
# Example data
iris2 <- iris %>%
head() %>%
transmute(Sepal.Length = replace(Sepal.Length,
sample(c(TRUE, FALSE), nrow(.),
replace = TRUE),
NA),
Sepal.Width,
Petal.Length,
Petal.Width) %>%
print()
#> Sepal.Length Sepal.Width Petal.Length Petal.Width
#> 1 NA 3.5 1.4 0.2
#> 2 NA 3.0 1.4 0.2
#> 3 NA 3.2 1.3 0.2
#> 4 4.6 3.1 1.5 0.2
#> 5 NA 3.6 1.4 0.2
#> 6 5.4 3.9 1.7 0.4
# Basic usage
iris2 %>%
mutate(mean_sepal = means(Sepal.Length, Sepal.Width))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width mean_sepal
#> 1 NA 3.5 1.4 0.2 NA
#> 2 NA 3.0 1.4 0.2 NA
#> 3 NA 3.2 1.3 0.2 NA
#> 4 4.6 3.1 1.5 0.2 3.85
#> 5 NA 3.6 1.4 0.2 NA
#> 6 5.4 3.9 1.7 0.4 4.65
# If you want to exclude NAs
iris2 %>%
mutate(mean_sepal = means(Sepal.Length, Sepal.Width, na.rm = TRUE))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width mean_sepal
#> 1 NA 3.5 1.4 0.2 3.50
#> 2 NA 3.0 1.4 0.2 3.00
#> 3 NA 3.2 1.3 0.2 3.20
#> 4 4.6 3.1 1.5 0.2 3.85
#> 5 NA 3.6 1.4 0.2 3.60
#> 6 5.4 3.9 1.7 0.4 4.65
# You can also use select() and choose columns using tidyselect
iris2 %>%
mutate(mean_sepal = means(select(., contains("Sepal")), na.rm = TRUE))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width mean_sepal
#> 1 NA 3.5 1.4 0.2 3.50
#> 2 NA 3.0 1.4 0.2 3.00
#> 3 NA 3.2 1.3 0.2 3.20
#> 4 4.6 3.1 1.5 0.2 3.85
#> 5 NA 3.6 1.4 0.2 3.60
#> 6 5.4 3.9 1.7 0.4 4.65
Created on 2022-01-13 by the reprex package (v2.0.1)

Resources