Suppose I have this tibble with an arbitrary number of variable pairs x and x_var, y and y_var, etc.
dt <- tibble(x = 1:3,
y = 2:4,
z = 3:5,
x_var = rep(0.1, 3),
y_var = rep(0.2, 3),
z_var = rep(0.3, 3))
I was attempting to calculate x + x_var, y + y_var, etc all in one go, using mutate-across.
I tried
dt %>%
mutate(across(.cols = all_of(c("x", "y", "z")),
.names = "{col}_sum",
function(x) x + !!rlang::sym(paste0(cur_column(), "_var"))))
but this does not seem to work. I do not want to hard-code variable names. I know it can be done via pivoting, but I'm curious whether mutate/across can do the trick somehow.
You’re on the right track with paste0(cur_column(), "_var"). Instead of using sym(), use your computed column name to index into cur_data():
library(dplyr)
dt %>%
mutate(across(
.cols = c(x, y, z),
.fns = \(x) x + cur_data()[[paste0(cur_column(), "_var")]],
.names = "{col}_sum"
))
# A tibble: 3 × 9
x y z x_var y_var z_var x_sum y_sum z_sum
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 0.1 0.2 0.3 1.1 2.2 3.3
2 2 3 4 0.1 0.2 0.3 2.1 3.2 4.3
3 3 4 5 0.1 0.2 0.3 3.1 4.2 5.3
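Note that cur_data() was deprecated in dplyr 1.1.0 in favour of pick(). On a newer version, the same indexing trick should work with pick() (a sketch on the same data; otherwise identical to the code above):
library(dplyr)
dt %>%
  mutate(across(
    .cols = c(x, y, z),
    # pick(everything()) returns the current columns as a tibble, so it can be
    # indexed by the computed "<col>_var" name, just like cur_data() above
    .fns = \(x) x + pick(everything())[[paste0(cur_column(), "_var")]],
    .names = "{col}_sum"
  ))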
If the columns are in the same order, we could do
library(dplyr)
dt %>%
mutate(across(x:z, .names = "{.col}_sum") +
across(ends_with("_var")))
-output
# A tibble: 3 × 9
x y z x_var y_var z_var x_sum y_sum z_sum
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 0.1 0.2 0.3 1.1 2.2 3.3
2 2 3 4 0.1 0.2 0.3 2.1 3.2 4.3
3 3 4 5 0.1 0.2 0.3 3.1 4.2 5.3
Another option is to loop across one set of columns and, adapting the OP's code, retrieve the matching column by its pasted name with get():
dt %>%
mutate(across(x:z, ~ .x +
get(paste0(cur_column(), "_var")), .names = "{.col}_sum"))
-output
# A tibble: 3 × 9
x y z x_var y_var z_var x_sum y_sum z_sum
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 0.1 0.2 0.3 1.1 2.2 3.3
2 2 3 4 0.1 0.2 0.3 2.1 3.2 4.3
3 3 4 5 0.1 0.2 0.3 3.1 4.2 5.3
Or use dplyover
library(dplyover)
library(stringr)
dt %>%
mutate(across2(x:z, x_var:z_var, ~ .x + .y,
.names_fn = ~ str_replace(.x, "_._var", "_sum")))
-output
# A tibble: 3 × 9
x y z x_var y_var z_var x_sum y_sum z_sum
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 0.1 0.2 0.3 1.1 2.2 3.3
2 2 3 4 0.1 0.2 0.3 2.1 3.2 4.3
3 3 4 5 0.1 0.2 0.3 3.1 4.2 5.3
Or this is much simpler in base R
dt[paste0(names(dt)[1:3], "_sum")] <- dt[1:3] + dt[4:6]
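The positional indexing assumes the *_var columns sit directly after x, y and z. A name-based base R variant (a sketch, assuming every column has a matching *_var partner) avoids depending on column order:
vars <- c("x", "y", "z")
# add each column to its *_var partner and store the result as *_sum
dt[paste0(vars, "_sum")] <- dt[vars] + dt[paste0(vars, "_var")]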
I would like to replace values with NA in column Value of a data frame using na_if, conditional on column Category. But instead of the condition used below, I would like to replace values where Category is not equal to "cat_1".
data_B <- data_A %>%
mutate(Value = na_if(Category, "cat_1"))
Can it be modified? Equality operators do not seem to work.
Note: the na_if function keeps a column's original values while replacing some of them with NAs (it does not substitute Category values into the Value column in this example).
I don't think it is directly possible with na_if, but you can use replace + != instead, or case_when + ==:
library(dplyr)
data.frame(Category = paste0("cat_", 1:4)) %>%
mutate(Value = replace(Category, Category != "cat_1", NA),
Value2 = case_when(Category == "cat_1" ~ Category))
output
Category Value Value2
1 cat_1 cat_1 cat_1
2 cat_2 <NA> <NA>
3 cat_3 <NA> <NA>
4 cat_4 <NA> <NA>
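if_else() gives the same result and reads much like the original na_if() attempt (a sketch on the same toy data; NA_character_ keeps the replacement type-stable for a character column):
data.frame(Category = paste0("cat_", 1:4)) %>%
  mutate(Value3 = if_else(Category == "cat_1", Category, NA_character_))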
If your variable is a factor, or you're willing to convert:
df <- df |>
mutate(
    Value = factor(Category, levels = "cat_1"),
Value2 = as.character(Value) # Converting factor to character
)
# 'data.frame': 4 obs. of 3 variables:
# $ Category: Factor w/ 4 levels "cat_1","cat_2",..: 1 2 3 4
# $ Value : Factor w/ 1 level "cat_1": 1 NA NA NA
# $ Value2 : chr "cat_1" NA NA NA
# Category Value Value2
# 1 cat_1 cat_1 cat_1
# 2 cat_2 <NA> <NA>
# 3 cat_3 <NA> <NA>
# 4 cat_4 <NA> <NA>
Data:
df = data.frame(Category = factor(paste0("cat_", 1:4)))
In my opinion Maël's answer is the easiest solution, but another option is to write your own function: looking at the source code for na_if(), you can Negate() the vec_equal() call to create a na_if_not() that retains the utility and behaviour of na_if(), i.e.
Simple example:
library(tidyverse)
library(vctrs)
na_if_not <- function(x, y) {
y <- vec_cast(x = y, to = x, x_arg = "y", to_arg = "x")
y <- vec_recycle(y, size = vec_size(x), x_arg = "y")
na <- vec_init(x)
vec_not_equal <- Negate(vec_equal)
where <- vec_not_equal(x, y, na_equal = TRUE)
x <- vec_assign(x, where, na)
x
}
df <- data.frame(Category = paste0("cat_", 1:4),
Value = paste0("cat_", 1:4),
Value2 = paste0("cat_", 1:4))
df %>%
mutate(Value = na_if_not(Value, "cat_1"),
Value2 = na_if_not(Category, "cat_1"))
#> Category Value Value2
#> 1 cat_1 cat_1 cat_1
#> 2 cat_2 <NA> <NA>
#> 3 cat_3 <NA> <NA>
#> 4 cat_4 <NA> <NA>
Created on 2022-09-30 by the reprex package (v2.0.1)
Replacing "setosa's" (na_if()) and "everything-but-setosa's" (na_if_not()) in place:
library(tidyverse)
library(vctrs)
na_if_not <- function(x, y) {
y <- vec_cast(x = y, to = x, x_arg = "y", to_arg = "x")
y <- vec_recycle(y, size = vec_size(x), x_arg = "y")
na <- vec_init(x)
vec_not_equal <- Negate(vec_equal)
where <- vec_not_equal(x, y, na_equal = TRUE)
x <- vec_assign(x, where, na)
x
}
# na_if() example
iris %>%
head() %>%
mutate(Species = c("Setosa", "virginica", "versicolor",
"Setosa", "virginica", "versicolor")) %>%
mutate(Species = na_if(Species, "Setosa"))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 <NA>
#> 2 4.9 3.0 1.4 0.2 virginica
#> 3 4.7 3.2 1.3 0.2 versicolor
#> 4 4.6 3.1 1.5 0.2 <NA>
#> 5 5.0 3.6 1.4 0.2 virginica
#> 6 5.4 3.9 1.7 0.4 versicolor
# na_if_not() example
iris %>%
head() %>%
mutate(Species = c("Setosa", "virginica", "versicolor",
"Setosa", "virginica", "versicolor")) %>%
mutate(Species = na_if_not(Species, "Setosa"))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 Setosa
#> 2 4.9 3.0 1.4 0.2 <NA>
#> 3 4.7 3.2 1.3 0.2 <NA>
#> 4 4.6 3.1 1.5 0.2 Setosa
#> 5 5.0 3.6 1.4 0.2 <NA>
#> 6 5.4 3.9 1.7 0.4 <NA>
Created on 2022-09-30 by the reprex package (v2.0.1)
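If the strict casting and recycling behaviour of the vctrs-based version is not needed, a much shorter wrapper around replace() behaves the same way for simple atomic vectors (a sketch, not a drop-in replacement for every na_if() edge case):
na_if_not <- function(x, y) replace(x, !is.na(x) & x != y, NA)
na_if_not(c("cat_1", "cat_2", "cat_3"), "cat_1")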
I am running an experiment where participants are randomly assigned to one of two conditions, and then I collect data on several variables. Here is an example of my code:
df <- data.frame(condition =c(1,1,1,1,1,-1,-1,-1,-1,-1),
var1 = c(6,6,4,7,5,6,6,6,4,7),
var2 = c(3,4,3,6,7,1,2,1,2,5),
var3 = c(2,2,6,6,7,1,7,7,3,1),
var4 = c(6,4,3,6,4,1,3,3,4,4))
df$condition = factor(df$condition, levels = c(-1,1),labels = c("Digital","Physical"))
For each variable (var1, var2, etc.) I would like a little table with the count, mean, and standard deviation. This code creates the kind of table that I want:
group_by(df, condition) %>%
summarise(
count = n(),
mean = mean(var1),
sd = sd(var1))
But because I have many variables, I would like to use some kind of loop (or "lapply"?) to create all these tables at once. It would also be great if each table could show the name of the variable. Thanks!
You can just use summarise on all the variables, i.e.
library(dplyr)
group_by(df, condition) %>%
summarise(across(everything(), ~ c(count = n(), mean = mean(.), sd = sd(.))))
`summarise()` has grouped output by 'condition'. You can override using the `.groups` argument.
# A tibble: 6 x 5
# Groups: condition [2]
condition var1 var2 var3 var4
<fct> <dbl> <dbl> <dbl> <dbl>
1 Digital 5 5 5 5
2 Digital 5.8 2.2 3.8 3
3 Digital 1.10 1.64 3.03 1.22
4 Physical 5 5 5 5
5 Physical 5.6 4.6 4.6 4.6
6 Physical 1.14 1.82 2.41 1.34
You can control the output structure by changing the object returned in the formula, e.g. returning a data.frame instead of a vector:
group_by(df, condition) %>%
summarise(across(everything(), ~ data.frame(count = n(), mean = mean(.), sd = sd(.))))
# A tibble: 2 x 5
condition var1$count $mean $sd var2$count $mean $sd var3$count $mean $sd var4$count $mean $sd
<fct> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl>
1 Digital 5 5.8 1.10 5 2.2 1.64 5 3.8 3.03 5 3 1.22
2 Physical 5 5.6 1.14 5 4.6 1.82 5 4.6 2.41 5 4.6 1.34
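A side note: in dplyr 1.1.0 and later, returning more than one row per group from summarise() (as in the first variant above) is deprecated; reframe() is the intended replacement and produces the same shape of output (a sketch, assuming dplyr >= 1.1.0):
group_by(df, condition) %>%
  reframe(across(everything(), ~ c(count = n(), mean = mean(.x), sd = sd(.x))))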
We could still do it with summarise(), using a list of functions:
library(dplyr)
df %>%
group_by(condition) %>%
  summarise(across(starts_with("var"),
                   .fns = list(n = ~ n(),
                               mean = ~ mean(.x, na.rm = TRUE),
                               sd = ~ sd(.x, na.rm = TRUE))))
condition var1_n var1_mean var1_sd var2_n var2_mean var2_sd var3_n var3_mean var3_sd var4_n var4_mean var4_sd
<dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl>
1 -1 5 5.8 1.10 5 2.2 1.64 5 3.8 3.03 5 3 1.22
2 1 5 5.6 1.14 5 4.6 1.82 5 4.6 2.41 5 4.6 1.34
df <- data.frame(condition =c(1,1,1,1,1,-1,-1,-1,-1,-1),
var1 = c(6,6,4,7,5,6,6,6,4,7),
var2 = c(3,4,3,6,7,1,2,1,2,5),
var3 = c(2,2,6,6,7,1,7,7,3,1),
var4 = c(6,4,3,6,4,1,3,3,4,4))
df$condition = factor(df$condition, levels = c(-1,1),labels = c("Digital","Physical"))
for (var in names(df)[2:length(names(df))]){
tab <- group_by(df, condition) %>%
select(c("condition", var)) %>%
    dplyr::rename(v = all_of(var)) %>%
summarise(
count = n(),
mean = mean(v),
sd = sd(v)
)
print(var)
print(tab)
}
gives
[1] "var1"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 5.8 1.10
2 Physical 5 5.6 1.14
[1] "var2"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 2.2 1.64
2 Physical 5 4.6 1.82
[1] "var3"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 3.8 3.03
2 Physical 5 4.6 2.41
[1] "var4"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 3 1.22
2 Physical 5 4.6 1.34
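Since the question mentions lapply(), the same loop body can also be wrapped in a function over the variable names, returning a named list of tables instead of printing them (a sketch using the .data pronoun):
tabs <- lapply(names(df)[-1], function(var) {
  df %>%
    group_by(condition) %>%
    summarise(count = n(),
              mean = mean(.data[[var]]),
              sd = sd(.data[[var]]))
})
names(tabs) <- names(df)[-1]
tabs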
Rather than lapply, the function of choice here is aggregate, a close relative of the *apply family. Supply a custom function f:
f <- \(x) c(n=length(x), mu=mean(x), sd=sd(x))
aggregate(. ~ condition, df, f)
# condition var1.n var1.mu var1.sd var2.n var2.mu var2.sd var3.n var3.mu var3.sd var4.n var4.mu var4.sd
# 1 Digital 5.000000 5.800000 1.095445 5.000000 2.200000 1.643168 5.000000 3.800000 3.033150 5.000000 3.000000 1.224745
# 2 Physical 5.000000 5.600000 1.140175 5.000000 4.600000 1.816590 5.000000 4.600000 2.408319 5.000000 4.600000 1.341641
If you want to aggregate on a specific set of variables (e.g. assembled with grep), use list notation instead.
aggregate(df[grep('^var', names(df))], df['condition'], f)
You can use gtsummary here if you need to present the results.
Example one below will make one table with all of your variables. Example two will split each variable into its own table (if you need them to be separate).
library(gtsummary)
#example one:
tbl_summary(df, by = condition,
type = list(everything()~"continuous"),
statistic = list(all_continuous()~"{mean} ({sd}) "))
#example two:
tbl_summary(df, by = condition,
type = list(everything()~"continuous"),
statistic = list(all_continuous()~"{mean} ({sd}) ")) %>%
tbl_split(variables = c(var1, var2,var3,var4))
I have the following data frame:
df<- splitstackshape::stratified(iris, group="Species", size=1)
I want to make a z-score for each species including all of the variables. I can do this manually by finding the SD and mean for each row and using the appropriate formula, but I need to do this several times over and would like to find a more efficient way.
I tried using scale(), but can't figure out how to get it to do the row-wise calculation that includes several variables and a grouping variable.
Using dplyr::group_by returns a "'x' must be numeric variable" error.
Are you sure you want a single z-score for each group? A z-score is computed for each value.
Let's say the function to compute a z-score could be either:
scale(x, center = TRUE, scale = TRUE)
Or
function_zscore <- function(x){ x <- x[!is.na(x)]; (x - mean(x)) / sd(x) }
Both functions take a vector x and return a vector of z-scores.
df<- splitstackshape::stratified(iris, group="Species", size=1)
df <- tidyr::pivot_longer(df, cols = c(1:4), names_to = "var.name", values_to = "value")
df %>%
group_by(Species) %>%
mutate(zscore = scale(value, center = TRUE, scale = TRUE)[,1])
## A tibble: 12 x 4
## Groups: Species [3]
# Species var.name value zscore
# <fct> <chr> <dbl> <dbl>
# 1 setosa Sepal.Length 4.9 1.22
# 2 setosa Sepal.Width 3.1 0.332
# 3 setosa Petal.Length 1.5 -0.455
# 4 setosa Petal.Width 0.2 -1.09
# 5 versicolor Sepal.Length 5.9 1.10
# 6 versicolor Sepal.Width 3.2 -0.403
# 7 versicolor Petal.Length 4.8 0.486
# 8 versicolor Petal.Width 1.8 -1.18
# 9 virginica Sepal.Length 6.5 1.14
#10 virginica Sepal.Width 3 -0.574
#11 virginica Petal.Length 5.2 0.501
#12 virginica Petal.Width 2 -1.06
If you still want a single per-group score describing how the values spread around the mean, one option is the coefficient of variation:
df %>%
group_by(Species) %>%
summarise(coef.var = 100*sd(value)/mean(value))
## A tibble: 3 x 2
# Species coef.var
# <fct> <dbl>
#1 setosa 83.8
#2 versicolor 45.8
#3 virginica 49.0
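If you prefer to stay in the wide layout, scale() can still do the row-wise work: it standardises columns, so transpose, scale, and transpose back (a sketch; df_wide re-creates the wide sample, since df was reshaped to long above):
df_wide <- as.data.frame(splitstackshape::stratified(iris, group = "Species", size = 1))
num_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
# scale() works column-wise, so transpose to make each row a column
z <- t(scale(t(as.matrix(df_wide[num_cols]))))
data.frame(Species = df_wide$Species, z)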
I'd like to fit models to a grouped data frame and then predict one new value per model (i.e. group).
library(dplyr)
library(broom)
data(iris)
dat <- rbind(iris, iris)
dat$Group <- rep(c("A", "B"), each = 150)
new.dat <- data.frame(Group = rep(c("A", "B"), each = 3),
Species = rep(c("setosa", "versicolor", "virginica"), times = 2),
Sepal.Width = 1:6)
> new.dat
  Group    Species Sepal.Width
1     A     setosa           1
2     A versicolor           2
3     A  virginica           3
4     B     setosa           4
5     B versicolor           5
6     B  virginica           6
However, augment returns 36 rows, as if each new value is fit with each model. How can I preserve the grouping here and get one fitted value per group?
dat %>%
group_by(Species, Group) %>%
do(augment(lm(Sepal.Length ~ Sepal.Width, data = .), newdata = new.dat))
# A tibble: 36 x 5
# Groups: Species, Group [6]
Group Species Sepal.Width .fitted .se.fit
<fct> <fct> <int> <dbl> <dbl>
1 A setosa 1 3.33 0.221
2 A versicolor 2 4.02 0.133
3 A virginica 3 4.71 0.0512
4 B setosa 4 5.40 0.0615
5 B versicolor 5 6.09 0.145
6 B virginica 6 6.78 0.234
7 A setosa 1 3.33 0.221
8 A versicolor 2 4.02 0.133
9 A virginica 3 4.71 0.0512
10 B setosa 4 5.40 0.0615
# ... with 26 more rows
(Note that due to the example data the rows are actually duplicates, which is however not the case with my original data).
You need to make the Species and Group of new.dat match those of the group currently being processed in do. You can do this like so:
group.cols <- c("Species", "Group")
dat %>%
  group_by(!!!rlang::syms(group.cols)) %>%
do(augment(lm(Sepal.Length ~ Sepal.Width, data = .),
newdata = semi_join(new.dat, ., by = group.cols)))
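As do() has been superseded in recent dplyr versions, the same per-group fit-and-predict can be written with nest() and map2() (a sketch, assuming tidyr and purrr in addition to the packages loaded above):
library(tidyr)
library(purrr)
dat %>%
  nest(data = -c(Species, Group)) %>%                  # one row (and one model) per group
  inner_join(nest(new.dat, new = -c(Species, Group)),  # the matching new data per group
             by = c("Species", "Group")) %>%
  mutate(pred = map2(data, new,
                     ~ augment(lm(Sepal.Length ~ Sepal.Width, data = .x),
                               newdata = .y))) %>%
  select(Species, Group, pred) %>%
  unnest(pred)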