Sample data
dat <- data.frame(Sim.Y1 = rnorm(10), Sim.Y2 = rnorm(10),
                  Sim.Y3 = rnorm(10), obsY = rnorm(10),
                  ID = sample(1:10, 10), ID_s = rep(1:2, each = 5))
For the following vector, I want to calculate the mean across ID_s
simVec <- c('Sim.Y1.cor','Sim.Y2.cor')
for (s in simVec) {
  simRef <- simVec[s]
  simID <- unlist(strsplit(simRef, split = '.cor', fixed = TRUE))[1]
  # this works
  dat %>% dplyr::group_by(ID_s) %>%
    dplyr::summarise(meanMod = mean(Sim.Y1))
  # this doesn't work
  dat %>% dplyr::group_by(ID_s) %>%
    dplyr::summarise(meanMod = mean(!!(simID)))
}
How do I refer to a column in dplyr without using its explicit name?
Note that your particular task can be performed without any non-standard evaluation by using summarise_at(), which works directly with strings:
simIDs <- stringr::str_split(simVec, ".cor") %>% purrr::map_chr(1)
# [1] "Sim.Y1" "Sim.Y2"
dat %>% dplyr::group_by(ID_s) %>% dplyr::summarise_at(simIDs, mean)
# # A tibble: 2 x 3
# ID_s Sim.Y1 Sim.Y2
# <int> <dbl> <dbl>
# 1 1 0.494 -0.0522
# 2 2 -0.104 -0.370
A custom suffix can also be supplied through a named list:
dat %>% dplyr::group_by(ID_s) %>% dplyr::summarise_at(simIDs, list(m=mean))
# # A tibble: 2 x 3
# ID_s Sim.Y1_m Sim.Y2_m <--- Note the _m suffix
# <int> <dbl> <dbl>
# 1 1 0.494 -0.0522
# 2 2 -0.104 -0.370
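If you are on dplyr >= 1.0, the same result can be written with across(); this is a sketch of the modern equivalent (not part of the original answer), reusing the simIDs vector built above:
dat %>% dplyr::group_by(ID_s) %>%
  dplyr::summarise(dplyr::across(dplyr::all_of(simIDs), mean, .names = "{.col}_m"))
# gives Sim.Y1_m and Sim.Y2_m columns, matching the _m suffix above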
First, you have to use seq_along() if you want to index your vector with s.
Second, you are missing sym().
This should work:
simVec <- c('Sim.Y1.cor', 'Sim.Y3.cor')
for (s in seq_along(simVec)) {
  simRef <- simVec[s]
  simID <- unlist(strsplit(simRef, split = '.cor', fixed = TRUE))[1]
  # hard-coded column (works, but only for Sim.Y1)
  dat %>% dplyr::group_by(ID_s) %>%
    dplyr::summarise(meanMod = mean(Sim.Y1))
  # programmatic column: sym() turns the string into a symbol, !! unquotes it
  dat %>% dplyr::group_by(ID_s) %>%
    dplyr::summarise(meanMod = mean(!!rlang::sym(simID)))
}
Edit: typo fixed.
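As an alternative, a minimal sketch using the .data pronoun, which avoids sym()/!! entirely (this is an addition, not part of the answer above; it assumes dplyr/rlang are installed):
for (simRef in simVec) {
  # strip the '.cor' suffix to recover the column name
  simID <- sub('.cor', '', simRef, fixed = TRUE)
  dat %>% dplyr::group_by(ID_s) %>%
    dplyr::summarise(meanMod = mean(.data[[simID]])) %>%
    print()
}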
Try this:
library(dplyr)
dat %>% group_by(ID) %>%
  summarise(mean_y1 = mean(Sim.Y1),
            mean_y2 = mean(Sim.Y2),
            mean_y3 = mean(Sim.Y3),
            mean_obsY = mean(obsY))
I understand the question to be: how do you get a column without referencing the column name, i.e. using its index instead.
Let me know if my understanding is incorrect.
If it is correct, I believe the easiest way would be as below.
> df1 <- data.frame(ID_s=c('a','b','c'),Val=c('a1','b1','c1'))
> df1
ID_s Val
1 a a1
2 b b1
3 c c1
> df1[,1]
[1] a b c
Levels: a b c
If you want to save that as a data frame, this can be extended as below:
cc <- data.frame(ID_s=df1[,1])
Hope this helps!
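If the goal is positional selection inside a dplyr pipeline specifically, a minimal sketch (assuming dplyr is attached; not part of the answer above):
library(dplyr)
df1 %>% pull(1)    # first column as a vector
df1 %>% select(1)  # first column, still a data frame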
Related
Supposing a dataframe like this:
# example dataset
df <- data.frame(
rowid = 1:3,
a = c("ax","cz","by"),
b = c("cy","ax","bz"),
c = c("bz","ay","cx")
)
What would an efficient approach be to achieving the following transformation?
#> # A tibble: 3 x 4
#> rowid a b c
#> <int> <chr> <chr> <chr>
#> 1 x z y
#> 2 x y z
#> 3 y z x
The goal is to take the second character of each bigram and sort it into columns picked-out by the first character, for each row.
If possible, it would be useful to compare base R and Tidyverse solutions.
Since you were looking for a comparison of base and Tidyverse, I'll chime in with a base solution:
tdf <- t(df[-1])
tdf[] <- substr(tdf, 2, 2)[order(col(tdf), tdf)]
df[-1] <- t(tdf)
# rowid a b c
#1 1 x z y
#2 2 x y z
#3 3 y z x
To explain the 3 steps:
1) take a copy of a t()ransposed version of the data.
2a) get the order, within each row (col() now, since it's transposed), of each string (which implicitly sorts by the first letter).
2b) use this order to select from the second letter of each string and overwrite (<-) the transposed data.
3) t()ranspose back to the original structure and overwrite (<-) the data in df.
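To see the trick on the example data, a short illustration (for exposition only):
tdf <- t(df[-1])
col(tdf)                                  # column index of each cell, i.e. the original row
order(col(tdf), tdf)                      # within each original row, positions sorted by bigram
substr(tdf, 2, 2)[order(col(tdf), tdf)]   # second letters in that sorted order
# [1] "x" "z" "y" "x" "y" "z" "y" "z" "x"  <- written back column-major into tdf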
Benchmark on 30K rows
Base:
bigdf <- df[rep(1:3,10000),]
bigdf$rowid <- 1:30000
system.time({
tdf <- t(bigdf[-1])
tdf[] <- substr(tdf,2,2)[order(col(tdf), tdf)]
bigdf[-1] <- t(tdf)
})
## user system elapsed
## 0.023 0.000 0.023
Tidy:
bigdf <- df[rep(1:3,10000),]
bigdf$rowid <- 1:30000
library(dplyr)
library(tibble)
library(sjmisc)
library(stringr)
system.time({
bigdf %>%
rotate_df(cn=TRUE) %>%
mutate(across(everything(),sort)) %>%
rotate_df() %>%
mutate(across(everything(),~str_sub(.,2,-1))) %>%
rownames_to_column(var="rowid")
})
## user system elapsed
## 21.177 0.047 21.244
A Tidyverse solution partially inspired by this recent post using rotate_df() from the sjmisc package: https://stackoverflow.com/a/70682560/8068516
df <- data.frame(
rowid = 1:3,
a = c("ax","cz","by"),
b = c("cy","ax","bz"),
c = c("bz","ay","cx")
)
library(dplyr)
library(tibble)
library(stringr)
library(sjmisc)
df %>%
# transpose the dataframe keeping column names
rotate_df(cn=TRUE) %>%
# sort columns by first character
mutate(across(everything(),sort)) %>%
# transpose back
rotate_df() %>%
# remove first character from each string
mutate(across(everything(),~str_sub(.,2,-1))) %>%
# make `rowid` column
rownames_to_column(var="rowid")
The data frame can optionally be turned into a tibble with as_tibble() to exactly match the target output, giving:
#> # A tibble: 3 x 4
#> rowid a b c
#> <int> <chr> <chr> <chr>
#> 1 x z y
#> 2 x y z
#> 3 y z x
This solution will generalise to n columns and is %>%-compatible.
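For completeness, a tidyr-only alternative that avoids sjmisc; a sketch, assuming each row's first characters cover every column name exactly once, as in the example (this is not taken from the answers above):
library(dplyr)
library(tidyr)
library(stringr)
df %>%
  pivot_longer(-rowid, names_to = "source_col", values_to = "bigram") %>%
  mutate(target_col = str_sub(bigram, 1, 1),   # first character picks the destination column
         value = str_sub(bigram, 2, 2)) %>%    # second character is the value to keep
  select(rowid, target_col, value) %>%
  pivot_wider(names_from = target_col, values_from = value) %>%
  select(rowid, a, b, c)                       # restore the original column order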
I have a table with columns
[Time, var1, var2, var3, var4...varN]
I need to calculate the mean/SE per Time for each of var1, var2, ..., varN, and I want to do this programmatically for all variables rather than one at a time, which would involve a lot of copy-pasting.
Section 8.2.3 here https://tidyeval.tidyverse.org/dplyr.html is close to what I want, but my code below:
x <- data.frame(time = c(1, 4), var1 = c(2, 5), var2 = c(3, 6))
grouped_mean3 <- function(.data, ...) {
  print(.data)
  summary_vars <- enquos(...)
  print(summary_vars)
  summary_vars <- purrr::map(summary_vars, function(var) {
    expr(mean(!!var, na.rm = TRUE))
  })
  print(summary_vars)
  .data %>%
    group_by(time)
  summarise(!!!summary_vars) # Unquote-splice the list
}
grouped_mean3(x, var("var1"), var("var2"))
Yields
Error in !summary_vars : invalid argument type
The original cause was "Must group by variables found in .data.", where it looked for a column that isn't in the dummy x I generated for testing. I have no idea what's happening, sadly.
How do I actually extract the mean from the new summary_vars and add it to the .data table? summary_vars becomes something like
[[1]]
mean(~var1, na.rm = TRUE)
[[2]]
mean(~var2, na.rm = TRUE)
Which seems close, but needs evaluation. How do I evaluate this? !!! wasn't working.
For what it's worth, I tried plugging the example in dplyr into this R engine https://rdrr.io/cran/dplyr/man/starwars.html and it didn't work either.
Help?
End goal would be a table along the lines of
[Time, var1mean, var2mean, var3mean, var4mean...]
Try this:
library(dplyr)
grouped_mean3 <- function(.data, ...) {
  vars <- c(...)
  .data %>%
    group_by(time) %>%
    summarise(across(all_of(vars), mean, .names = "{.col}mean"))
}
grouped_mean3(x, 'var1')
# time var1mean
# <dbl> <dbl>
#1 1 2
#2 4 5
grouped_mean3(x, 'var1', 'var2')
# time var1mean var2mean
# <dbl> <dbl> <dbl>
#1 1 2 3
#2 4 5 6
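Since the question also asks for the standard error, the same pattern extends to several summary functions via a named list in across(); a sketch assuming dplyr >= 1.0 is loaded as above (the se() helper is my own addition, not part of the answer):
se <- function(v) sd(v, na.rm = TRUE) / sqrt(sum(!is.na(v)))
grouped_stats <- function(.data, ...) {
  vars <- c(...)
  .data %>%
    group_by(time) %>%
    summarise(across(all_of(vars), list(mean = mean, se = se),
                     .names = "{.col}_{.fn}"))
}
grouped_stats(x, 'var1', 'var2')
# columns: time, var1_mean, var1_se, var2_mean, var2_se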
Perhaps this is what you are looking for?
x %>%
  group_by(time) %>%
  summarise_at(vars(starts_with('var')), ~ mean(., na.rm = TRUE)) %>%
  rename_at(vars(starts_with('var')), ~ paste0(., "mean")) %>%
  merge(x)
With the data from your question, the output is:
time var1mean var2mean var1 var2
1 1 2 3 2 3
2 4 5 6 5 6
I am writing a function that computes the mean of a variable according to some grouping (g1 and g2). I would like the function to take care of the case when the user just wants to compute the mean across the groups, so the group argument will be empty.
I want a solution using tidyverse.
Suppose the following:
y = 1:4
g1 = c('a', 'a', 'b', 'b')
g2 = c(1,2,1,2)
MyData = data.frame(g1, g2, y)
library(dplyr)

MyFun = function(group) {
  group_sym = syms(group)
  MyData %>%
    group_by(!!!group_sym) %>%
    summarise(mean = mean(y))
}
# this works well
MyFun(group = c('g1', 'g2'))
Now suppose I want the mean of y across all groups. I would like the function to be able to handle something like
MyFun(group = '')
or
MyFun(group = NULL)
So ideally I would like the group argument to be empty / null and thus MyData would not be grouped. One solution could be to add a condition at the beginning of the function checking if the argument is empty and if TRUE write summarise without group_by. But this is not elegant and my real code is much longer than just a few lines.
Any idea?
1) Use {{...}} and use g1 in place of 'g1':
MyFun = function(group) {
  MyData %>%
    group_by({{group}}) %>%
    summarise(mean = mean(y)) %>%
    ungroup
}
MyFun(g1)
## # A tibble: 2 x 2
## g1 mean
## <fct> <dbl>
## 1 a 1.5
## 2 b 3.5
MyFun()
## # A tibble: 1 x 1
## mean
## <dbl>
## 1 2.5
2) This approach uses 'g1' as in the question.
MyFun = function(group) {
  group <- if (missing(group)) 'All' else sym(group)
  MyData %>%
    group_by(!!group) %>%
    summarise(mean = mean(y)) %>%
    ungroup
}
MyFun('g1')
## # A tibble: 2 x 2
## g1 mean
## <fct> <dbl>
## 1 a 1.5
## 2 b 3.5
MyFun()
## # A tibble: 1 x 2
## `"All"` mean
## <chr> <dbl>
## 1 All 2.5
3) This also works and gives the same output as (2).
MyFun = function(...) {
  group <- if (...length()) syms(...) else 'All'
  MyData %>%
    group_by(!!!group) %>%
    summarise(mean = mean(y)) %>%
    ungroup
}
MyFun('g1')
MyFun()
A different approach consists of creating a fake group (named 'across_group') in the data when group is missing.
MyFun = function(group) {
  if (missing(group)) MyData$across_group = 1
  group <- if (missing(group)) syms('across_group') else syms(group)
  MyData %>%
    group_by(!!!group) %>%
    summarise(mean = mean(y)) %>%
    ungroup
}
MyFun()
# A tibble: 1 x 2
across_group mean
<dbl> <dbl>
1 1 2.5
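Yet another option (my own addition, not taken from the answers above) is to accept the groups as a character vector and lean on group_by(across(all_of(...))), which handles an empty vector without any special-casing (assumes dplyr >= 1.0):
MyFun = function(group = character()) {
  MyData %>%
    group_by(across(all_of(group))) %>%   # no columns selected -> no grouping
    summarise(mean = mean(y), .groups = "drop")
}
MyFun(c('g1', 'g2'))  # grouped means
MyFun()               # overall mean, no grouping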
I've a dataset with 18 columns from which I need to return the column names with the highest value(s) for each observation; a simple example is below. I came across this answer, and it almost does what I need, but in some cases I need to combine the names (like ab in maxcol below). How should I do this?
Any suggestions would be greatly appreciated! If possible, a tidyverse-based solution would be easier for me to understand, as I'm more familiar with that than with base.
Edit: I forgot to mention that some of the columns in my data have NAs.
library(dplyr, warn.conflicts = FALSE)
#turn this
Df <- tibble(a = 4:2, b = 4:6, c = 3:5)
#into this
Df <- tibble(a = 4:2, b = 4:6, c = 3:5, maxcol = c("ab", "b", "b"))
Created on 2018-10-30 by the reprex package (v0.2.1)
Continuing from the answer in the linked post, we can do
Df$maxcol <- apply(Df, 1, function(x) paste0(names(Df)[x == max(x)], collapse = ""))
Df
# a b c maxcol
# <int> <int> <int> <chr>
#1 4 4 3 ab
#2 3 5 4 b
#3 2 6 5 b
For every row, we check which position has max values and paste the names at that position together.
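If some columns contain NA (as mentioned in the edit), a possible adjustment is to ignore them when taking the row maximum; a sketch of my own, assuming the numeric columns are a, b and c as in the example:
Df$maxcol <- apply(Df[c("a", "b", "c")], 1, function(x)
  paste0(names(x)[!is.na(x) & x == max(x, na.rm = TRUE)], collapse = ""))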
If you prefer the tidyverse approach
library(tidyverse)
Df %>%
mutate(row = row_number()) %>%
gather(values, key, -row) %>%
group_by(row) %>%
mutate(maxcol = paste0(values[key == max(key)], collapse = "")) %>%
spread(values, key) %>%
ungroup() %>%
select(-row)
# maxcol a b c
# <chr> <int> <int> <int>
#1 ab 4 4 3
#2 b 3 5 4
#3 b 2 6 5
We first convert the dataframe from wide to long using gather; then, grouping by each row, we paste together the column names where the value (key) equals the maximum; finally we spread the long dataframe back to wide.
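The same idea with the newer pivot_longer()/pivot_wider() verbs (a sketch of my own, assuming tidyr >= 1.0 is available via the tidyverse; not part of the original answer):
Df %>%
  mutate(row = row_number()) %>%
  pivot_longer(-row, names_to = "col", values_to = "value") %>%
  group_by(row) %>%
  mutate(maxcol = paste0(col[value == max(value)], collapse = "")) %>%
  ungroup() %>%
  pivot_wider(names_from = col, values_from = value) %>%
  select(-row)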
Here's a solution I found that loops through column names, in case you find it hard to wrap your head around spread/gather (pivot_wider/pivot_longer).
out_df <- Df %>%
  # calculate rowwise maximum
  rowwise() %>%
  mutate(rowmax = max(across())) %>%
  # create empty maxcol column
  mutate(maxcol = "")

# loop through column names
for (colname in colnames(Df)) {
  out_df <- out_df %>%
    # if the value at the specified column name equals the row maximum, append the name to maxcol
    mutate(maxcol = ifelse(.data[[colname]] == rowmax, paste0(maxcol, colname), maxcol))
}

# remove rowmax column if no longer needed
out_df <- out_df %>%
  select(-rowmax)
I am kind of new to R and programming in general. I am currently struggling with a piece of code for data transformation and hope someone can take a little bit of time to help me.
Below is a reproducible example:
# Data
a <- c(rnorm(12, 20))
b <- c(rnorm(12, 25))
f1 <- rep(c("X","Y","Z"), each=4) #family
f2 <- rep(x = c(0,1,50,100), 3) #reference and test levels
dt <- data.frame(f1=factor(f1), f2=factor(f2), a,b)
#library loading
library(tidyverse)
Goal: compute all values (a, b) relative to a reference value. The calculation should be a/a_ref, with a_ref = a where f2 = 0, within each family (f1 can be X, Y or Z).
I tried to solve this by using this code :
test <- filter(dt, f2!=0) %>% group_by(f1) %>%
mutate("a/a_ref"=a/(filter(dt, f2==0) %>% group_by(f1) %>% distinct(a) %>% pull))
I get the following (test results screenshot):
As you can see, a is divided by a_ref, but my script seems to recycle the reference values (a_ref) regardless of the family f1.
Do you have any suggestion so that a is computed with respect to the family (f1)?
Thank you for reading !
EDIT
I found a way to do it 'manually':
filter(dt, f1=="X") %>% mutate("a/a_ref"=a/(filter(dt, f1=="X" & f2==0) %>% distinct(a) %>% pull()))
f1 f2 a b a/a_ref
1 X 0 21.77605 24.53115 1.0000000
2 X 1 20.17327 24.02512 0.9263973
3 X 50 19.81482 25.58103 0.9099366
4 X 100 19.90205 24.66322 0.9139422
The problem is that I'd have to update the code for each variable and family, so this is not a clean way to do it.
# use this to reproduce the same dataset and results
set.seed(5)
# Data
a <- c(rnorm(12, 20))
b <- c(rnorm(12, 25))
f1 <- rep(c("X","Y","Z"), each=4) #family
f2 <- rep(x = c(0,1,50,100), 3) #reference and test levels
dt <- data.frame(f1=factor(f1), f2=factor(f2), a,b)
#library loading
library(tidyverse)
dt %>%
group_by(f1) %>% # for each f1 value
mutate(a_ref = a[f2 == 0], # get the a_ref and add it in each row
"a/a_ref" = a/a_ref) %>% # divide a and a_ref
ungroup() %>% # forget the grouping
filter(f2 != 0) # remove rows where f2 == 0
# # A tibble: 9 x 6
# f1 f2 a b a_ref `a/a_ref`
# <fctr> <fctr> <dbl> <dbl> <dbl> <dbl>
# 1 X 1 21.38436 24.84247 19.15914 1.1161437
# 2 X 50 18.74451 23.92824 19.15914 0.9783583
# 3 X 100 20.07014 24.86101 19.15914 1.0475490
# 4 Y 1 19.39709 22.81603 21.71144 0.8934042
# 5 Y 50 19.52783 25.24082 21.71144 0.8994260
# 6 Y 100 19.36463 24.74064 21.71144 0.8919090
# 7 Z 1 20.13811 25.94187 19.71423 1.0215013
# 8 Z 50 21.22763 26.46796 19.71423 1.0767671
# 9 Z 100 19.19822 25.70676 19.71423 0.9738257
You can do this for more than one variable using:
dt %>%
group_by(f1) %>%
mutate_at(vars(a:b), funs(./.[f2 == 0])) %>%
ungroup()
Or generally use vars(a:z) to use all variables between a and z as long as they are one after the other in your dataset.
Another solution could be using mutate_if like:
dt %>%
group_by(f1) %>%
mutate_if(is.numeric, funs(./.[f2 == 0])) %>%
ungroup()
Here the function is applied to all numeric variables. The variables f1 and f2 are factors, so they are simply excluded.
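With dplyr >= 1.0 the scoped verbs above are superseded; the same transformation can be sketched with across() (an equivalent of my own, not part of the original answer):
dt %>%
  group_by(f1) %>%
  mutate(across(where(is.numeric), ~ .x / .x[f2 == 0])) %>%  # divide each numeric column by its f2 == 0 reference
  ungroup()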