dplyr 0.7 - Specify grouping variable as string - r

I have some variable names specified as strings (e.g. input from a Shiny app) and I would like to use them in my dplyr and ggplot2 code as if they were variables.
I got it to work by trial and error, but I feel like there must be a better way. What is a better way to perform these operations?
library(rlang)
library(ggplot2)
library(dplyr)
# Column name held as a string (e.g. a value coming from a Shiny input).
someString <- "g1"
# Example table: two grouping columns (g1, g2) and two shuffled value columns.
df <- tibble(
  g1 = c(1, 1, 2, 2, 2),
  g2 = c(1, 2, 1, 2, 1),
  a = sample(5),
  b = sample(5)
)
# Print the quosure `group_var`, group `df` by the column it refers to, and
# return the per-group mean of column `a`.
my_summarise <- function(df, group_var) {
  print(group_var)
  grouped <- group_by(df, !!group_var)
  summarise(grouped, a = mean(a))
}
# Bar chart of the per-group mean of `a`. `group_var` is a quosure naming the
# grouping column; its name is also used for the x aesthetic.
my_plot <- function(df, group_var) {
  print(group_var)
  means <- df %>%
    group_by(!!group_var) %>%
    summarise(a = mean(a))
  x_name <- quo_name(group_var)
  ggplot(data = means, aes_string(x = x_name, y = "a")) +
    geom_bar(stat = "identity")
}
# Turn the string in `someString` into a symbol, wrap it in a quosure, and
# pass that quosure to both helpers.
my_summarise(df, quo(UQ(sym(someString))))
my_plot(df, quo(UQ(sym(someString))))

Either of these options is probably simpler:
# Per-group mean of `a`, where `group_var` is the grouping column's name given
# as a string.
my_summarise <- function(df, group_var) {
  print(group_var)
  # Either works:
  #   group_by_at(.vars = group_var)
  #   group_by(!!sym(group_var))
  grouped <- group_by_at(df, .vars = group_var)
  summarise(grouped, a = mean(a))
}
my_summarise(df, someString)
# Bar chart of the per-group mean of `a`; `group_var` is the grouping column's
# name as a string and is also used for the x aesthetic.
my_plot <- function(df, group_var) {
  print(group_var)
  # Alternative grouping call: group_by(!!sym(group_var))
  means <- df %>%
    group_by_at(.vars = group_var) %>%
    summarise(a = mean(a))
  ggplot(data = means, aes_string(x = group_var, y = "a")) +
    geom_bar(stat = "identity")
}
my_plot(df, someString)
...where you could use either group_by or group_by_at.

What about calling with my_summarise(df, as.name(someString))?

Related

Passing variables to ggpubr from function call

I am looking to wrap the following formula into a function for easier end use:
# Mean of `c` for every (a, b) combination, plotted with ggline and coloured by `b`.
df %>%
  group_by(a, b) %>%
  summarize(avg = mean(c)) %>%
  ggline(x = "a", y = "avg", color = "b")
however the following returns the error "Error in is.factor(x) : object 'b' not found" even though is.factor(df$b) == TRUE
# Attempted wrapper from the question; it fails with the "object 'b' not found"
# error quoted in the surrounding text.
graph_var <- function(data_source, var) {
# Capture the bare column name so it can be unquoted inside group_by().
var2 <- enquo(var)
data_source %>%
group_by(a, !!var2 )%>%
summarize(avg=mean(c)) %>%
# NOTE(review): shQuote(var) evaluates `var` itself instead of using the
# captured name - presumably where the error comes from; confirm.
ggline(x="a", y="avg", color=shQuote(var) )+ grids(linetype = 'dashed')
}
graph_var(df, b)
I'm sure the issue lies somewhere around ggpubr using quotes in its arguments, but I can't track down exactly what I need to do to get this to work.
For reproducibility:
library(tidyverse)
library(ggpubr)
# Reproducible example data: 30 rows, `a` cycling through 1..10 three times,
# `b` a three-level factor (X, Y, Z) and `c` standard-normal noise.
set.seed(13)
df <- data.frame(
  a = rep(1:10, times = 3),
  b = as.factor(rep(LETTERS[24:26], each = 10)),
  c = rnorm(30)
)
# Explicit declaration - this works
df %>%
  group_by(a, b) %>%
  summarize(avg = mean(c)) %>%
  ggline(x = "a", y = "avg", color = "b") +
  grids(linetype = "dashed") # works
# Declared via a variable - this also works
test_var <- "b"
df %>%
  group_by(a, b) %>%
  summarize(avg = mean(c)) %>%
  ggline(x = "a", y = "avg", color = test_var) +
  grids(linetype = "dashed") # also works
# Declared via f(x) - yields error "Error in is.factor(x) : object 'b' not found"
graph_var_ex <- function(data_source, var) {
# Capture the bare column name so it can be unquoted inside group_by().
var2 <- enquo(var)
data_source %>%
group_by(a, !!var2 )%>%
summarize(avg=mean(c)) %>%
# NOTE(review): shQuote(var) evaluates `var` itself instead of using the
# captured name - presumably where the error comes from; confirm.
ggline(x="a", y="avg", color=shQuote(var) )+ grids(linetype = 'dashed')
}
graph_var_ex(df, b)
Try as.character(ensym(var)).
Other notes:
Your function included y="c" in the ggline() call, but this should be y="avg" since "c" no longer exists after your summarize().
You can use the {{ embracing operator as a shortcut for !!enquo() when passing var to group_by().
library(dplyr)
library(ggpubr)
# Example data: 30 rows, `a` cycling through 1..10 three times, `b` holding the
# letters X, Y, Z in blocks of ten, and `c` standard-normal noise.
set.seed(13)
df <- data.frame(
  a = rep(1:10, times = 3),
  b = rep(LETTERS[24:26], each = 10),
  c = rnorm(30)
)
# Average `c` per (a, var) combination and draw it with ggline, coloured by
# `var`. `var` is passed as a bare column name.
graph_var <- function(data_source, var) {
  # ggline() takes column names as strings, so convert the bare name once.
  colour_col <- as.character(ensym(var))
  averaged <- data_source %>%
    group_by(a, {{ var }}) %>%
    summarize(avg = mean(c))
  ggline(averaged, x = "a", y = "avg", color = colour_col) +
    grids(linetype = "dashed")
}
graph_var(df, b)

Vectorization to extract and bind very nested data

I have some very nested data. Within my list-column-dataframes, there are some pieces I need to put together and I've done so in a single instance to get my desired dataframe:
# Take the `data` and `coords` slots of the second element's `result` object
# and bind them side by side.
a <- df[[2]][["result"]]@data
b <- df[[2]][["result"]]@coords
desired_df <- cbind(a, b)
My original Large list has 171 elements, meaning I have 1:171 (3.3 GB) to go inside those square brackets and would ideally end up with 171 desired dataframes (which I would then bind all together).
I haven't needed to write a loop in 10 years, but I don't see a tidyverse way to deal with this. I also no longer know how to write loops. There are definitely some elements in there that are junk and will fail.
You haven't provided any sort of minimal example of the data.
I've condensed it to mean something like this
# Data frame and matrix used as the default slot contents of the class below.
base_data <- data.frame(
  group = c("a", "b", "c"),
  var1 = c(3, 1, 2),
  var2 = c(2, 4, 8)
)
base_data2 <- matrix(as.numeric(1:9), nrow = 3, ncol = 3, byrow = TRUE)
rownames(base_data2) <- c("d", "e", "f")
# S4 class with a data-frame slot and a matrix slot, defaulting to the objects above.
methods::setClass(
  "weird_object",
  slots = c(data = "data.frame", coords = "matrix"),
  prototype = list(data = base_data, coords = base_data2)
)
# Four list elements, each holding a default instance under the name `result`.
df <- lapply(seq_len(4), function(i) list(result = new("weird_object")))
And if I had such a list with these objects, then I could do
# For every list element, pull the `data` and `coords` slots out of `result`,
# then collect the results into a tibble via enframe() and unnest_wider().
df %>%
  map(. %>% {
    list(
      data = .$result@data,
      coords = .$result@coords
    )
  }) %>%
  enframe() %>%
  unnest_wider(value)
But the selecting / hoisting function might fail, thus
one can wrap it in a purrr::possibly, and
choose a reasonable default:
# Same extraction, wrapped in purrr::possibly() so that an element whose
# extraction fails yields the NA placeholders instead of stopping the map.
df %>%
  map(possibly(
    . %>% {
      list(
        data = .$result@data,
        coords = .$result@coords
      )
    },
    otherwise = list(data = NA, coords = NA)
  )) %>%
  enframe() %>%
  unnest_wider(value)
Hopefully, this could be a step forward.
Next step is probably something resembling this:
# Extract the `data` and `coords` slots of every element, unnest them side by
# side, then reshape the combined table.
df %>%
  map(. %>% {
    list(
      data = .$result@data,
      coords = .$result@coords
    )
  }) %>%
  enframe() %>%
  unnest_wider(value) %>%
  # Turn each coords matrix into a tibble, keeping its row names as `rowid`.
  mutate(coords = coords %>% map(. %>% as_tibble(rownames = "rowid"))) %>%
  unnest(cols = c(data, coords)) %>%
  # rotating the thing now
  pivot_longer(
    cols = c(group, rowid),
    names_to = "var_name",
    values_to = "var"
  ) %>%
  select(-var_name) %>%
  pivot_longer(
    cols = c(var1, var2, V1, V2, V3),
    names_to = "var_name"
  ) %>%
  pivot_wider(names_from = var, values_from = value) %>%
  identity()
If I understand your data structure, which I probably don't, you could do:
library(tidyverse)
# Create dummy data
df <- mtcars
# Overwrite `mpg` with nested list content so that a "result" -> "#data"
# path exists to extract from.
df$mpg <- list(result = I(list('test')))
df$mpg$result <- list("#data" = I(list('your data')))
# Keep only the nested column and `cyl`, then make two more copies.
df <- df %>% select(mpg, cyl)
df1 <- df
df2 <- df
# Pull the data you're interested in.
# The index is 1 here, instead of 2, because it's fake data and not your data.
# Assuming the # is not unique, and is just parsed from JSON or some other format.
# Returns the "#data" entry under "result" in the first element of `x`.
dont_at_me <- function(x) {
  x[[1]][["result"]][["#data"]]
}
# Get a list of all of your data.frames: fetch every object listed by ls()
# and keep only those that are data frames (named by their variable names).
all_dfs <- Filter(function(x) is(x, "data.frame"), mget(ls()))
# Vectorize: apply dont_at_me() to each collected data frame.
purrr::map(all_dfs, ~dont_at_me(.))

How to use tidy evaluation with column name as strings?

I've read most of the documentation about tidy evaluation and programming with dplyr but cannot get my head around this (simple) problem.
I want to program with dplyr and give column names as strings as input to the function.
# Example table: two grouping columns (g1, g2) and two shuffled value columns.
df <- tibble(
  g1 = c(1, 1, 2, 2, 2),
  g2 = c(1, 2, 1, 2, 1),
  a = sample(5),
  b = sample(5)
)
# Attempt from the question: `group_var` is handed to group_by() as-is, and
# the text right after this snippet reports the resulting
# "Column 'group_var' is unknown" error.
my_summarise <- function(df, group_var) {
df %>%
group_by(group_var) %>%
summarise(a = mean(a))
}
my_summarise(df, 'g1')
This gives me Error : Column 'group_var' is unknown.
What must I change inside the my_summarise function in order to make this work?
We can use also ensym with !!
# Per-group mean of `a`. ensym() turns the supplied `group_var` (here a
# string) into a symbol, which is then unquoted inside group_by().
my_summarise <- function(df, group_var) {
  grouping <- rlang::ensym(group_var)
  df %>%
    group_by(!!grouping) %>%
    summarise(a = mean(a))
}
my_summarise(df, "g1")
Or another option is group_by_at
# Per-group mean of `a`, grouping through group_by_at() with the column name
# held in `group_var`.
my_summarise <- function(df, group_var) {
  grouped <- group_by_at(df, vars(group_var))
  summarise(grouped, a = mean(a))
}
my_summarise(df, "g1")
Convert the string column name to a bare column name using as.name() and then use the new {{}} (read Curly-Curly) operator as below:
library(dplyr)
# Example table: two grouping columns (g1, g2) and two shuffled value columns.
df <- tibble(
  g1 = c(1, 1, 2, 2, 2),
  g2 = c(1, 2, 1, 2, 1),
  a = sample(5),
  b = sample(5)
)
# Per-group mean of `a`. The string in `group_var` is first converted to a
# name with as.name() and then embraced inside group_by().
my_summarise <- function(df, group_var) {
  grp_sym <- as.name(group_var)
  grouped <- group_by(df, {{ grp_sym }})
  summarise(grouped, a = mean(a))
}
my_summarise(df, "g1")
You can also use sym and !!
# Per-group mean of `a`; the string in `group_var` becomes a symbol via sym()
# and is unquoted with !! inside group_by().
my_summarise <- function(df, group_var) {
  grouped <- group_by(df, !!sym(group_var))
  summarise(grouped, a = mean(a))
}
my_summarise(df, "g1")
# A tibble: 2 x 2
g1 a
<dbl> <dbl>
1 1 3.5
2 2 2.67

Can't use !!arg in dplyr for mutate call

I can use !! to filter by a user-given variable but not to modify that same variable. The following function throws an error when created, but it works just fine if I delete the mutate call.
# Version from the question: reported to throw an error when the function is
# created, and to work once the mutate() call is deleted.
avg_dims <- function(x, y) {
# Capture the bare column name passed as `y`.
y <- enquo(y)
x %>%
filter(!!y != "TOTAL") %>%
# This is the line the question is about: `!!y` on the left of `=`.
mutate(!!y = "MEAN") %>%
group_by(var1, var2)
}
The naming of the column on the lhs of assignment goes along with the assignment operator (:=) instead of the = operator. Also, the names should be either string or symbol. So, we can convert the quosure ('y' from enquo) to string (quo_name) and then do the evaluation (!!)
# Drop rows where column `y` equals "TOTAL", overwrite that column with "MEAN",
# and group the result by var1 and var2. `y` is a bare column name.
avg_dims <- function(x, y) {
  y_quo <- enquo(y)
  # The left-hand side of := needs a string (or symbol), not a quosure.
  y_name <- rlang::quo_name(y_quo)
  x %>%
    filter(!!y_quo != "TOTAL") %>%
    mutate(!!y_name := "MEAN") %>%
    group_by(var1, var2)
}
avg_dims(df1, varN)
data
# Example data for avg_dims(): two grouping columns plus a character column
# `varN` sampled from "TOTAL", "hello" and "bc".
set.seed(24)
df1 <- data.frame(
  var1 = rep(LETTERS[1:3], each = 4),
  var2 = rep(letters[1:2], each = 6),
  varN = sample(c("TOTAL", "hello", "bc"), 12, replace = TRUE),
  stringsAsFactors = FALSE
)

Function with dplyr, tidyr and ggplot

How can I make a function that takes a column and uses that in dplyr, tidyr and ggplot?
# Example data: a date-like index, a two-level category and random values.
df <- data.frame(
  date_col = c(1, 1, 2, 2, 3, 4, 4, 5, 5),
  col_a = c("a", "b", "a", "b", "a", "a", "b", "a", "b"),
  val_col = runif(9)
)
How do I write a function that takes a parameter param_col instead of the hardcoded col_a?
# Hard-coded version: sum val_col per (date_col, col_a), fill in the missing
# combinations, and draw one line per col_a value.
df %>%
  group_by(date_col, col_a) %>%
  summarise(val_col = sum(val_col)) %>%
  complete(col_a, date_col) %>%
  ggplot(aes(date_col, val_col, color = col_a)) +
  geom_line()
The dplyr and ggplot calls work in the code outlined below. But how should the complete call be written? Or should complete_ be used?
Is there a more canonical way of doing this?
# Draft from the question: line chart of summed val_col over date_col, with the
# colour column passed as the bare name `param_col`. The complete() call still
# contains a placeholder, so this snippet is not runnable as written.
plot_nice_chart <- function(df, param_col) {
# Quosure form of the column for dplyr, string form for aes_string().
enq_param_col <- enquo(param_col)
str_param_col <- deparse(substitute(param_col))
# aggregate data based on group_by_col,
# explicitly fill in NA's for missing to avoid interpolation
df %>%
group_by(!!enq_param_col, date_col) %>%
summarise(val_col = sum(val_col)) %>%
complete(<what-should-be-here?>, date_col) %>%
ggplot(aes_string("date_col", "val_col", color = str_param_col)) +
geom_line()
}
The development version of tidyr, tidyr_0.6.3.9000, now uses tidyeval, so if you want to update to that you could use !! as you did in group_by.
# Line chart of summed val_col over date_col, coloured by the column passed as
# the bare name `param_col`. Missing (param_col, date_col) combinations are
# filled in by complete() before plotting.
plot_nice_chart <- function(df, param_col) {
  # Quosure form of the column for dplyr/tidyr, string form for aes_string().
  enq_param_col <- enquo(param_col)
  str_param_col <- deparse(substitute(param_col))
  df %>%
    group_by(!!enq_param_col, date_col) %>%
    summarise(val_col = sum(val_col)) %>%
    # Remove the grouping left over from summarise() before calling complete().
    ungroup() %>%
    complete(!!enq_param_col, date_col) %>%
    ggplot(aes_string("date_col", "val_col", color = str_param_col)) +
    geom_line()
}
Using the current version, you can use complete_ with variables as strings.
# Line chart of summed val_col over date_col, coloured by the column passed as
# the bare name `param_col`; uses the string-based complete_() to fill in
# missing combinations.
plot_nice_chart <- function(df, param_col) {
  enq_param_col <- enquo(param_col)
  str_param_col <- deparse(substitute(param_col))
  totals <- df %>%
    group_by(!!enq_param_col, date_col) %>%
    summarise(val_col = sum(val_col)) %>%
    ungroup()
  completed <- complete_(totals, c(str_param_col, "date_col"))
  ggplot(completed, aes_string("date_col", "val_col", color = str_param_col)) +
    geom_line()
}

Resources