I want to do something like
# Mean Sepal.Length per Petal.Width, keeping the first row of each Species.
df1 <- iris %>%
  distinct(Species, .keep_all = TRUE) %>%
  group_by(Petal.Width) %>%
  summarise(Sepal.Length.mean1 = mean(Sepal.Length), .groups = "drop")

# Same summary, but keeping the first row of each Species / Petal.Width pair.
df2 <- iris %>%
  distinct(Species, Petal.Width, .keep_all = TRUE) %>%
  group_by(Petal.Width) %>%
  summarise(Sepal.Length.mean2 = mean(Sepal.Length), .groups = "drop")

# Keep only the Petal.Width values that occur in both summaries.
inner_join(df1, df2, by = "Petal.Width")
But this is tedious to read because of the repetition. Is it possible to do all in one pipe? I cannot recover the initial dataset after distinct() so I wonder if there's a replacement to that.
A possible solution is to create first a function and then use it inside pipes:
library(tidyverse)
# Mean of `var3` per `var2` group, computed after de-duplicating on `var1`.
#
# df   : data frame to summarise (defaults to iris).
# var1 : unquoted column passed to distinct() to drop duplicate rows.
# var2 : unquoted grouping column.
# var3 : unquoted column whose mean is computed.
# i    : suffix appended to the output column name ("<var3>.mean<i>").
#
# Returns an ungrouped tibble with one row per value of `var2`.
f <- function(df = iris, var1 = Species, var2 = Petal.Width,
              var3 = Sepal.Length, i)
{
  # `df` is an ordinary value, not a data-masked column, so it is used
  # directly; only the column arguments are embraced.
  df %>%
    distinct({{ var1 }}, .keep_all = TRUE) %>%
    group_by({{ var2 }}) %>%
    # Glue-style name injection builds "<var3>.mean<i>" without a manual
    # enquo() / quo_name() / str_c() round trip.
    summarise("{{ var3 }}.mean{i}" := mean({{ var3 }}), .groups = "drop")
}
inner_join(f(i = 1), f(i = 2), by = "Petal.Width")
#> # A tibble: 3 × 3
#> Petal.Width Sepal.Length.mean1 Sepal.Length.mean2
#> <dbl> <dbl> <dbl>
#> 1 0.2 5.1 5.1
#> 2 1.4 7 7
#> 3 2.5 6.3 6.3
A workaround would be to use an expression with {}
Here is the beginning of the solution
# Build both de-duplicated variants of iris inside one brace expression and
# return them as a list, so the rest of the pipe can map over them.
iris %>%
  {
    list(
      distinct(., Species, .keep_all = TRUE),
      distinct(., Species, Petal.Width, .keep_all = TRUE)
    )
  } %>%
  map(function(d) group_by(d, Petal.Width)) # SOLUTION TO BE COMPLETED
Related
I have a data frame where I want to sum column values with the same prefix to produce a new column. My current problem is that it's not taking into account my group_by variable and returning identical values. Is part of the problem the .cols variable I'm selecting in the across function?
Sample data
library(dplyr)
library(purrr)
# Reproducible sample data: two ids with pre/post measurements for two
# variables. The rnorm() calls stay in this order so the draws are unchanged.
set.seed(10)
dat <- data.frame(
  id = rep(1:2, 5),
  var1.pre = rnorm(10),
  var1.post = rnorm(10),
  var2.pre = rnorm(10),
  var2.post = rnorm(10)
)
dat <- mutate(dat, index = id)

# Prefixes shared by each pre/post column pair.
var_names <- c("var1", "var2")
What I've tried
# Build one summing function per prefix in var_names, named after the prefix.
sumfunction <- map(
var_names,
# The formula returns a function; `.x` is the current prefix.
~function(.){
# NOTE: the argument `.` is never used. Both columns are read from the full
# `dat`, not from the current group, so every group gets the same total.
# NOTE(review): glue() is called unqualified but no library(glue) is visible
# in this snippet -- confirm it is attached.
sum(dat[glue("{.x}.pre")], dat[glue("{.x}.post")], na.rm = TRUE)
}
) %>%
setNames(var_names)
# Apply every function in sumfunction to `index` within each id; the output
# columns are named after the functions ("{.fn}").
dat %>%
group_by(id) %>%
summarise(
across(
.cols = index,
.fns = sumfunction,
.names = "{.fn}"
)
) %>%
ungroup
Desired output
For this and similar problems I made the 'dplyover' package (it is not on CRAN). Here we can use dplyover::across2() to loop over two series of columns, first, all columns ending with "pre" and second all columns ending with "post". To get the names correct we can use .names = "{pre}" to get the common prefix of both series of columns.
library(dplyr)
library(dplyover) # https://timteafan.github.io/dplyover/
# Within each id, add up every variable's "pre" and "post" columns together.
dat %>%
  group_by(id) %>%
  summarise(
    across2(
      ends_with("pre"),
      ends_with("post"),
      ~ sum(c(.x, .y)),
      # "{pre}" names each result after the prefix shared by the column pair.
      .names = "{pre}"
    )
  )
#> # A tibble: 2 × 3
#> id var1 var2
#> <int> <dbl> <dbl>
#> 1 1 -2.32 -5.55
#> 2 2 1.11 -9.54
Created on 2022-12-14 with reprex v2.0.2
Whenever operations across multiple columns get complicated, we could pivot:
library(dplyr)
library(tidyr)
# Stack the pre/post columns so each variable becomes a single column
# (".value" keeps the prefix as the column name), then total per id.
dat %>%
  pivot_longer(
    cols = !c(id, index),
    names_to = c(".value", "name"),
    names_sep = "\\."
  ) %>%
  group_by(id) %>%
  summarise(across(c(var1, var2), sum))
id var1 var2
<int> <dbl> <dbl>
1 1 -2.32 -5.55
2 2 1.11 -9.54
I want to create a gt table where I see some metrics like number of observations, mean and median, and I want a column with its histogram. For this question I will use the iris dataset.
I have recently learned how to put a plot in a tibble using this code:
library(dplyr)
library(tidyr)
library(purrr)
library(gt)
# One row per iris measurement variable: observation count, mean, median and
# a histogram stored in the list-column `plots`.
my_tibble <- iris %>%
  pivot_longer(-Species, names_to = "Vars", values_to = "Values") %>%
  group_by(Vars) %>%
  summarise(
    obs = n(),
    mean = round(mean(Values), 2),
    median = round(median(Values), 2),
    # ggplot2 is not attached by this snippet's library() calls, so its
    # functions are namespace-qualified. cur_data() is the current group's
    # rows.
    plots = list(
      ggplot2::ggplot(cur_data(), ggplot2::aes(Values)) +
        ggplot2::geom_histogram()
    )
  )
Now I want to use the plots column to draw a histogram per variable, so I have tried this:
# Attempt: add an empty `ggplot` column and fill its cells with plot images.
my_tibble %>%
mutate(ggplot = NA) %>%
gt() %>%
text_transform(
locations = cells_body(vars(ggplot)),
fn = function(x) {
# NOTE(review): by the time fn runs, `.` is presumably the gt table piped into
# text_transform(), not my_tibble, so `.$plots` is likely NULL and map()
# returns an empty list -- consistent with "replacement has length zero".
map(.$plots,ggplot_image)
}
)
But it returns an error:
Error in body[[col]][stub_df$rownum_i %in% loc$rows] <- fn(body[[col]][stub_df$rownum_i %in% :
replacement has length zero
The gt table should be like this:
Any help will be greatly appreciated.
After reviewing the excellent ideas from @akrun and @TarJae, I have this solution that gives the required gt table:
# Reshape once so the plots and the summary statistics share the same data.
long_iris <- iris %>%
  pivot_longer(-Species, names_to = "Vars", values_to = "Values")

# One histogram per variable, kept next to its variable name.
plots <- long_iris %>%
  group_by(Vars) %>%
  nest() %>%
  ungroup() %>%
  mutate(plot = map(data, function(df) {
    # ggplot2 is not attached by the snippet's library() calls.
    ggplot2::ggplot(df, ggplot2::aes(Values)) + ggplot2::geom_histogram()
  })) %>%
  select(-data)

# Summary statistics, one row per variable.
summary_tbl <- long_iris %>%
  group_by(Vars) %>%
  summarise(
    obs = n(),
    mean = round(mean(Values), 2),
    median = round(median(Values), 2)
  )

summary_tbl %>%
  mutate(ggplot = NA) %>%
  gt() %>%
  text_transform(
    locations = cells_body(c(ggplot)),
    fn = function(x) {
      # nest() and summarise() are not guaranteed to return the variables in
      # the same order, so pick each row's plot by name rather than position.
      row_plots <- plots$plot[match(summary_tbl$Vars, plots$Vars)]
      map(row_plots, ggplot_image, height = px(100))
    }
  )
And this is the table:
I had to create the plot outside the output table, so I could call it in the gt table.
We need to loop over the plots
library(dplyr)
library(tidyr)
library(purrr)
library(gt)
library(ggplot2)
# Build the statistics and one histogram per variable in a single rowwise
# table, then render it with gt, replacing the placeholder `ggplot` column
# with the plot images.
iris %>%
  pivot_longer(cols = -Species, names_to = "Vars", values_to = "Values") %>%
  nest_by(Vars) %>%
  mutate(
    n = nrow(data),
    mean = round(mean(data$Values), 2),
    median = round(median(data$Values), 2),
    plots = list(ggplot(data, aes(Values)) + geom_histogram()),
    .keep = "unused"
  ) %>%
  ungroup() %>%
  mutate(ggplot = NA) %>%
  (function(tbl) {
    # `tbl` still carries the plots; the rendered table drops that column.
    tbl %>%
      select(-plots) %>%
      gt() %>%
      text_transform(
        locations = cells_body(c(ggplot)),
        fn = function(x) map(tbl$plots, ggplot_image, height = px(100))
      )
  })
-check for the output
Update: See comments:
If this is for a Shiny app, you may want to use the summarytools package; see https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html
it is compatible with r shiny!
Here is a small example:
library(summarytools)
# Print a data-frame summary of iris with the formatting options given below.
# NOTE(review): tmp.img.dir is presumably where summarytools writes the
# temporary graph images -- confirm against the package documentation.
dfSummary(iris,
plain.ascii = FALSE,
style = "grid",
graph.magnif = 0.75,
valid.col = FALSE,
tmp.img.dir = "/tmp")
# Pass the default summary to summarytools' view().
view(dfSummary(iris))
Try this:
library(skimr)
# Print skimr's per-variable summary of iris.
skim(iris)
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
* <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 Sepal.Length 0 1 5.84 0.828 4.3 5.1 5.8 6.4 7.9 ▆▇▇▅▂
2 Sepal.Width 0 1 3.06 0.436 2 2.8 3 3.3 4.4 ▁▆▇▂▁
3 Petal.Length 0 1 3.76 1.77 1 1.6 4.35 5.1 6.9 ▇▁▆▇▂
4 Petal.Width 0 1 1.20 0.762 0.1 0.3 1.3 1.8 2.5 ▇▁▇▅▃
library(tidyverse)
# Sample survey data: year, gender and two three-level questions.
a <- rep(c(2000, 2001), each = 4)
b <- c("M", "M", "M", "F", "F", "M", "F", "F")
d <- c("Yes", "No", "Yes", "No", "No", "Unknown", "Unknown", "Yes")
e <- c("Unknown", "No", "No", "Yes", "Unknown", "Yes", "No", "Unknown")
df <- setNames(data.frame(a, b, d, e), c("Year", "Gender", "q1", "q2"))

# Table for q1: "n(prop%)" for every Gender / answer combination.
myvar <- c("Gender", "q1")
mydf <- df[, myvar]
table1 <- mydf %>%
  pivot_longer(-q1) %>%
  group_by(name, q1, value) %>%
  summarise(n = n()) %>%
  mutate(
    prop = round(n / sum(n), 3) * 100,
    summary_str = glue::glue("{n}({prop}%)")
  ) %>%
  pivot_wider(
    id_cols = c(name, value),
    names_from = "q1",
    values_from = "summary_str"
  )
#make the function creating a table
# Attempt at a reusable version of the q1 table for any question column `x`.
# The last expression is colnames(table1), so the function returns the column
# names of the table rather than the table itself.
maketable <- function(df,x){
# NOTE(review): the sample data frame names this column "Gender" (capital G)
# -- confirm the lowercase spelling here is intended.
myvar <- c("gender",paste0(x))
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-get(x)) %>%
group_by(name,get(x),value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = paste0(x), values_from = "summary_str")
colnames(table1)
}
# q1 / q2 are passed as bare names, so paste0(x) tries to evaluate objects
# called q1 / q2 in the calling environment.
maketable(df,q1)
maketable(df,q2)
Error in paste0(x): object 'q1' not found.
I want to make a function, so that I can use it for q2.
Could anyone help to correct the code? or suggest a better way?
Output per variable is as below
If you want to pass in unquoted column names to your function, you can use the {{}} (embrace) operator to inject them into your commands. For example
# Cross-tabulate Gender against the column `x` (passed unquoted), showing
# "n(prop%)" for every Gender / answer combination.
maketable <- function(df, x) {
  counts <- df %>%
    select(Gender, {{ x }}) %>%
    pivot_longer(-{{ x }}) %>%
    group_by(name, {{ x }}, value) %>%
    summarise(n = n())

  labelled <- mutate(
    counts,
    prop = round(n / sum(n), 3) * 100,
    summary_str = glue::glue("{n}({prop}%)")
  )

  pivot_wider(
    labelled,
    id_cols = c(name, value),
    names_from = {{ x }},
    values_from = "summary_str"
  )
}
table1 <- maketable(df, q1)
See the programming with dplyr guide for more information.
Also note that the function just returns the new value. If you want to assign that to a new variable, make sure you do that outside the function. Values created inside of functions will not appear outside.
I have tried this one here
# Count each level of `x` (an unquoted column of the global `df`) within
# Gender, then spread the levels of `x` into columns.
my_func <- function(x) {
  counts <- count(group_by(df, Gender), {{ x }})
  pivot_wider(counts, names_from = {{ x }}, values_from = n)
}
I'm not sure that this is what you asked
# Build one Gender-by-answer table per question column and return them as a
# list, in the order of the selected column names.
colns <- colnames(df)
lapply(colns[c(3:4)], function(x) {
  # `x` is a column name held in a string, so it is wrapped in all_of() for
  # tidyselect instead of being used as a bare external vector.
  df[, c("Gender", x)] %>%
    pivot_longer(-all_of(x)) %>%
    # Group by every column (the question, `name` and `value`).
    group_by(across(everything())) %>%
    summarise(n = n()) %>%
    mutate(
      prop = round(n / sum(n), 3) * 100,
      summary_str = glue::glue("{n}({prop}%)")
    ) %>%
    pivot_wider(
      id_cols = c(name, value),
      names_from = all_of(x),
      values_from = "summary_str"
    )
})
result is like
[[1]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 2(25%) 1(12.5%) 1(12.5%)
2 Gender M 1(12.5%) 1(12.5%) 2(25%)
[[2]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 1(12.5%) 2(25%) 1(12.5%)
2 Gender M 2(25%) 1(12.5%) 1(12.5%)
You may need to change
lapply(colns[c(3:4)],...
3:4 to 3:102 for q1~q100
I cannot figure out why the bang-bang operator in my function is not unquoting my grp argument. Any help would be much appreciated!
library(dplyr)
# Count rows of `dat` per group, where `grp` is meant to name the grouping
# column.
test_func <- function(dat, grp){
dat %>%
# `!!grp` injects the value of grp as-is. When grp is a character string,
# group_by() receives that string constant, not a reference to a column.
group_by(!!grp) %>%
summarise(N = n())
}
test_func(dat = iris, grp = "Species")
Instead of grouping by species it just produces the summary for the entire data:
If we are passing a string, then convert to symbol and evaluate (!!)
# Count rows of `dat` per level of the column named by `grp`.
# The name is converted to a symbol first so it refers to the column.
test_func <- function(dat, grp) {
  grp_sym <- rlang::ensym(grp)
  grouped <- group_by(dat, !!grp_sym)
  summarise(grouped, N = n(), .groups = 'drop')
}
-testing
# Group iris by the column whose name is given as a string.
test_func(dat = iris, grp = "Species")
# A tibble: 3 x 2
# Species N
#* <fct> <int>
#1 setosa 50
#2 versicolor 50
#3 virginica 50
Or another option is to use across
# Count rows of `dat` per group, where `grp` is a character vector of
# grouping column names selected via all_of().
test_func <- function(dat, grp) {
  grouped <- group_by(dat, across(all_of(grp)))
  summarise(grouped, N = n(), .groups = 'drop')
}
I am trying to generate a table of summary statistics using purrr/tibble methods. I am able to calculate group-wise mean (sd) and counts using the following:
library(dplyr)
library(tidyr)
library(purrr)
library(tibble)
# Group-wise mean (sd) and non-missing counts for mpg and hp, spread into one
# nested column per vs/am/variable combination and then unnested.
mtcars %>%
# Stack every column except vs and am into variable/value pairs.
gather(variable, value, -vs, -am) %>%
group_by(vs, am, variable) %>%
nest() %>%
filter(variable %in% c("mpg", "hp")) %>%
# Summaries are computed from each nested data frame's `value` column.
mutate(
mean = map_dbl(data, ~mean(.$value, na.rm = TRUE)),
sd = map_dbl(data, ~sd(.$value, na.rm = TRUE)),
n = map_dbl(data, ~sum(!is.na(.$value)))
) %>%
select(vs:variable, mean:n) %>%
mutate_at(vars(mean, sd), round, 3) %>%
# mean_sd is the display string "mean (sd)"; var_group is "vs_am_variable".
mutate(mean_sd = paste0(mean, " (", sd, ")"),
var_group = paste(vs, am, variable, sep = "_")) %>%
select(n:var_group) %>%
# Pack n and mean_sd into a list-column named "summary".
nest(n, mean_sd, .key = "summary") %>%
spread(key = var_group, value = summary) %>%
# NOTE(review): nest(.key = ) and argument-less unnest() follow the older
# tidyr interface -- confirm behaviour on the installed version.
unnest()
My immediate question is, how do I retain the column names as seen in spread(key = var_group, value = summary) in the unnest()-ed output?
edit: Thanks to all for the responses.
https://stackoverflow.com/a/55912326/5745045 has the advantages of being easier to read and not storing a temporary variable. A disadvantage is the change of numeric to character in the n columns.
The final goal is to replace the column names with formatted text within the context of a grouped kable table.
By storing the "nested" tibble as a temporary variable (#1) and using its column names (#2), we can achieve what you want. See below:
# Same pipeline as the question, but the spread (still nested) tibble is
# saved so its column names can be appended to the unnested columns.
mtcars %>%
# Stack every column except vs and am into variable/value pairs.
gather(variable, value, -vs, -am) %>%
group_by(vs, am, variable) %>%
nest() %>%
filter(variable %in% c("mpg", "hp")) %>%
mutate(
mean = map_dbl(data, ~mean(.$value, na.rm = TRUE)),
sd = map_dbl(data, ~sd(.$value, na.rm = TRUE)),
n = map_dbl(data, ~sum(!is.na(.$value)))
) %>%
select(vs:variable, mean:n) %>%
mutate_at(vars(mean, sd), round, 3) %>%
mutate(mean_sd = paste0(mean, " (", sd, ")"),
var_group = paste(vs, am, variable, sep = "_")) %>%
select(n:var_group) %>%
nest(n, mean_sd, .key = "summary") %>%
spread(key = var_group, value = summary) %>%
#1: storing the temporary nested variable
# (`->>` is the super-assignment operator, so temptibble is created outside
# the pipe's own scope)
{. ->> temptibble} %>%
unnest() %>%
#2: renaming the columns of unnested output and removing temporary variable
# (each = 2 because every nested cell holds two columns, n and mean_sd)
rename_all(funs(paste0(., "_", rep(colnames(temptibble), each=2)))); rm(temptibble)
# # A tibble: 1 x 16
# n_0_0_hp mean_sd_0_0_hp n1_0_0_mpg mean_sd1_0_0_mpg n2_0_1_hp mean_sd2_0_1_hp n3_0_1_mpg mean_sd3_0_1_mpg
# <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
# 1 12 194.167 (33.36) 12 15.05 (2.774) 6 180.833 (98.816) 6 19.75 (4.009)
# n4_1_0_hp mean_sd4_1_0_hp n5_1_0_mpg mean_sd5_1_0_mpg n6_1_1_hp mean_sd6_1_1_hp n7_1_1_mpg mean_sd7_1_1_mpg
# <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
# 1 7 102.143 (20.932) 7 20.743 (2.471) 7 80.571 (24.144) 7 28.371 (4.758)
Here's another method that doesn't require creating a temporary variable. Instead of nesting the data at the end, I used gather() and unite() to restructure the data so that it ends up as one key and value pair.
library(tidyverse)
#> Registered S3 methods overwritten by 'ggplot2':
#> method from
#> [.quosures rlang
#> c.quosures rlang
#> print.quosures rlang
#> Registered S3 method overwritten by 'rvest':
#> method from
#> read_xml.response xml2
# One-row summary of mpg and hp for every vs/am combination, with columns
# named "<vs>_<am>_<variable>_mean_sd" and "<vs>_<am>_<variable>_n".
mtcars %>%
  # Only mpg and hp are summarised, so reshape just those two columns.
  pivot_longer(c(mpg, hp), names_to = "variable", values_to = "value") %>%
  group_by(vs, am, variable) %>%
  summarise(
    mean = round(mean(value, na.rm = TRUE), 3),
    sd = round(sd(value, na.rm = TRUE), 3),
    n = sum(!is.na(value)),
    .groups = "drop"
  ) %>%
  transmute(
    var_group = paste(vs, am, variable, sep = "_"),
    mean_sd = paste0(mean, " (", sd, ")"),
    # The long format below holds mean_sd and n in one column, so n becomes
    # character (as in the original output, where every column is <chr>).
    n = as.character(n)
  ) %>%
  pivot_longer(c(mean_sd, n), names_to = "key", values_to = "value") %>%
  unite(var_group_key, var_group, key) %>%
  pivot_wider(names_from = var_group_key, values_from = value)
#> # A tibble: 1 x 16
#> `0_0_hp_mean_sd` `0_0_hp_n` `0_0_mpg_mean_s… `0_0_mpg_n` `0_1_hp_mean_sd`
#> <chr> <chr> <chr> <chr> <chr>
#> 1 194.167 (33.36) 12 15.05 (2.774) 12 180.833 (98.816)
#> # … with 11 more variables: `0_1_hp_n` <chr>, `0_1_mpg_mean_sd` <chr>,
#> # `0_1_mpg_n` <chr>, `1_0_hp_mean_sd` <chr>, `1_0_hp_n` <chr>,
#> # `1_0_mpg_mean_sd` <chr>, `1_0_mpg_n` <chr>, `1_1_hp_mean_sd` <chr>,
#> # `1_1_hp_n` <chr>, `1_1_mpg_mean_sd` <chr>, `1_1_mpg_n` <chr>
Created on 2019-04-29 by the reprex package (v0.2.1)