There are six regression models. I used pivot wider but it is difficult to read.
Can I use two level of headers -
first level - regression model
second level - estimate, tstat
library(dplyr)
regression <- c(rep("A", 3), rep("B", 3), rep("C", 3), rep("D", 3), rep("E", 3), rep("F", 3))
attribute <- rep(c("b0", "b1", "b2"), 6)
estimate <- round(runif(n = 18, min = 0, max = 10), 2)
tstat <- round(runif(n = 18, min = 0, max = 10), 2)
# tibble
tbl <- tibble(regression, attribute, estimate, tstat)
# pivot wider
tbl <- tbl %>%
pivot_wider(names_from = regression,
values_from = c("estimate", "tstat"))
One option is separate_header from ftExtra
library(ftExtra)
library(dplyr)
library(tidyr)
library(stringr)
tbl %>%
pivot_wider(names_from = regression,
values_from = c("estimate", "tstat")) %>%
rename_with(~ str_replace(., "(.*)_(.*)", "\\2_\\1"), -1) %>%
as_flextable() %>%
separate_header()
-output
Or may use span_header
library(flextable)
tbl %>%
pivot_wider(names_from = regression,
values_from = c("estimate", "tstat")) %>%
rename_with(~ str_replace(., "(.*)_(.*)", "\\2_\\1"), -1) %>%
select(attribute, order(str_remove(names(.)[-1], "_.*")) + 1) %>%
as_flextable() %>%
span_header() %>%
align(align = "center", part = "all")
-output
If we need to make some column bold,
tbl %>%
pivot_wider(names_from = regression,
values_from = c("estimate", "tstat")) %>%
rename_with(~ str_replace(., "(.*)_(.*)", "\\2_\\1"), -1) %>%
mutate(across(ends_with('tstat'), ~sprintf('**%.2f**', .))) %>%
select(attribute, order(str_remove(names(.)[-1], "_.*")) + 1) %>%
as_flextable() %>%
span_header() %>%
align(align = "center", part = "all") %>%
colformat_md()
-output
Related
Hi everyone I'm trying to run below code for calculating descriptive statistics for all variables across values of "years" for some reason dplyr gives below error:
Error: Problem with summarise() input ..1.
x Can't subset columns that don't exist.
x Column year doesn't exist.
i Input ..1 is (function (.cols = everything(), .fns = NULL, ..., .names = NULL) ....
i The error occurred in group 1: year = 2018.
Run rlang::last_error() to see where the error
Please find the code below:
# Load required libraries
library(dplyr)
library(tidyr)
library(openxlsx)
# Create example dataset
set.seed(123)
n <- 1000
data <- data.frame(
var_1 = rnorm(n),renare
var_2 = rnorm(n),
var_3 = rnorm(n),
var_4 = sample(c("A", "B", "C"), n, replace = TRUE),
var_5 = sample(c("X", "Y", "Z"), n, replace = TRUE),
year = sample(2018:2020, n, replace = TRUE)
)
# Calculate descriptive statistics for numeric variables
num_vars <- data %>% select_if(is.numeric) %>% names()
num_stats <- data %>%
select(all_of(c(num_vars, "year"))) %>%
group_by(year) %>%
summarise(across(all_of(num_vars), list(mean = mean, sd = sd, median = median, min = min, max = max), .names = "{col}_{fn}_{year}")) %>%
pivot_longer(cols = -year, names_to = c("Variable", ".value", "Year"), names_sep = "_") %>%
pivot_wider(names_from = (Year, year), values_from = value, names_sep = "_") %>%
mutate(across(contains("mean"), list(diff = ~. - lag(.)))) %>%
mutate(across(contains("sd"), list(diff = ~. - lag(.)))) %>%
mutate(across(contains("median"), list(diff = ~. - lag(.)))) %>%
mutate(across(contains("min"), list(diff = ~. - lag(.)))) %>%
mutate(across(contains("max"), list(diff = ~. - lag(.)))) %>%
rename_all(~ gsub("^(mean|sd|median|min|max)_([^_]+)_(\\d+)$", paste0("\\1_\\2_\\3_"), .)) %>%
rename_with(~ gsub("^(diff)", paste0("\\1_", "\\3_to_\\4_"), .), contains("diff"))
# Create frequency table for categorical variables
cat_vars <- data %>% select_if(is.factor) %>% names()
cat_stats <- data %>%
select(all_of(c(cat_vars, "year"))) %>%
pivot_longer(cols = starts_with("var"), names_to = "Variable", values_to = "Value") %>%
group_by(Variable, Value, year) %>%
summarise(Count = n()) %>%
pivot_wider(names_from = "year", values_from = "Count", names_prefix = "Count_") %>%
unite("Variable:Value", c("Variable", "Value"), sep = ": ", remove = FALSE)
# Create Excel workbook and write outputs to separate worksheets
wb <- createWorkbook()
addWorksheet(wb, "Numeric Statistics")
writeData(wb, "Numeric Statistics", num_stats)
addWorksheet(wb, "Categorical Frequencies")
writeData(wb, "Categorical Frequencies", cat_stats)
# Save workbook to file
saveWorkbook(wb, "output.xlsx", overwrite = TRUE)
Any ideas on what could be an issue here?
I have the following data and table:
library(gt)
library(dplyr)
a <- rnorm(21, mean = 112, sd =12)
colour <- rep(c("Blue", "Red", "Green"), 7)
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = html("[H<sub>2</sub>O]")) %>%
rename(cat = colour)
b <- rnorm(21, mean = 60, sd =12)
day <- rep(c("2", "4", "6"), 7)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = html("[H<sub>2</sub>O] Additition <br> (Days)")) %>%
rename(cat = day)
bind_rows(data, data2) %>%
group_by(grp) %>%
gt(rowname_col = "cat")
bind_rows(data, data2) %>%
group_by(grp) %>%
gt() %>%
tab_options(row_group.as_column = TRUE)
The row group labels appear literally as '[H<sub>2<\sub>O]', rather than [H2O] etc. It is likely that I am using HTML wrong and it needs to be used with another package/function. I have also tried using cols_label but doesn't recognise these as columns in the dataframe.
Is there also a way to have the row groups column vertically centered, rather than at the top where is currently is? How do you bold these row groups?
The html function won't work outside of a gt table, so you'll have to create the row groups using tab_row_group and add the html labels there.
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "color") %>%
rename(cat = colour)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "day") %>%
rename(cat = day)
bind_rows(data, data2) %>%
gt() %>%
tab_row_group(
label = html("[H<sub>2</sub>O]"),
rows = grp == "color"
) %>%
tab_row_group(
label = html("[H<sub>2</sub>O] Additition <br> (Days)"),
rows = grp == "day"
) %>%
cols_hide(grp)
I have the following function with which I want to create summary statistics (for two data sets simultaneously):
mean.k <-function(x){round(mean(x, na.rm=TRUE), digits = 3)}
sd.k <-function(x){round(sd(x, na.rm=TRUE), digits = 3)}
sumstats<-function(x, y) { sumtable <- cbind(as.matrix(colSums(!is.na(x))),sapply(x,mean.k), paste("(",sapply(x,sd.k),")", sep = ""), as.matrix(colSums(!is.na(y))),sapply(y,mean.k), paste("(",sapply(y,sd.k),")", sep = ""))
sumtable=as.data.frame(sumtable)
names(sumtable)=c("Obs","Mean","Std.Dev", "Obs","Mean","Std.Dev");
sumtable}
On some data, the result looks like:
data(iris)
libary(dplyr)
iris_1 <- iris %>% filter(Species == "setosa") %>% select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) # data set 1
iris_2 <- iris %>% filter(Species == "versicolor") %>% select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) # data set 2
iris_desk_stats <- sumstats(iris_1, iris_2)
However, for me the optimal result would look like this:
So the two changes I need are the following:
Each standard deviation should be placed under the respective mean instead of in a separate column next to the means.
Obs should not be a proper column, but a row at the top in order to avoid redundancy.
Has anyone an idea? The first of the two desired changes would be the most important one.
I thought that I'd try to approach this "directly"; And here's what I've come up with:
iris %>%
as_tibble() %>%
summarise(
across(!where(is.factor), list(
mean = . %>% mean(na.rm = TRUE),
sd = . %>% sd(na.rm = TRUE)
)),
Obs = n()
) %>%
pivot_longer(
# c(contains(c("mean", "sd")), "Obs"),
everything(),
names_to = c("variable", "metric"),
values_to = "stats",
names_sep = "_") %>%
tidyr::replace_na(list(metric = "identity")) %>%
pivot_wider(names_from = metric,
values_from = stats) %>%
select(variable, everything()) %>%
mutate(entries = glue::glue("{mean}<br>({sd})",
mean = round(mean, 3),
sd = round(sd, 3)),
entries = if_else(!is.na(identity),
glue::glue("{identity}"),
entries)) %>%
select(-c("mean", "sd", "identity")) %>%
arrange(variable) %>%
gt::gt() %>%
gt::fmt_markdown(entries) %>%
identity()
I am trying to create a 95% minimum convex polygon using sf in R. My code works fine as long as I only group my data on 1 variable, but when I group on two variables, the output loses its sf class and becomes a grouped_df instead.
Here is a toy dataset as an example
library(dplyr)
set.seed(12)
toy <- tibble::tibble(
ID = rep(c(1,2), each = 10),
year = rep(c(1,2), 10),
lat = runif(20, 1, 10),
long = runif(20, 1, 10)
) %>%
sf::st_as_sf(., coords = c("long", "lat"))
toy %>%
group_by(ID) %>%
summarize(.groups = "keep") %>%
mutate(cent = sf::st_centroid(geometry)) %>%
sf::st_cast(to = "POINT") %>%
mutate(dist = sf::st_distance(geometry, cent, by_element = TRUE)) %>%
filter(dist <= quantile(dist, .95)) %>%
summarize() %>%
sf::st_convex_hull() %>%
class()
This gives the output I want. But when I try to group by two variables, the result loses the sf class.
toy %>%
group_by(ID, year) %>%
summarize(.groups = "keep") %>%
mutate(cent = sf::st_centroid(geometry)) %>%
sf::st_cast(to = "POINT") %>%
mutate(dist = sf::st_distance(geometry, cent, by_element = TRUE)) %>%
filter(dist <= quantile(dist, .95)) %>%
summarize() %>%
sf::st_convex_hull() %>%
class
Is there something in my code keeping me from being able to group on two variables?
It's because the second summarize is regrouping on ID. You need a .groups = "keeps" there to pass the same grouping through and then convert back to an sf object. Alternatively you could create a grouping variable mutate(grp = paste0(ID, year)) and do group_by(grp) .
toy %>%
group_by(ID, year) %>%
summarize(.groups = "keep") %>%
mutate(cent = sf::st_centroid(geometry)) %>%
sf::st_cast(to = "POINT") %>%
mutate(dist = sf::st_distance(geometry, cent, by_element = TRUE)) %>%
filter(dist <= quantile(dist, .95)) %>%
summarize(.groups = "keep") %>%
sf::st_convex_hull() %>%
st_sf()
I have an issue understanding how to use the dplyr bootstrap function properly.
What I want is to generate a bootstrap distribution from two randomly assigned groups and compute the difference in means, like this for example :
library(dplyr)
library(broom)
data(mtcars)
mtcars %>%
mutate(treat = sample(c(0, 1), 32, replace = T)) %>%
group_by(treat) %>%
summarise(m = mean(disp)) %>%
summarise(m = m[treat == 1] - m[treat == 0])
The issue is that I need to repeat this operation 100, 1000, or more times.
Using replicate, I can do
frep = function(mtcars) mtcars %>%
mutate(treat = sample(c(0, 1), 32, replace = T)) %>%
group_by(treat) %>%
summarise(m = mean(disp)) %>%
summarise(m = m[treat == 1] - m[treat == 0])
replicate(1000, frep(mtcars = mtcars), simplify = T) %>% unlist()
and get the distribution
I don't really get how to use bootstraphere. How should I start ?
mtcars %>%
bootstrap(10) %>%
mutate(treat = sample(c(0, 1), 32, replace = T))
mtcars %>%
bootstrap(10) %>%
do(tidy(treat = sample(c(0, 1), 32, replace = T)))
It's not really working. Where should I put the bootstrap pip ?
Thanks.
In the do step, we wrap with data.frame and create the 'treat' column, then we can group by 'replicate' and 'treat' to get the summarised output column
mtcars %>%
bootstrap(10) %>%
do(data.frame(., treat = sample(c(0,1), 32, replace=TRUE))) %>%
group_by(replicate, treat) %>%
summarise(m = mean(disp)) %>%
summarise(m = m[treat == 1] - m[treat == 0])
#or as 1 occurs second and 0 second, we can also use
#summarise(m = last(m) - first(m))