I have two dataframes that I want to turn into a single table using gt
library(dplyr)
library(gt)
a <- rnorm(21, mean = 112, sd =12)
colour <- rep(c("Blue", "Red", "Green"), 7)
data <- data.frame(colour, a)
data <- data %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
ungroup() %>%
gt()
a <- rnorm(21, mean = 60, sd =12)
day <- rep(c("2", "4", "6"), 7)
data2 <- data.frame(day, a)
data2 <- data2 %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
ungroup() %>%
gt()
How do I stack the two dataframes on top of each other, and apply two sideways spanner labels of colour and day. Something similar to below where 2014, 2015 are my mean and sd columns, and China is colour, with blue, red green underneath, and India is day with the days stacked underneath.
OR (for curiosity/ideally).
Not have colour and day where China and India are, but instead have a sideways spanner. (i.e. vertical instead of horizontal). Horizontal isn't good for my real data as there is too many categories and would make it a really wide table.
You can give each a variable to identify what the group will be ("grp") and rename the color/day variable ("cat") and use bind_rows to combine them before using gt:
a <- rnorm(21, mean = 112, sd =12)
colour <- rep(c("Blue", "Red", "Green"), 7)
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "colour") %>%
rename(cat = colour)
b <- rnorm(21, mean = 60, sd =12)
day <- rep(c("2", "4", "6"), 7)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "day") %>%
rename(cat = day)
bind_rows(data, data2) %>%
group_by(grp) %>%
gt(rowname_col = "cat")
Also I don't think this is exactly what you are asking for on the second option, but there is a row_group.as_column option for tab_options:
bind_rows(data, data2) %>%
group_by(grp) %>%
gt() %>%
tab_options(row_group.as_column = TRUE)
Related
I have the following data and table:
library(gt)
library(dplyr)
a <- rnorm(21, mean = 112, sd =12)
colour <- rep(c("Blue", "Red", "Green"), 7)
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = html("[H<sub>2</sub>O]")) %>%
rename(cat = colour)
b <- rnorm(21, mean = 60, sd =12)
day <- rep(c("2", "4", "6"), 7)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = html("[H<sub>2</sub>O] Additition <br> (Days)")) %>%
rename(cat = day)
bind_rows(data, data2) %>%
group_by(grp) %>%
gt(rowname_col = "cat")
bind_rows(data, data2) %>%
group_by(grp) %>%
gt() %>%
tab_options(row_group.as_column = TRUE)
The row group labels appear literally as '[H<sub>2<\sub>O]', rather than [H2O] etc. It is likely that I am using HTML wrong and it needs to be used with another package/function. I have also tried using cols_label but doesn't recognise these as columns in the dataframe.
Is there also a way to have the row groups column vertically centered, rather than at the top where is currently is? How do you bold these row groups?
The html function won't work outside of a gt table, so you'll have to create the row groups using tab_row_group and add the html labels there.
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "color") %>%
rename(cat = colour)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "day") %>%
rename(cat = day)
bind_rows(data, data2) %>%
gt() %>%
tab_row_group(
label = html("[H<sub>2</sub>O]"),
rows = grp == "color"
) %>%
tab_row_group(
label = html("[H<sub>2</sub>O] Additition <br> (Days)"),
rows = grp == "day"
) %>%
cols_hide(grp)
Are there any R packages that I use to replicate the table below -
I would like a table with conditional formatting for the table values but no conditional formatting on the row and column grand totals.
The code can be used to reproduce the values in the table along with the row and column grand totals -
library(tidyverse)
# vectors
dates <- rep(date_vec <- c(as.Date("2022-01-01"), as.Date("2022-02-01"), as.Date("2022-03-01")), 30)
row_groups <- c(rep("row_group1", 20), rep("row_group2", 30), rep("row_group3", 10), rep("row_group4", 30))
col_groups <- c(rep("col_group1", 10), rep("col_group2", 10), rep("col_group3", 30), rep("col_group4", 40))
# dataframe
df <- tibble(dates, row_groups, col_groups)
# column grand totals
col_group_total <- df %>%
group_by(dates, col_groups) %>%
count() %>%
group_by(col_groups) %>%
summarise(mean = mean(n)) %>%
mutate(pct = mean/sum(mean))
# row grand totals
row_group_total <- df %>%
group_by(dates, row_groups) %>%
count() %>%
group_by(row_groups) %>%
summarise(mean = mean(n)) %>%
mutate(pct = mean/sum(mean))%>%
ungroup()
# table values
group_total <- df %>%
group_by(dates, row_groups, col_groups) %>%
count() %>%
group_by(row_groups, col_groups) %>%
summarise(count = mean(n)) %>%
ungroup() %>%
mutate(pct = count/sum(count))%>%
ungroup()
red_color <- "#f4cccc"
yellow_color <- "#f3f0ce"
green_color <- "#d9ead3"
library(janitor); library(gt)
df %>%
tabyl(row_groups, col_groups) %>%
adorn_percentages("all") %>%
adorn_totals(c("col")) -> df_tabyl
gt(df_tabyl) %>%
data_color(columns = col_group1:col_group4,
colors = scales::col_numeric(
palette = c(red_color, yellow_color, green_color),
domain = range(df_tabyl[1:4,2:5])
)
) %>%
fmt_percent(columns = -row_groups,
rows = everything()) %>%
summary_rows(
columns = -row_groups,
fns = list("Total" = "sum"),
formatter = fmt_percent
)
The coloring varies with your example b/c the col_numeric function maps the colors linearly along the three provided colors, and 11% is only 1/3 of the way between 0% and 33%. Not sure what approach you expect.
I am trying to create a 95% minimum convex polygon using sf in R. My code works fine as long as I only group my data on 1 variable, but when I group on two variables, the output loses its sf class and becomes a grouped_df instead.
Here is a toy dataset as an example
library(dplyr)
set.seed(12)
toy <- tibble::tibble(
ID = rep(c(1,2), each = 10),
year = rep(c(1,2), 10),
lat = runif(20, 1, 10),
long = runif(20, 1, 10)
) %>%
sf::st_as_sf(., coords = c("long", "lat"))
toy %>%
group_by(ID) %>%
summarize(.groups = "keep") %>%
mutate(cent = sf::st_centroid(geometry)) %>%
sf::st_cast(to = "POINT") %>%
mutate(dist = sf::st_distance(geometry, cent, by_element = TRUE)) %>%
filter(dist <= quantile(dist, .95)) %>%
summarize() %>%
sf::st_convex_hull() %>%
class()
This gives the output I want. But when I try to group by two variables, the result loses the sf class.
toy %>%
group_by(ID, year) %>%
summarize(.groups = "keep") %>%
mutate(cent = sf::st_centroid(geometry)) %>%
sf::st_cast(to = "POINT") %>%
mutate(dist = sf::st_distance(geometry, cent, by_element = TRUE)) %>%
filter(dist <= quantile(dist, .95)) %>%
summarize() %>%
sf::st_convex_hull() %>%
class
Is there something in my code keeping me from being able to group on two variables?
It's because the second summarize is regrouping on ID. You need a .groups = "keeps" there to pass the same grouping through and then convert back to an sf object. Alternatively you could create a grouping variable mutate(grp = paste0(ID, year)) and do group_by(grp) .
toy %>%
group_by(ID, year) %>%
summarize(.groups = "keep") %>%
mutate(cent = sf::st_centroid(geometry)) %>%
sf::st_cast(to = "POINT") %>%
mutate(dist = sf::st_distance(geometry, cent, by_element = TRUE)) %>%
filter(dist <= quantile(dist, .95)) %>%
summarize(.groups = "keep") %>%
sf::st_convex_hull() %>%
st_sf()
I want to create a summary table for some dichotomous variables using the expss package. Since the variables are dichotomous, one of the two levels would the sufficient to "show the picture".
I tried to use the function tab_net_cell, but was not able to get the right results. Here is some example code with BrCa (Breast cancer) with 1 or 0. I only want to show the number of patients with but not without breast cancer.
df <- data.frame(BrCa = c(1,1,1,0,0,0,NA,NA,0,0))
df$group <- c(1,2,1,2,1,2,1,2,1,2)
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa) %>%
expss::tab_stat_cpct(total_row_position = "none",label = "%") %>%
expss::tab_stat_cases(total_row_position = "none",label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows")
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa) %>%
expss::tab_net_cells("BrCa" = eq(1)) %>%
expss::tab_stat_cpct(total_row_position = "none",label = "%") %>%
expss::tab_stat_cases(total_row_position = "none",label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows")
The simplest way is to filter resulted table:
df <- data.frame(BrCa = c(1,1,1,0,0,0,NA,NA,0,0))
df$group <- c(1,2,1,2,1,2,1,2,1,2)
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa) %>%
expss::tab_stat_cpct(total_row_position = "none",label = "%") %>%
expss::tab_stat_cases(total_row_position = "none",label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows") %>%
expss::where(grepl(1, row_labels))
Another way is to use mean and sum instead of cpct and cases:
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa*100) %>%
expss::tab_stat_mean(label = "%") %>%
expss::tab_stat_sum(label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows")
I would like to perform kmeans within groups and add to my data information about cluster number and center which an observation was assigned to (still, within groups so cluster 1 is not the same for group A and group B). I thought that I can pluck cluster assignment and centroid from kmeans and then maybe join these two with each other and finally, with original data. To do the former I wanted to add a row number to data frames with centers and then join by the number of cluster. But how can I add row number within nested data frames? The following code works well until the last, 'nested' mutate.
my_data <- data.frame(group = c(sample(c('A', 'B', 'C'), 20, replace = TRUE)), x = runif(100, 0, 10), y = runif(100, 0, 10))
my_data %>%
group_by(group) %>%
nest() %>%
mutate(km_cluster = map(data, ~kmeans(.x, 3) %>% pluck('cluster')),
km_centers = map(data, ~kmeans(.x, 3) %>% pluck('centers') %>% mutate(cluster = row_number())))
#Luke.sonnet provided an answer that works well with map, but interestingly not with map2, see below:
my_data %>%
group_by(group) %>%
nest() %>%
mutate(number = sample(3:7, 3)) %>%
mutate(km_cluster = map2(data, number, ~kmeans(.x, .y) %>% pluck('cluster')),
km_centers = map2(data, number, ~kmeans(.x, .y) %>% pluck('centers') %>% as_tibble() %>% mutate(cluster = row_number())))
Any ideas how to solve the issue in that case? And equally important, what is the cause of such behaviour?
The problem is that pluck() is returning a matrix. Cast to a tibble first and number differently.
library(tidyverse)
my_data <- data.frame(group = c(sample(c('A', 'B', 'C'), 20, replace = TRUE)), x = runif(100, 0, 10), y = runif(100, 0, 10))
my_data %>%
group_by(group) %>%
nest() %>%
mutate(number = sample(3:7, 3)) %>%
mutate(km_cluster = map2(data, number, ~kmeans(.x, .y) %>% pluck('cluster')),
km_centers = map2(data, number, ~kmeans(.x, .y) %>% pluck('centers') %>% as_tibble() %>% mutate(cluster = seq_len(nrow(.)))))
Note you can also do mutate(cluster = row_number(x)))) and this provides different numbers (note that just using row_number() uses the rows from the parent df). I think given kmeans that the matrix of centers is ordered row-wise by cluster number that the answer in the main chunk is correct.