Order Data table based on Category in R - r

I have a data table that I created in R to compare the percentage between a population and sample. Here is the script I created for the table:
team_pop <- team_new %>%
group_by(degree) %>%
count() %>%
ungroup() %>%
mutate(pop = n/sum(n)) %>%
arrange(desc(pop)) %>%
adorn_totals()
team_sample <- sample_final %>%
group_by(degree) %>%
count() %>%
ungroup() %>%
mutate(sam = n/sum(n)) %>%
arrange(desc(sam)) %>%
adorn_totals()
datatable(
team_pop %>%
select(-n) %>%
left_join(team_sample %>%
select(degree,
sam),
by = "degree"),
rownames = FALSE,
colnames = c(
"Degree"= "degree",
"Population" = "pop",
"Sample" = "sam"),
options = list(info = FALSE,
paging = FALSE,
searching = FALSE)
) %>%
formatPercentage(2, digits = 0) %>%
formatPercentage(3, digits = 0)
Here is what the data looks like:
Degree
Population
Sample
Medium
45%
43%
Low
35%
37%
High
20%
20%
But this is how I want me data to be ordered in my data table:
Degree
Population
Sample
High
20%
20%
Medium
45%
43%
Low
35%
37%
Just wondering if someone could please help me order my categories based on the Degree column from High to Low. I have tried the R function "sort" using the formula below, but it still orders my data based on the highest number first:
team_pop <- team_new %>%
group_by(degree) %>%
count() %>%
ungroup() %>%
mutate(pop = n/sum(n)) %>%
sort(degree, degreasing = FALSE) %>%
adorn_totals()
team_sample <- sample_final %>%
group_by(degree) %>%
count() %>%
ungroup() %>%
mutate(sam = n/sum(n)) %>%
sort(degree, degreasing = FALSE) %>%
adorn_totals()
datatable(
team_pop %>%
select(-n) %>%
left_join(team_sample %>%
select(degree,
sam),
by = "degree"),
rownames = FALSE,
colnames = c(
"Degree"= "degree",
"Population" = "pop",
"Sample" = "sam"),
options = list(info = FALSE,
paging = FALSE,
searching = FALSE)
) %>%
formatPercentage(2, digits = 0) %>%
formatPercentage(3, digits = 0)

Make your degree variable a factor and then arrange based on degree.
team_pop <- team_pop %>%
mutate(Degree=factor(Degree, levels = c("High", "Medium", "Low"))) %>%
arrange(Degree)

Related

Change row group labels in gt table (with superscript/subscript and line breaks). Customising row group labels in R

I have the following data and table:
library(gt)
library(dplyr)
a <- rnorm(21, mean = 112, sd =12)
colour <- rep(c("Blue", "Red", "Green"), 7)
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = html("[H<sub>2</sub>O]")) %>%
rename(cat = colour)
b <- rnorm(21, mean = 60, sd =12)
day <- rep(c("2", "4", "6"), 7)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = html("[H<sub>2</sub>O] Additition <br> (Days)")) %>%
rename(cat = day)
bind_rows(data, data2) %>%
group_by(grp) %>%
gt(rowname_col = "cat")
bind_rows(data, data2) %>%
group_by(grp) %>%
gt() %>%
tab_options(row_group.as_column = TRUE)
The row group labels appear literally as '[H<sub>2<\sub>O]', rather than [H2O] etc. It is likely that I am using HTML wrong and it needs to be used with another package/function. I have also tried using cols_label but doesn't recognise these as columns in the dataframe.
Is there also a way to have the row groups column vertically centered, rather than at the top where is currently is? How do you bold these row groups?
The html function won't work outside of a gt table, so you'll have to create the row groups using tab_row_group and add the html labels there.
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "color") %>%
rename(cat = colour)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "day") %>%
rename(cat = day)
bind_rows(data, data2) %>%
gt() %>%
tab_row_group(
label = html("[H<sub>2</sub>O]"),
rows = grp == "color"
) %>%
tab_row_group(
label = html("[H<sub>2</sub>O] Additition <br> (Days)"),
rows = grp == "day"
) %>%
cols_hide(grp)

How do I create a table in R with conditional formatting and row and column totals?

Are there any R packages that I use to replicate the table below -
I would like a table with conditional formatting for the table values but no conditional formatting on the row and column grand totals.
The code can be used to reproduce the values in the table along with the row and column grand totals -
library(tidyverse)
# vectors
dates <- rep(date_vec <- c(as.Date("2022-01-01"), as.Date("2022-02-01"), as.Date("2022-03-01")), 30)
row_groups <- c(rep("row_group1", 20), rep("row_group2", 30), rep("row_group3", 10), rep("row_group4", 30))
col_groups <- c(rep("col_group1", 10), rep("col_group2", 10), rep("col_group3", 30), rep("col_group4", 40))
# dataframe
df <- tibble(dates, row_groups, col_groups)
# column grand totals
col_group_total <- df %>%
group_by(dates, col_groups) %>%
count() %>%
group_by(col_groups) %>%
summarise(mean = mean(n)) %>%
mutate(pct = mean/sum(mean))
# row grand totals
row_group_total <- df %>%
group_by(dates, row_groups) %>%
count() %>%
group_by(row_groups) %>%
summarise(mean = mean(n)) %>%
mutate(pct = mean/sum(mean))%>%
ungroup()
# table values
group_total <- df %>%
group_by(dates, row_groups, col_groups) %>%
count() %>%
group_by(row_groups, col_groups) %>%
summarise(count = mean(n)) %>%
ungroup() %>%
mutate(pct = count/sum(count))%>%
ungroup()
red_color <- "#f4cccc"
yellow_color <- "#f3f0ce"
green_color <- "#d9ead3"
library(janitor); library(gt)
df %>%
tabyl(row_groups, col_groups) %>%
adorn_percentages("all") %>%
adorn_totals(c("col")) -> df_tabyl
gt(df_tabyl) %>%
data_color(columns = col_group1:col_group4,
colors = scales::col_numeric(
palette = c(red_color, yellow_color, green_color),
domain = range(df_tabyl[1:4,2:5])
)
) %>%
fmt_percent(columns = -row_groups,
rows = everything()) %>%
summary_rows(
columns = -row_groups,
fns = list("Total" = "sum"),
formatter = fmt_percent
)
The coloring varies with your example b/c the col_numeric function maps the colors linearly along the three provided colors, and 11% is only 1/3 of the way between 0% and 33%. Not sure what approach you expect.

Cross-column summary after `summary_rows` with gt

This is a toy example, so the numbers are meaningless, but how would I calculate the summary statistics for proportion in the summary_rows row of the table per group?
proportion is a row-wise calculation, so I can't sum/mean/sd/etc. For average proportion, for example, I want to calculate num[average] / items[average]. I can't figure out how to get a custom function to work for fns across groups.
exibble_a <-
exibble %>%
mutate(items = runif(8,10,20)) %>%
group_by(group) %>%
mutate(proportion = ifelse(is.na(items / num), 0, items / num)) %>%
ungroup() %>%
select(-c(fctr, date, time, datetime))
exibble_b =
exibble_a %>%
group_by(group) %>%
gt(rowname_col = "row", groupname_col = "group") %>%
fmt_missing(columns = everything()) %>%
fmt_percent(columns = proportion,
decimals = 2) %>%
summary_rows(groups = TRUE,
columns = c(num,items),
fns = list(
average = ~ mean(.,na.rm=TRUE),
total = ~ sum(.,na.rm=TRUE),
SD = ~ sd(.,na.rm=TRUE))) %>%
summary_rows(groups = TRUE,
columns = proportion,
fns = list(
average = ~ mean(exibble_a$items) / mean(exibble_a$num,
na.rm=TRUE)),
formatter = fmt_percent,
decimals = 2,
use_seps = TRUE)
exibble_b

R sf::st_convex_hull() losing sf class on data with multiple groups

I am trying to create a 95% minimum convex polygon using sf in R. My code works fine as long as I only group my data on 1 variable, but when I group on two variables, the output loses its sf class and becomes a grouped_df instead.
Here is a toy dataset as an example
library(dplyr)
set.seed(12)
toy <- tibble::tibble(
ID = rep(c(1,2), each = 10),
year = rep(c(1,2), 10),
lat = runif(20, 1, 10),
long = runif(20, 1, 10)
) %>%
sf::st_as_sf(., coords = c("long", "lat"))
toy %>%
group_by(ID) %>%
summarize(.groups = "keep") %>%
mutate(cent = sf::st_centroid(geometry)) %>%
sf::st_cast(to = "POINT") %>%
mutate(dist = sf::st_distance(geometry, cent, by_element = TRUE)) %>%
filter(dist <= quantile(dist, .95)) %>%
summarize() %>%
sf::st_convex_hull() %>%
class()
This gives the output I want. But when I try to group by two variables, the result loses the sf class.
toy %>%
group_by(ID, year) %>%
summarize(.groups = "keep") %>%
mutate(cent = sf::st_centroid(geometry)) %>%
sf::st_cast(to = "POINT") %>%
mutate(dist = sf::st_distance(geometry, cent, by_element = TRUE)) %>%
filter(dist <= quantile(dist, .95)) %>%
summarize() %>%
sf::st_convex_hull() %>%
class
Is there something in my code keeping me from being able to group on two variables?
It's because the second summarize is regrouping on ID. You need a .groups = "keeps" there to pass the same grouping through and then convert back to an sf object. Alternatively you could create a grouping variable mutate(grp = paste0(ID, year)) and do group_by(grp) .
toy %>%
group_by(ID, year) %>%
summarize(.groups = "keep") %>%
mutate(cent = sf::st_centroid(geometry)) %>%
sf::st_cast(to = "POINT") %>%
mutate(dist = sf::st_distance(geometry, cent, by_element = TRUE)) %>%
filter(dist <= quantile(dist, .95)) %>%
summarize(.groups = "keep") %>%
sf::st_convex_hull() %>%
st_sf()

How to data wrangle and barplot the proportion without undesired stripes

Please find the input data and expected output as screenshot below:
However, the current plot with the below code:
I feel, I made it too complicated. But I shared input data and expected data along with struggled code along the way. Could you please help us
Mainly there are 2 issues.
1. If mutate is used, undesired stripes appear on the plot
Summarize used, then it is not adding to 100%
2. How can we extract the top contributors
Both have been tried by us but stuck somewhere
# Input data
df <- tibble(
country = c(rep(c("India","USA","Germany","Africa"), each = 8)),
type = c("sms","Other","whatsapp","web","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","web","web","Other","online","whatsapp",
"sms","Other","whatsapp","shiny","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","shiny","online","Other","online","Other"
),
cust = rep(c("google","Apple","wallmart","pg"),8),
quantity = c(10,20,30,40,50,60,70,80,
90,100,15,25,35,45,55,65,
75,85,95,105,10,15,20,25,
30,35,40,45,50,55,60,65)
)
# Without Customer
df %>%
group_by(country,type) %>%
summarise(kpi_wo_cust = sum(quantity)) %>%
ungroup() -> df_wo_cust
# With Customer
df %>%
group_by(country,type,cust) %>%
summarise(kpi_cust = sum(quantity)) %>%
ungroup() -> df_cust
df_combo <- left_join(df_cust, df_wo_cust, by = c("country","type"))
df_combo %>% glimpse()
# Aggregated data for certain KPIs for final plot
df_aggr <- df_combo %>%
group_by(country,type) %>%
mutate(kpi_cust_total = sum(kpi_cust),
per_kpi_cust = 100 * (kpi_cust/kpi_cust_total)) %>%
group_by(country) %>%
# In order to except from repeated counting, selecting unique()
mutate(kpi_cust_uniq_total = sum(kpi_cust) %>% unique(),
per_unq_kpi_cust = 100 * (kpi_cust/kpi_cust_uniq_total) %>% round(4))
#
plt = df_aggr %>% ungroup() %>%#glimpse()
# In order to obtain theTop 2 customers (Major contributor) within country and type
# However, if this code is used, there is an error
# group_by(country, type) %>%
# nest() %>%
# mutate(top_cust = purrr::map_chr(data, function(x){
# x %>% arrange(desc(per_kpi_cust)) %>%
# top_n(2,per_kpi_cust) %>%
# summarise(Cust = paste(cust,round(per_kpi_cust,2), collapse = "<br>")) %>%
# pull(cust)
# })#,data = NULL
# ) %>%
# unnest(cols = data) %>%
group_by(country, type) %>%
# If mutate is used, undesired stripes appear on the plot
# Summarize used, then it is not adding to 100%
mutate(avg_kpi_cust = per_unq_kpi_cust %>% mean()) %>%
#summarise(avg_kpi_cust = per_unq_kpi_cust %>% mean()) %>%
ggplot(aes(x = country,
y = avg_kpi_cust,
fill = type,
text = paste('<br>proportion: ', round(avg_kpi_cust,2), "%",
"<br>country:",country
))) +
geom_bar(stat = "identity"#, position=position_dodge()
) +
coord_flip() +
theme_bw()
ggplotly(plt)
The key was to use distinct() after mutate() instead of summarise()
Also, mean() was the wrong function used earlier instead of sum() which had resulted in incomplete barplot.
library(tidyverse)
library(plotly)
# Input data
df <- tibble(
country = c(rep(c("India","USA","Germany","Africa"), each = 8)),
type = c("sms","Other","whatsapp","web","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","web","web","Other","online","whatsapp",
"sms","Other","whatsapp","shiny","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","shiny","online","Other","online","Other"
),
cust = rep(c("google","Apple","wallmart","pg"),8),
quantity = c(10,20,30,40,50,60,70,80,
90,100,15,25,35,45,55,65,
75,85,95,105,10,15,20,25,
30,35,40,45,50,55,60,65)
)
# Without Customer
df %>%
group_by(country,type) %>%
summarise(kpi_wo_cust = sum(quantity)) %>%
ungroup() -> df_wo_cust
# With Customer
df %>%
group_by(country,type,cust) %>%
summarise(kpi_cust = sum(quantity)) %>%
ungroup() -> df_cust
df_combo <- left_join(df_cust, df_wo_cust, by = c("country","type"))
df_combo %>% glimpse()
# Aggregated data for certain KPIs for final plot
df_aggr <- df_combo %>%
group_by(country,type) %>%
mutate(kpi_cust_total = sum(kpi_cust),
per_kpi_cust = 100 * (kpi_cust/kpi_cust_total)) %>%
group_by(country) %>%
# In order to except from repeated counting, selecting unique()
mutate(kpi_cust_uniq_total = sum(kpi_cust) %>% unique(),
per_unq_kpi_cust = 100 * (kpi_cust/kpi_cust_uniq_total) %>% round(4))
plt = df_aggr %>% ungroup() %>%
# In order to diplay Top 2 customers (Major contributor) within country and type
group_by(country, type) %>%
nest() %>%
mutate(top_cust = purrr::map_chr(data, function(x){
x %>% arrange(desc(per_kpi_cust)) %>%
top_n(2,per_kpi_cust) %>%
summarise(Cust = paste(cust,round(per_kpi_cust,2), collapse = "<br>")) %>%
pull(Cust)
})) %>%
unnest(cols = data) %>%
group_by(country, type) %>%
# If mutate is used, undesired stripes appear on the plot
# Summarize used, then it is not adding to 100%.
# So distinct was used
mutate(avg_kpi_cust = per_unq_kpi_cust %>% sum()) %>%
ungroup() %>%
distinct(country, type, .keep_all = T) %>%
ggplot(aes(x = country,
y = avg_kpi_cust,
fill = type,
text = top_cust
)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_bw()
ggplotly(plt, tooltip = "text")

Resources